Repository: supranational/blst Branch: master Commit: f62244ef50ad Files: 229 Total size: 3.7 MB Directory structure: gitextract_z39wz_mc/ ├── .gitattributes ├── .github/ │ └── workflows/ │ ├── ci.yml │ ├── codeql-analysis.yml │ └── golang-lint.yml ├── .gitignore ├── .golangci.yml ├── .lgtm.yml ├── .travis.yml ├── LICENSE ├── README.md ├── SECURITY.md ├── bindings/ │ ├── blst.h │ ├── blst.hpp │ ├── blst.swg │ ├── blst_aux.h │ ├── c#/ │ │ ├── poc.cs │ │ ├── poc.csproj │ │ ├── run.me │ │ └── supranational.blst.cs │ ├── go/ │ │ ├── README.md │ │ ├── blst.go │ │ ├── blst.tgo │ │ ├── blst_htoc_test.go │ │ ├── blst_miller_loop_test.go │ │ ├── blst_minpk.tgo │ │ ├── blst_minpk_test.go │ │ ├── blst_minsig_test.go │ │ ├── blst_misc.tgo │ │ ├── blst_px.tgo │ │ ├── blst_wasm.go │ │ ├── cgo_assembly.S │ │ ├── cgo_server.c │ │ ├── generate.py │ │ └── rb_tree.go │ ├── rust/ │ │ ├── Cargo.toml │ │ ├── README.md │ │ ├── benches/ │ │ │ └── blst_benches.rs │ │ ├── build.rs │ │ ├── publish.sh │ │ ├── rustfmt.toml │ │ └── src/ │ │ ├── bindings.rs │ │ ├── lib.rs │ │ ├── pippenger-no_std.rs │ │ ├── pippenger-test_mod.rs │ │ └── pippenger.rs │ ├── vectors/ │ │ └── hash_to_curve/ │ │ ├── BLS12381G1_XMD_SHA-256_SSWU_NU_.json │ │ ├── BLS12381G1_XMD_SHA-256_SSWU_RO_.json │ │ ├── BLS12381G2_XMD_SHA-256_SSWU_NU_.json │ │ ├── BLS12381G2_XMD_SHA-256_SSWU_RO_.json │ │ ├── README │ │ ├── expand_message_xmd_SHA256_256.json │ │ └── expand_message_xmd_SHA256_38.json │ └── zig/ │ ├── README.md │ ├── blst.zig │ ├── c.zig │ ├── generate.py │ └── tests.zig ├── build/ │ ├── assembly.S │ ├── bindings_trim.pl │ ├── cheri/ │ │ ├── add_mod_256-armv8.S │ │ ├── add_mod_384-armv8.S │ │ ├── ct_inverse_mod_256-armv8.S │ │ ├── ct_inverse_mod_384-armv8.S │ │ ├── ct_is_square_mod_384-armv8.S │ │ ├── div3w-armv8.S │ │ ├── mul_mont_256-armv8.S │ │ ├── mul_mont_384-armv8.S │ │ └── sha256-armv8.S │ ├── coff/ │ │ ├── add_mod_256-armv8.S │ │ ├── add_mod_256-x86_64.s │ │ ├── add_mod_384-armv8.S │ │ ├── add_mod_384-x86_64.s │ │ ├── add_mod_384x384-x86_64.s │ │ ├── ct_inverse_mod_256-armv8.S │ │ ├── ct_inverse_mod_256-x86_64.s │ │ ├── ct_inverse_mod_384-armv8.S │ │ ├── ct_is_square_mod_384-armv8.S │ │ ├── ct_is_square_mod_384-x86_64.s │ │ ├── ctq_inverse_mod_384-x86_64.s │ │ ├── ctx_inverse_mod_384-x86_64.s │ │ ├── div3w-armv8.S │ │ ├── div3w-x86_64.s │ │ ├── mul_mont_256-armv8.S │ │ ├── mul_mont_384-armv8.S │ │ ├── mulq_mont_256-x86_64.s │ │ ├── mulq_mont_384-x86_64.s │ │ ├── mulx_mont_256-x86_64.s │ │ ├── mulx_mont_384-x86_64.s │ │ ├── sha256-armv8.S │ │ ├── sha256-portable-x86_64.s │ │ └── sha256-x86_64.s │ ├── elf/ │ │ ├── add_mod_256-armv8.S │ │ ├── add_mod_256-x86_64.s │ │ ├── add_mod_384-armv8.S │ │ ├── add_mod_384-x86_64.s │ │ ├── add_mod_384x384-x86_64.s │ │ ├── ct_inverse_mod_256-armv8.S │ │ ├── ct_inverse_mod_256-x86_64.s │ │ ├── ct_inverse_mod_384-armv8.S │ │ ├── ct_is_square_mod_384-armv8.S │ │ ├── ct_is_square_mod_384-x86_64.s │ │ ├── ctq_inverse_mod_384-x86_64.s │ │ ├── ctx_inverse_mod_384-x86_64.s │ │ ├── div3w-armv8.S │ │ ├── div3w-x86_64.s │ │ ├── mul_mont_256-armv8.S │ │ ├── mul_mont_384-armv8.S │ │ ├── mulq_mont_256-x86_64.s │ │ ├── mulq_mont_384-x86_64.s │ │ ├── mulx_mont_256-x86_64.s │ │ ├── mulx_mont_384-x86_64.s │ │ ├── sha256-armv8.S │ │ ├── sha256-portable-x86_64.s │ │ └── sha256-x86_64.s │ ├── mach-o/ │ │ ├── add_mod_256-armv8.S │ │ ├── add_mod_256-x86_64.s │ │ ├── add_mod_384-armv8.S │ │ ├── add_mod_384-x86_64.s │ │ ├── add_mod_384x384-x86_64.s │ │ ├── ct_inverse_mod_256-armv8.S │ │ ├── ct_inverse_mod_256-x86_64.s │ │ ├── 
ct_inverse_mod_384-armv8.S │ │ ├── ct_is_square_mod_384-armv8.S │ │ ├── ct_is_square_mod_384-x86_64.s │ │ ├── ctq_inverse_mod_384-x86_64.s │ │ ├── ctx_inverse_mod_384-x86_64.s │ │ ├── div3w-armv8.S │ │ ├── div3w-x86_64.s │ │ ├── mul_mont_256-armv8.S │ │ ├── mul_mont_384-armv8.S │ │ ├── mulq_mont_256-x86_64.s │ │ ├── mulq_mont_384-x86_64.s │ │ ├── mulx_mont_256-x86_64.s │ │ ├── mulx_mont_384-x86_64.s │ │ ├── sha256-armv8.S │ │ ├── sha256-portable-x86_64.s │ │ └── sha256-x86_64.s │ ├── refresh.sh │ ├── srcroot.go │ └── win64/ │ ├── add_mod_256-armv8.asm │ ├── add_mod_256-x86_64.asm │ ├── add_mod_384-armv8.asm │ ├── add_mod_384-x86_64.asm │ ├── add_mod_384x384-x86_64.asm │ ├── blst.def │ ├── ct_inverse_mod_256-armv8.asm │ ├── ct_inverse_mod_256-x86_64.asm │ ├── ct_inverse_mod_384-armv8.asm │ ├── ct_is_square_mod_384-armv8.asm │ ├── ct_is_square_mod_384-x86_64.asm │ ├── ctq_inverse_mod_384-x86_64.asm │ ├── ctx_inverse_mod_384-x86_64.asm │ ├── div3w-armv8.asm │ ├── div3w-x86_64.asm │ ├── dll.c │ ├── mul_mont_256-armv8.asm │ ├── mul_mont_384-armv8.asm │ ├── mulq_mont_256-x86_64.asm │ ├── mulq_mont_384-x86_64.asm │ ├── mulx_mont_256-x86_64.asm │ ├── mulx_mont_384-x86_64.asm │ ├── sha256-armv8.asm │ └── sha256-x86_64.asm ├── build.bat ├── build.sh ├── build.zig ├── build.zig.zon └── src/ ├── aggregate.c ├── asm/ │ ├── add_mod_256-armv8.pl │ ├── add_mod_256-x86_64.pl │ ├── add_mod_384-armv8.pl │ ├── add_mod_384-x86_64.pl │ ├── add_mod_384x384-x86_64.pl │ ├── arm-xlate.pl │ ├── ct_inverse_mod_256-armv8.pl │ ├── ct_inverse_mod_256-x86_64.pl │ ├── ct_inverse_mod_384-armv8.pl │ ├── ct_is_square_mod_384-armv8.pl │ ├── ct_is_square_mod_384-x86_64.pl │ ├── ctq_inverse_mod_384-x86_64.pl │ ├── ctx_inverse_mod_384-x86_64.pl │ ├── div3w-armv8.pl │ ├── div3w-x86_64.pl │ ├── mul_mont_256-armv8.pl │ ├── mul_mont_384-armv8.pl │ ├── mulq_mont_256-x86_64.pl │ ├── mulq_mont_384-x86_64.pl │ ├── mulx_mont_256-x86_64.pl │ ├── mulx_mont_384-x86_64.pl │ ├── sha256-armv8.pl │ ├── sha256-portable-x86_64.pl │ ├── sha256-x86_64.pl │ └── x86_64-xlate.pl ├── blst_t.hpp ├── bulk_addition.c ├── bytes.h ├── client_min_pk.c ├── client_min_sig.c ├── consts.c ├── consts.h ├── cpuid.c ├── e1.c ├── e2.c ├── ec_mult.h ├── ec_ops.h ├── errors.h ├── exp.c ├── exports.c ├── fields.h ├── fp12_tower.c ├── hash_to_field.c ├── keygen.c ├── map_to_g1.c ├── map_to_g2.c ├── multi_scalar.c ├── no_asm.h ├── pairing.c ├── pentaroot-addchain.h ├── pentaroot.c ├── point.h ├── rb_tree.c ├── recip-addchain.h ├── recip.c ├── server.c ├── sha256.h ├── sqrt-addchain.h ├── sqrt.c ├── vect.c └── vect.h ================================================ FILE CONTENTS ================================================ ================================================ FILE: .gitattributes ================================================ *.pl linguist-language=assembly *.h linguist-language=c *.tgo linguist-language=go ================================================ FILE: .github/workflows/ci.yml ================================================ name: build on: push: branches: - '**' workflow_dispatch: branches: - '**' pull_request: branches: - master jobs: rust-n-go: runs-on: ${{ matrix.os }} strategy: matrix: os: [ ubuntu-latest, ubuntu-24.04-arm, windows-latest, windows-11-arm, macos-latest ] steps: - uses: actions/checkout@v6 - name: Get date id: get-date run: echo "date=$(date -u +%Y-%m)" >> $GITHUB_OUTPUT shell: bash - uses: actions/cache@v5 with: path: | ~/.cargo/registry **/Cargo.lock **/bindings/rust/target ~/.wasmtime key: ${{ runner.os }}-${{ runner.arch 
}}-cargo-${{ steps.get-date.outputs.date }} - name: Environment shell: bash run: | lscpu 2>/dev/null && echo --- || true sysctl hw 2>/dev/null && echo --- || true env | sort - name: Install Wasmtime if: ${{ runner.os == 'Linux' }} shell: bash run: if [ ! -d ~/.wasmtime/bin ]; then curl https://wasmtime.dev/install.sh -sSf | bash; fi - name: Rust shell: bash run: | rustc --version --verbose export CARGO_REGISTRIES_CRATES_IO_PROTOCOL=sparse cd bindings/rust sed "s/^crit/#crit/" Cargo.toml > Cargo.$$.toml && \ mv Cargo.$$.toml Cargo.toml if [ "$GITHUB_EVENT_NAME" != "pull_request" ]; then cargo update fi cargo test --release echo '--- test portable' echo cargo test --release --features=portable echo '--- test no-threads' echo cargo test --release --features=no-threads echo '--- test serde-secret' echo cargo test --release --features=serde-secret echo '--- test no_std' echo echo 'set -e' > ulimit-s echo 'export RUST_MIN_STACK=$(($1 * 1024)); shift' >> ulimit-s echo 'exec "$@"' >> ulimit-s triplet=`rustc -vV | awk '/host:/ {print $2}' | tr 'a-z-' 'A-Z_'` stack_size=`[ $RUNNER_OS = "Windows" ] && echo 65 || echo 56` env BLST_TEST_NO_STD= \ CARGO_TARGET_${triplet}_RUNNER="bash ulimit-s $stack_size" \ cargo test --release if [ -x ~/.wasmtime/bin/wasmtime ]; then echo '--- test wasm32-wasip1' echo rustup target add wasm32-wasip1 env CARGO_TARGET_WASM32_WASIP1_RUNNER=~/.wasmtime/bin/wasmtime \ cargo test --release --target=wasm32-wasip1 cargo clean -p blst --release --target=wasm32-wasip1 echo fi if [ $RUNNER_OS = "Linux" ]; then if [ `uname -p` = "x86_64" ]; then echo '--- test -mlvi-hardening' echo env CC=clang CFLAGS="-mlvi-hardening -D__SGX_LVI_HARDENING__" \ cargo test --release echo '--- build x86_64-fortanix-unknown-sgx' echo rustup target add x86_64-fortanix-unknown-sgx cargo test --no-run --release --target=x86_64-fortanix-unknown-sgx cargo clean -p blst --release --target=x86_64-fortanix-unknown-sgx echo fi echo '--- dry-run publish' echo ./publish.sh --dry-run elif [ $RUNNER_OS = "macOS" ]; then if [ $RUNNER_ARCH = "ARM64" ]; then echo '--- test x86_64-apple-darwin' echo rustup target add x86_64-apple-darwin cargo test --release --target=x86_64-apple-darwin cargo clean -p blst --release --target=x86_64-apple-darwin echo else echo '--- build aarch64-apple-darwin' echo rustup target add aarch64-apple-darwin cargo test --no-run --release --target=aarch64-apple-darwin cargo clean -p blst --release --target=aarch64-apple-darwin echo fi echo '--- build aarch64-apple-ios' echo rustup target add aarch64-apple-ios env IPHONEOS_DEPLOYMENT_TARGET=10.0 \ cargo test --no-run --release --target=aarch64-apple-ios cargo clean -p blst --release --target=aarch64-apple-ios echo elif [ $RUNNER_OS = "Windows" -a $RUNNER_ARCH = "X64" ]; then if which clang-cl > /dev/null 2>&1; then echo '-- test i686-pc-windows-msvc' echo rustup target add i686-pc-windows-msvc cargo test --release --target=i686-pc-windows-msvc cargo clean -p blst --release --target=i686-pc-windows-msvc echo fi echo '-- test x86_64-pc-windows-gnu' echo rustup target add x86_64-pc-windows-gnu cargo test --release --target=x86_64-pc-windows-gnu cargo clean -p blst --release --target=x86_64-pc-windows-gnu echo fi echo echo '--- cargo clippy' echo echo 'msrv = "1.56"' > .clippy.toml cargo clippy --release cargo clean -p blst cargo clean -p blst --release rm -rf target/.rustc_info.json rm -rf target/package rm -rf target/{debug,release}/incremental rm -rf target/*/{debug,release}/incremental rm -rf ~/.cargo/registry/src rm -rf 
~/.cargo/registry/index/*/.cache mkdir -p ~/.wasmtime - name: Go if: ${{ runner.os != 'Windows' || runner.arch != 'ARM64' }} shell: bash run: | go version 2>/dev/null || exit 0 if ! (grep -q -e '^flags.*\badx\b' /proc/cpuinfo) 2>/dev/null; then export CGO_CFLAGS="-O -D__BLST_PORTABLE__" fi cd bindings/go go test -test.v misc-ubuntu-latest: runs-on: ubuntu-latest steps: - uses: actions/checkout@v6 - uses: actions/cache@v5 with: path: ~/swig key: ${{ runner.os }}-swig-github - uses: actions/setup-java@v5 with: distribution: temurin java-version: 11 - uses: actions/setup-node@v6 with: node-version: '20.x' - name: Environment run: | lscpu echo --- env | sort - name: Python run: if [ -x bindings/python/run.me ]; then bindings/python/run.me; fi - name: Java run: if [ -x bindings/java/run.me ]; then bindings/java/run.me; fi - name: Node.js run: | node_js=bindings/node.js if [ -x $node_js/run.me ]; then if [ ! -x ~/swig/bin/swig ]; then ( git clone --branch v4.3.0 https://github.com/swig/swig; cd swig; ./autogen.sh; ./configure --prefix=$HOME/swig; make; make install; (cd ~/swig/share/swig && ln -s `ls` current) ) fi env PATH=~/swig/bin:$PATH SWIG_LIB=~/swig/share/swig/current \ $node_js/run.me fi - name: node-gyp run: | node_js=bindings/node.js if [ -f $node_js/binding.gyp -a -f $node_js/blst_wrap.cpp ]; then npm install --global node-gyp || true if which node-gyp > /dev/null 2>&1; then ( export PATH=~/swig/bin:$PATH SWIG_LIB=~/swig/share/swig/current; cd $node_js; node-gyp configure; node-gyp build; env NODE_PATH=build/Release: node runnable.js; ) fi fi - name: TypeScript run: | node_js=bindings/node.js if [ -f $node_js/blst.hpp.ts -a -f $node_js/blst.node ]; then npm install --global typescript || true if which tsc > /dev/null 2>&1; then ( cd $node_js; npm install @types/node; tsc runnable.ts --ignoreConfig --types node --module commonjs; env NODE_PATH=.: node runnable.js; ) fi fi - name: Emscripten uses: docker://emscripten/emsdk with: args: > bindings/emscripten/run.me -O2 - name: C# run: | if [ -x bindings/c#/run.me ]; then bindings/c#/run.me; if which dotnet > /dev/null 2>&1; then cd bindings/c# [ -f libblst.dll.so ] || ../../build.sh -dll dotnet run -c Release fi fi - uses: actions/cache@v5 with: path: | ~/.cache/zig ~/zig-x86_64-linux-* ~/.wasmtime key: ${{ runner.os }}-zig-github - name: Zig run: | ver=0.15.2 base_dir=zig-x86_64-linux-$ver if [ ! -d ~/$base_dir ]; then curl -sSf https://ziglang.org/download/$ver/$base_dir.tar.xz | unxz -c | tar xf - --directory ~ fi if [ -x ~/$base_dir/zig ]; then PATH=~/$base_dir:$PATH zig build test --summary new echo '--- test wasm32-wasi' if [ ! -d ~/.wasmtime ]; then curl https://wasmtime.dev/install.sh -sSf | bash fi PATH=~/.wasmtime/bin:$PATH zig build test -Dtarget=wasm32-wasi -fwasmtime --summary new fi ================================================ FILE: .github/workflows/codeql-analysis.yml ================================================ name: "CodeQL" on: push: branches: - '**' paths: - 'src/*' - 'bindings/c#/*' - '.github/workflows/codeql-analysis.yml' pull_request: branches: - master paths: - 'src/*' - 'bindings/c#/*' #schedule: # - cron: '0 23+ * * 4' jobs: analyze: name: Analyze runs-on: ubuntu-latest permissions: security-events: write strategy: fail-fast: false matrix: language: [ 'cpp', 'csharp' ] steps: - name: Checkout repository uses: actions/checkout@v6 with: # We must fetch at least the immediate parents so that if this is # a pull request then we can checkout the head. 
fetch-depth: 2 # Initializes the CodeQL tools for scanning. - name: Initialize CodeQL uses: github/codeql-action/init@v4 with: languages: ${{ matrix.language }} queries: security-extended - if: matrix.language == 'cpp' name: Custom build run: ./build.sh -m32 -ffreestanding - if: matrix.language != 'cpp' name: Autobuild uses: github/codeql-action/autobuild@v4 - name: Perform CodeQL Analysis uses: github/codeql-action/analyze@v4 ================================================ FILE: .github/workflows/golang-lint.yml ================================================ name: golang-lint on: push: branches: - '**' paths: - 'bindings/go/*.go' - '.github/workflows/golang-lint.yml' - '.golangci.yml' pull_request: branches: - master paths: - 'bindings/go/*.go' jobs: golang-lint: runs-on: ubuntu-latest steps: - uses: actions/checkout@v6 - uses: actions/setup-go@v6 with: go-version: '>=1.21' cache: false - name: "go version" run: go version - uses: golangci/golangci-lint-action@v9 with: # Require: The version of golangci-lint to use. # When `install-mode` is `binary` (default) the value can be v1.2 or v1.2.3 or `latest` to use the latest version. # When `install-mode` is `goinstall` the value can be v1.2.3, `latest`, or the hash of a commit. version: v2.9 # Optional: working directory, useful for monorepos # working-directory: somedir # Optional: golangci-lint command line arguments. # # Note: By default, the `.golangci.yml` file should be at the root of the repository. # The location of the configuration file can be changed by using `--config=` # args: --timeout=30m --config=/my/path/.golangci.yml --issues-exit-code=0 # Optional: show only new issues if it's a pull request. The default value is `false`. # only-new-issues: true # Optional: if set to true, then all caching functionality will be completely disabled, # takes precedence over all other caching options. skip-cache: true # Optional: if set to true, then the action won't cache or restore ~/go/pkg. # skip-pkg-cache: true # Optional: if set to true, then the action won't cache or restore ~/.cache/go-build. # skip-build-cache: true # Optional: The mode to install golangci-lint. It can be 'binary' or 'goinstall'. # install-mode: "goinstall" ================================================ FILE: .gitignore ================================================ # Prerequisites *.d # Object files *.o *.ko *.obj *.elf # Linker output *.ilk *.map *.exp # Precompiled Headers *.gch *.pch # Libraries *.lib *.a *.la *.lo # Shared objects (inc. Windows DLLs) *.dll *.so *.so.* *.dylib # Executables *.exe *.out *.app *.i*86 *.x86_64 *.hex # Debug files *.dSYM/ *.su *.idb *.pdb # Kernel Module Compile Results *.mod* *.cmd .tmp_versions/ modules.order Module.symvers Mkfile.old dkms.conf # Open swap files *.swp # Emacs backup files *~ # Rust build Cargo.lock bindings/rust/target bindings/rust/blst # These are customarily filled with swig artefacts bindings/python bindings/java bindings/node.js bindings/emscripten bin/ obj/ zig-out .zig-cache ================================================ FILE: .golangci.yml ================================================ version: "2" linters: default: all disable: # just whining - copyloopvar # go>=1.22 - cyclop - dupword - forbidigo - funlen - gochecknoglobals - gochecknoinits - gocognit - gocritic - gocyclo - godot - intrange # go>=1.22 - lll - mnd - nestif - nlreturn - varnamelen - whitespace - wsl - wsl_v5 # auto-generation artefact - dupl # maybe some day... - godoclint - godox - maintidx # maybe some day in tests... 
- forcetypeassert - nonamedreturns - perfsprint - testpackage # 83 active linters remaining including gosec, gosimple, govet, etc. settings: revive: enable-all-rules: true rules: - name: add-constant disabled: true - name: argument-limit disabled: true - name: cognitive-complexity # similar to 'gocognit' above disabled: true - name: cyclomatic # similar to 'cyclop' & 'gocyclo' above disabled: true - name: empty-block disabled: true - name: empty-lines disabled: true - name: flag-parameter disabled: true - name: function-length # similar to 'funlen' above disabled: true - name: function-result-limit disabled: true - name: increment-decrement disabled: true - name: line-length-limit # similar to 'lll' above disabled: true - name: max-public-structs disabled: true - name: package-directory-mismatch disabled: true - name: receiver-naming disabled: true - name: var-naming disabled: true - name: unchecked-type-assertion # similar to 'forcetypeassert' above disabled: true - name: unexported-naming disabled: true - name: unhandled-error arguments: - fmt.Println - fmt.Printf - name: use-any # applicable to go>=1.18 only disabled: true exclusions: generated: lax presets: - comments - common-false-positives - legacy - std-error-handling paths: - third_party$ - builtin$ - examples$ formatters: exclusions: generated: lax paths: - third_party$ - builtin$ - examples$ ================================================ FILE: .lgtm.yml ================================================ queries: - include: "*" - exclude: cpp/unused-static-function - exclude: cpp/include-non-header - exclude: cs/call-to-unmanaged-code - exclude: cs/unmanaged-code extraction: cpp: index: build_command: - ./build.sh -m32 go: index: build_command: - (cd bindings/go; go test -c) csharp: index: nuget_restore: false ================================================ FILE: .travis.yml ================================================ branches: only: - /.*/ language: rust git: quiet: true os: - linux arch: - arm64 - s390x before_script: - lscpu 2>/dev/null && echo --- || true - env | sort script: - if [ "$TRAVIS_LANGUAGE" = "rust" ]; then if [ "$TRAVIS_OS_NAME" = "windows" ]; then rustup set default-host x86_64-pc-windows-msvc; export ML=-nologo; fi; ( cd bindings/rust; if [ -f target/Cargo.lock ]; then mv -f target/Cargo.lock .; fi; NOW=`date +%s`; REF=.cargo/registry/index/*/.last-updated; THEN=`(stat -c %Y "$TRAVIS_HOME"/$REF || stat -f %m "$TRAVIS_HOME"/$REF) 2>/dev/null`; if [ $(($NOW - ${THEN:-0})) -gt 604800 ]; then env CARGO_REGISTRIES_CRATES_IO_PROTOCOL=sparse cargo update; fi; cargo test --release ) fi - if which go > /dev/null 2>&1; then go version; if ! (grep -q -e '^flags.*\badx\b' /proc/cpuinfo) 2>/dev/null; then export CGO_CFLAGS="-O -D__BLST_PORTABLE__"; fi; (cd bindings/go; go test -test.v) fi matrix: include: - os: linux arch: arm64 language: go notifications: email: false before_cache: - if [ "$TRAVIS_LANGUAGE" = "rust" ]; then ( cd bindings/rust; cargo clean -p blst; cargo clean -p blst --release; rm -rf target/.rustc_info.json; rm -rf target/{debug,release}/incremental; mv -f Cargo.lock target ) fi - (cd "$TRAVIS_HOME"; rm -rf .cargo/registry/src) - (cd "$TRAVIS_HOME"; rm -rf .cargo/registry/index/*/.cache) cache: cargo: true directories: - bindings/rust/target ================================================ FILE: LICENSE ================================================ Apache License Version 2.0, January 2004 http://www.apache.org/licenses/ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 1. Definitions. 
"License" shall mean the terms and conditions for use, reproduction, and distribution as defined by Sections 1 through 9 of this document. "Licensor" shall mean the copyright owner or entity authorized by the copyright owner that is granting the License. "Legal Entity" shall mean the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity. For the purposes of this definition, "control" means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity. "You" (or "Your") shall mean an individual or Legal Entity exercising permissions granted by this License. "Source" form shall mean the preferred form for making modifications, including but not limited to software source code, documentation source, and configuration files. "Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types. "Work" shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work (an example is provided in the Appendix below). "Derivative Works" shall mean any work, whether in Source or Object form, that is based on (or derived from) the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, as a whole, an original work of authorship. For the purposes of this License, Derivative Works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work and Derivative Works thereof. "Contribution" shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution." "Contributor" shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work. 2. Grant of Copyright License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form. 3. Grant of Patent License. 
Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Work to which such Contribution(s) was submitted. If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed. 4. Redistribution. You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions: (a) You must give any other recipients of the Work or Derivative Works a copy of this License; and (b) You must cause any modified files to carry prominent notices stating that You changed the files; and (c) You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and (d) If the Work includes a "NOTICE" text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the following places: within a NOTICE text file distributed as part of the Derivative Works; within the Source form or documentation, if provided along with the Derivative Works; or, within a display generated by the Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file are for informational purposes only and do not modify the License. You may add Your own attribution notices within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed as modifying the License. You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise complies with the conditions stated in this License. 5. Submission of Contributions. Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions. 6. Trademarks. 
This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file. 7. Disclaimer of Warranty. Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License. 8. Limitation of Liability. In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages. 9. Accepting Warranty or Additional Liability. While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability. END OF TERMS AND CONDITIONS APPENDIX: How to apply the Apache License to your work. To apply the Apache License to your work, attach the following boilerplate notice, with the fields enclosed by brackets "[]" replaced with your own identifying information. (Don't include the brackets!) The text should be enclosed in the appropriate comment syntax for the file format. We also recommend that a file or class name and description of purpose be included on the same "printed page" as the copyright notice for easier identification within third-party archives. Copyright [yyyy] [name of copyright owner] Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
================================================
FILE: README.md
================================================
[![Actions status](https://github.com/supranational/blst/workflows/build/badge.svg)](https://github.com/supranational/blst/actions) [![CodeQL status](https://github.com/supranational/blst/workflows/CodeQL/badge.svg)](https://github.com/supranational/blst/actions/workflows/codeql-analysis.yml)
# blst

blst (pronounced 'blast') is a BLS12-381 signature library focused on performance and security. It is written in C and assembly.

## Table of Contents

* [Status](#status)
* [General notes on implementation](#general-notes-on-implementation)
* [Platform and Language Compatibility](#platform-and-language-compatibility)
* [API](#api)
* [Introductory Tutorial](#introductory-tutorial)
  + [Public Keys and Signatures](#public-keys-and-signatures)
  + [Signature Verification](#signature-verification)
  + [Signature Aggregation](#signature-aggregation)
  + [Serialization Format](#serialization-format)
* [Build](#build)
  + [C static library](#c-static-library)
* [Language-specific notes](#language-specific-notes)
  + [Go](#go)
  + [Rust](#rust)
* [Repository Structure](#repository-structure)
* [Performance](#performance)
* [License](#license)

## Status

**This library is under active development.**

An initial audit of this library was conducted by NCC Group in January 2021 and can be found [here](https://research.nccgroup.com/wp-content/uploads/2021/01/NCC_Group_EthereumFoundation_ETHF002_Report_2021-01-20_v1.0.pdf). Formal verification of this library by Galois is ongoing and can be found [here](https://github.com/GaloisInc/BLST-Verification).

This library is compliant with the following IETF specifications:

- [IETF BLS Signature V6](https://tools.ietf.org/html/draft-irtf-cfrg-bls-signature)
- [IETF RFC 9380 Hashing to Elliptic Curves](https://www.rfc-editor.org/rfc/rfc9380.html)

The serialization formatting is implemented according to [the ZCash definition](#serialization-format).

## General notes on implementation

The goal of the blst library is to provide a foundational component for applications and other libraries that require high-performance and formally verified BLS12-381 operations. With that in mind, some decisions are made to maximize the public good beyond BLS12-381. For example, the field operations are optimized for general 384-bit usage, as opposed to being tuned specifically for the 381-bit BLS12-381 curve parameters. With the formal verification of these foundational components, we believe they can provide a reliable building block for other curves that would like high performance and an extra element of security.

The library deliberately abstains from dealing with memory management and multi-threading, with the rationale that these ultimately belong in language-specific bindings. Another responsibility left to the application is random number generation. All this is in the name of run-time neutrality, which makes integration into more stringent environments like Intel SGX or ARM TrustZone trivial.

## Platform and Language Compatibility

This library primarily supports the x86_64 and ARM64 hardware platforms, and the Linux, Mac, and Windows operating systems. But it does have a portable replacement for the assembly modules, which can be compiled for a plethora of other platforms. Problem reports for these will be considered and are likely to be addressed.

This repository includes explicit bindings for:

- [Go](bindings/go)
- [Rust](bindings/rust)

Unless deemed appropriate to implement explicitly, bindings for other languages are provided using [SWIG](http://swig.org). Proof-of-concept scripts are available for:

- [Python](bindings/python)
- [Java](bindings/java)
- [Node.js](bindings/node.js)
- [Emscripten](bindings/emscripten)
- [C#](bindings/c%23)
- [Zig](bindings/zig)

## API

The blst API is defined in the C header [bindings/blst.h](bindings/blst.h).
The API can be categorized as follows, with some example operations:

- Field Operations (add, sub, mul, neg, inv, to/from Montgomery)
- Curve Operations (add, double, mul, to/from affine, group check)
- Intermediate (hash to curve, pairing, serdes)
- BLS12-381 signature (sign, verify, aggregate)

Note: there is also an auxiliary header file, [bindings/blst_aux.h](bindings/blst_aux.h), that is used as a staging area for experimental interfaces that may or may not get promoted to blst.h.

## Introductory Tutorial

Programming is understanding, and understanding implies mastering the lingo. So we have a pair of additive groups being mapped to a multiplicative one... What does it mean? Well, this tutorial is not about explaining that, but rather about making the connection between what you're supposed to know about [pairing-based cryptography](https://en.wikipedia.org/wiki/Pairing-based_cryptography) and the interface provided by the library.

### Public Keys and Signatures

We have two elliptic curves, E1 and E2, points on which are contained in `blst_p1` and `blst_p2`, or `blst_p1_affine` and `blst_p2_affine` structures. Elements in the multiplicative group are held in a `blst_fp12` structure. One of the curves, or more specifically, a subset of its points that form a cyclic group, is chosen for public keys, and the other for signatures. The choice is denoted by the subroutines' suffixes, `_pk_in_g1` or `_pk_in_g2`. The most common choice appears to be the former, that is, `blst_p1` for public keys and `blst_p2` for signatures.

But it all starts with a secret key... The secret key is held in a 256-bit `blst_scalar` structure which can be instantiated with either [`blst_keygen`](https://tools.ietf.org/html/draft-irtf-cfrg-bls-signature#section-2.3), or deserialized with `blst_scalar_from_bendian` or `blst_scalar_from_lendian` from a previously serialized byte sequence. It shouldn't come as a surprise that there are two uses for a secret key:

- generating the associated public key, either with `blst_sk_to_pk_in_g1` or `blst_sk_to_pk_in_g2`;
- performing a sign operation, either with `blst_sign_pk_in_g1` or `blst_sign_pk_in_g2`.

As for signing, unlike what your intuition might suggest, `blst_sign_*` doesn't sign a message, but rather a point on the corresponding elliptic curve. You can obtain this point from a message by calling `blst_hash_to_g2` or `blst_encode_to_g2` (see the [IETF hash-to-curve](https://tools.ietf.org/html/draft-irtf-cfrg-hash-to-curve#section-3) draft for the distinction). Another counter-intuitive aspect is the apparent g1 vs. g2 naming mismatch, in the sense that `blst_sign_pk_in_g1` accepts output from `blst_hash_to_g2`, and `blst_sign_pk_in_g2` accepts output from `blst_hash_to_g1`. This is because, as you should recall, public keys and signatures come from complementary groups.

Now that you have a public key and signature, as points on the corresponding elliptic curves, you can serialize them with `blst_p1_serialize`/`blst_p1_compress` and `blst_p2_serialize`/`blst_p2_compress`, and send the resulting byte sequences over the network for deserialization/uncompression and verification.

### Signature Verification

Even though there are "single-shot" `blst_core_verify_pk_in_g1` and `blst_core_verify_pk_in_g2`, you should really familiarize yourself with the more generalized pairing interface. `blst_pairing` is an opaque structure, and the only thing you know about it is `blst_pairing_sizeof`, which is how much memory you're supposed to allocate for it.
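Since the structure is opaque and the library never allocates, bindings typically wrap allocation in a small helper along these lines (a sketch, not part of the C API; `new_pairing_ctx` is a hypothetical name, and the caller owns the memory):

```
#include <stdlib.h>
#include "blst.h"

/* Allocate and initialize a pairing context. blst itself never
 * allocates, so the caller must free() the returned pointer. */
static blst_pairing *new_pairing_ctx(bool hash_or_encode,
                                     const byte *dst, size_t dst_len)
{
    blst_pairing *ctx = malloc(blst_pairing_sizeof());
    if (ctx != NULL)
        blst_pairing_init(ctx, hash_or_encode, dst, dst_len);
    return ctx;
}
```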
In order to verify an aggregated signature for a set of public keys and messages, or just one[!], you would:

```
blst_pairing_init(ctx, hash_or_encode, domain_separation_tag);
blst_pairing_aggregate_pk_in_g1(ctx, PK[0], aggregated_signature, message[0]);
blst_pairing_aggregate_pk_in_g1(ctx, PK[1], NULL, message[1]);
...
blst_pairing_commit(ctx);
result = blst_pairing_finalverify(ctx, NULL);
```

**The essential point to note** is that it's the caller's responsibility to ensure that public keys are group-checked with `blst_p1_affine_in_g1`. This is because it's a relatively expensive operation, and it's naturally assumed that the application will cache the check's outcome. Signatures are group-checked internally. Not shown in the pseudo-code snippet above: the `aggregate` and `commit` calls return a `BLST_ERROR` denoting success or failure of the operation. A call to `finalverify`, on the other hand, returns a boolean.

Another, potentially more useful usage pattern is:

```
blst_p2_affine_in_g2(signature);
blst_aggregated_in_g2(gtsig, signature);
blst_pairing_init(ctx, hash_or_encode, domain_separation_tag);
blst_pairing_aggregate_pk_in_g1(ctx, PK[0], NULL, message[0]);
blst_pairing_aggregate_pk_in_g1(ctx, PK[1], NULL, message[1]);
...
blst_pairing_commit(ctx);
result = blst_pairing_finalverify(ctx, gtsig);
```

What is useful about it is that the aggregated signature can be handled in a separate thread. And while we are at it, the aggregate calls can also be executed in different threads. This naturally implies that each thread will operate on its own `blst_pairing` context, and the contexts will have to be combined with `blst_pairing_merge` as the threads join.

### Signature Aggregation

Aggregation is a trivial operation of performing point additions, with `blst_p2_add_or_double_affine` or `blst_p1_add_or_double_affine`. Note that the accumulator is a non-affine point.

---

That's about what you need to know to get started with the nitty-gritty of actual function declarations.

### Serialization Format

From the ZCash BLS12-381 specification:

* Fq elements are encoded in big-endian form. They occupy 48 bytes in this form.
* Fq2 elements are encoded in big-endian form, meaning that the Fq2 element c0 + c1 * u is represented by the Fq element c1 followed by the Fq element c0. This means Fq2 elements occupy 96 bytes in this form.
* The group G1 uses Fq elements for coordinates. The group G2 uses Fq2 elements for coordinates.
* G1 and G2 elements can be encoded in uncompressed form (the x-coordinate followed by the y-coordinate) or in compressed form (just the x-coordinate). G1 elements occupy 96 bytes in uncompressed form, and 48 bytes in compressed form. G2 elements occupy 192 bytes in uncompressed form, and 96 bytes in compressed form.

The most-significant three bits of a G1 or G2 encoding should be masked away before the coordinate(s) are interpreted. These bits are used to unambiguously represent the underlying element:

* The most significant bit, when set, indicates that the point is in compressed form. Otherwise, the point is in uncompressed form.
* The second-most significant bit indicates that the point is at infinity. If this bit is set, the remaining bits of the group element's encoding should be set to zero.
* The third-most significant bit is set if (and only if) this point is in compressed form _and_ it is not the point at infinity _and_ its y-coordinate is the lexicographically largest of the two associated with the encoded x-coordinate.
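To make the flag layout concrete, here is a small sketch that inspects those three bits in the first byte of an encoding (`parse_flags` and `encoding_flags` are hypothetical helpers, not part of the blst API):

```
#include <stdbool.h>
#include <stdint.h>

/* Hypothetical helper: decode the ZCash-style flag bits carried in
 * the most-significant byte of a serialized G1/G2 element. */
typedef struct {
    bool compressed;   /* bit 7: point is in compressed form */
    bool infinity;     /* bit 6: point at infinity           */
    bool y_is_larger;  /* bit 5: lexicographically larger y  */
} encoding_flags;

static encoding_flags parse_flags(const uint8_t enc[])
{
    encoding_flags f;
    f.compressed  = (enc[0] & 0x80) != 0;
    f.infinity    = (enc[0] & 0x40) != 0;
    f.y_is_larger = (enc[0] & 0x20) != 0;
    return f;
}
```

Deserialization entry points such as `blst_p1_uncompress` validate these bits themselves and reject malformed inputs with `BLST_BAD_ENCODING`, so application code rarely needs to do this by hand.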
## Build

The build process is very simple and only requires a C compiler. It's integrated into the Go and Rust ecosystems, so users of those languages can proceed as they would with any other external module. Otherwise, a binary library has to be compiled.

### C static library

A static library called libblst.a can be built in a working directory of the user's choice:

Linux, Mac, and Windows (in MinGW or Cygwin environments)

```
/some/where/build.sh
```

Windows (Visual C)

```
\some\where\build.bat
```

If the final application crashes with an "illegal instruction" exception [after copying to another system], pass `-D__BLST_PORTABLE__` on the `build.sh` command line. If you don't use build.sh, complement the `CFLAGS` environment variable with the said command-line option. If you compile a Go application, you will need to modify the `CGO_CFLAGS` variable instead. And if you compile a Rust application, you can pass `--features portable` to `cargo build`. Alternatively, if you compile on an older Intel system but will execute the application on a newer one, consider instead passing `--features force-adx` for better performance.

## Language-specific notes

### [Go](bindings/go)

There are two primary modes of operation that can be chosen based on type definitions in the application.

For minimal-pubkey-size operations:

```
type PublicKey = blst.P1Affine
type Signature = blst.P2Affine
type AggregateSignature = blst.P2Aggregate
type AggregatePublicKey = blst.P1Aggregate
```

For minimal-signature-size operations:

```
type PublicKey = blst.P2Affine
type Signature = blst.P1Affine
type AggregateSignature = blst.P1Aggregate
type AggregatePublicKey = blst.P2Aggregate
```

For more details see the Go binding [readme](bindings/go/README.md).

### [Rust](bindings/rust)

[`blst`](https://crates.io/crates/blst) is the Rust binding crate.

To use the min-pk version:

```
use blst::min_pk::*;
```

To use the min-sig version:

```
use blst::min_sig::*;
```

For more details see the Rust binding [readme](bindings/rust/README.md).
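To tie the tutorial and the build together, below is a minimal C program that exercises the static library end to end: keygen, public-key derivation, hash-to-curve, signing, and single-shot verification. This is a sketch, not canonical usage: the fixed IKM is for illustration only (draw it from a CSPRNG in practice), and the DST shown follows the IETF BLS ciphersuite naming convention for the min-pubkey-size basic scheme.

```
#include <stdbool.h>
#include <stdio.h>
#include <string.h>
#include "blst.h"

int main(void)
{
    /* Illustration only: real applications must draw the IKM
     * (at least 32 bytes) from a cryptographically secure RNG. */
    byte ikm[32];
    memset(ikm, 0x55, sizeof(ikm));

    blst_scalar sk;
    blst_keygen(&sk, ikm, sizeof(ikm), NULL, 0);

    /* Minimal-pubkey-size: public key in G1, signature in G2. */
    blst_p1 pk;
    blst_sk_to_pk_in_g1(&pk, &sk);

    const byte dst[] = "BLS_SIG_BLS12381G2_XMD:SHA-256_SSWU_RO_NUL_";
    const byte msg[] = "assembly is not that scary";

    /* Hash the message to a G2 point, then sign that point. */
    blst_p2 hash, sig;
    blst_hash_to_g2(&hash, msg, sizeof(msg) - 1,
                    dst, sizeof(dst) - 1, NULL, 0);
    blst_sign_pk_in_g1(&sig, &hash, &sk);

    blst_p1_affine pk_aff;
    blst_p2_affine sig_aff;
    blst_p1_to_affine(&pk_aff, &pk);
    blst_p2_to_affine(&sig_aff, &sig);

    BLST_ERROR err = blst_core_verify_pk_in_g1(&pk_aff, &sig_aff,
                                               true /* hash, not encode */,
                                               msg, sizeof(msg) - 1,
                                               dst, sizeof(dst) - 1,
                                               NULL, 0);
    printf("verify: %s\n", err == BLST_SUCCESS ? "OK" : "FAIL");
    return err == BLST_SUCCESS ? 0 : 1;
}
```

Compile against the static library built above with something like `cc demo.c libblst.a -I bindings` (file and path names are illustrative).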
## Repository Structure

**Root** - Contains various configuration files, documentation, licensing, and a build script

* **Bindings** - Contains the files that define the blst interface
  * blst.h - provides the C API to the blst library
  * blst_aux.h - contains experimental functions not yet committed for long-term maintenance
  * blst.hpp - provides a foundational class-oriented C++ interface to the blst library
  * blst.swg - provides SWIG definitions for creating blst bindings for other languages, such as Java and Python
  * **C#** - folder containing C# bindings and an example of how to use them
  * **Emscripten** - folder containing an example of how to use Emscripten WebAssembly bindings from JavaScript
  * **Go** - folder containing Go bindings for blst, including tests and benchmarks
  * **Java** - folder containing an example of how to use SWIG Java bindings for blst
  * **Node.js** - folder containing an example of how to use SWIG JavaScript bindings for blst
  * **Python** - folder containing an example of how to use SWIG Python bindings for blst
  * **Rust** - folder containing Rust bindings for blst, including tests and benchmarks
  * **Vectors**
    * **Hash_to_curve** - folder containing tests for hash_to_curve from the IETF specification
* **Src** - folder containing C code for lower-level blst functions such as field operations, extension field operations, hash-to-field, and more
  * **Asm** - folder containing Perl scripts that are used to generate assembly code for different hardware platforms, including x86 with ADX instructions, x86 without ADX instructions, and ARMv8, and [ABI](https://en.wikipedia.org/wiki/Application_binary_interface)[1]
* **Build** - folder containing a set of pre-generated assembly files for a variety of operating systems, along with maintenance scripts
  * **Cheri** - assembly code for use on [CHERI](https://www.cl.cam.ac.uk/research/security/ctsrd/cheri/) platforms
  * **Coff** - assembly code for use on Windows systems with GNU or LLVM toolchains
  * **Elf** - assembly code for use on Unix systems
  * **Mach-o** - assembly code for use on Apple operating systems
  * **Win64** - assembly code for use on Windows systems with the Microsoft toolchain

[1]: See [refresh.sh](build/refresh.sh) for usage. This method allows for simple reuse of optimized assembly across various platforms with minimal effort.

## Performance

Currently both the [Go](bindings/go) and [Rust](bindings/rust) bindings provide benchmarks for a variety of signature-related operations.

## License

The blst library is licensed under the [Apache License Version 2.0](LICENSE) software license.

================================================
FILE: SECURITY.md
================================================
# Security Policy

## Reporting a Vulnerability

To report security issues please send an e-mail to hello@supranational.net.

For sensitive information or critical issues, please contact the above e-mail address with 'CRITICAL' in the subject line and we will respond with a mechanism to communicate securely.

Please try to provide a clear description of any issue reported, along with how to reproduce the issue if possible.

================================================
FILE: bindings/blst.h
================================================
/*
 * Copyright Supranational LLC
 * Licensed under the Apache License, Version 2.0, see LICENSE for details.
* SPDX-License-Identifier: Apache-2.0 */ #ifndef __BLST_H__ #define __BLST_H__ #ifdef __SIZE_TYPE__ typedef __SIZE_TYPE__ size_t; #else #include #endif #if defined(__UINT8_TYPE__) && defined(__UINT32_TYPE__) \ && defined(__UINT64_TYPE__) typedef __UINT8_TYPE__ uint8_t; typedef __UINT32_TYPE__ uint32_t; typedef __UINT64_TYPE__ uint64_t; #else #include #endif #ifdef __cplusplus extern "C" { #elif !defined(__STDC_VERSION__) || __STDC_VERSION__<202311 # if defined(__BLST_CGO__) typedef _Bool bool; /* it's assumed that cgo calls modern enough compiler */ # elif defined(__BLST_RUST_BINDGEN__) || defined(__BLST_ZIG__) # define bool _Bool # elif defined(__STDC_VERSION__) && __STDC_VERSION__>=199901 # include # elif !defined(bool) # define bool int # define __blst_h_bool__ # endif #endif #ifdef SWIG # define DEFNULL =NULL #elif defined __cplusplus # define DEFNULL =0 #else # define DEFNULL #endif typedef enum { BLST_SUCCESS = 0, BLST_BAD_ENCODING, BLST_POINT_NOT_ON_CURVE, BLST_POINT_NOT_IN_GROUP, BLST_AGGR_TYPE_MISMATCH, BLST_VERIFY_FAIL, BLST_PK_IS_INFINITY, BLST_BAD_SCALAR, } BLST_ERROR; typedef uint8_t byte; typedef uint64_t limb_t; typedef struct { byte b[256/8]; } blst_scalar; typedef struct { limb_t l[256/8/sizeof(limb_t)]; } blst_fr; typedef struct { limb_t l[384/8/sizeof(limb_t)]; } blst_fp; /* 0 is "real" part, 1 is "imaginary" */ typedef struct { blst_fp fp[2]; } blst_fp2; typedef struct { blst_fp2 fp2[3]; } blst_fp6; typedef struct { blst_fp6 fp6[2]; } blst_fp12; void blst_scalar_from_uint32(blst_scalar *out, const uint32_t a[8]); void blst_uint32_from_scalar(uint32_t out[8], const blst_scalar *a); void blst_scalar_from_uint64(blst_scalar *out, const uint64_t a[4]); void blst_uint64_from_scalar(uint64_t out[4], const blst_scalar *a); void blst_scalar_from_bendian(blst_scalar *out, const byte a[32]); void blst_bendian_from_scalar(byte out[32], const blst_scalar *a); void blst_scalar_from_lendian(blst_scalar *out, const byte a[32]); void blst_lendian_from_scalar(byte out[32], const blst_scalar *a); bool blst_scalar_fr_check(const blst_scalar *a); bool blst_sk_check(const blst_scalar *a); bool blst_sk_add_n_check(blst_scalar *out, const blst_scalar *a, const blst_scalar *b); bool blst_sk_sub_n_check(blst_scalar *out, const blst_scalar *a, const blst_scalar *b); bool blst_sk_mul_n_check(blst_scalar *out, const blst_scalar *a, const blst_scalar *b); void blst_sk_inverse(blst_scalar *out, const blst_scalar *a); bool blst_scalar_from_le_bytes(blst_scalar *out, const byte *in, size_t len); bool blst_scalar_from_be_bytes(blst_scalar *out, const byte *in, size_t len); #ifndef SWIG /* * BLS12-381-specific Fr operations. 
*/ void blst_fr_add(blst_fr *ret, const blst_fr *a, const blst_fr *b); void blst_fr_sub(blst_fr *ret, const blst_fr *a, const blst_fr *b); void blst_fr_mul_by_3(blst_fr *ret, const blst_fr *a); void blst_fr_lshift(blst_fr *ret, const blst_fr *a, size_t count); void blst_fr_rshift(blst_fr *ret, const blst_fr *a, size_t count); void blst_fr_mul(blst_fr *ret, const blst_fr *a, const blst_fr *b); void blst_fr_sqr(blst_fr *ret, const blst_fr *a); void blst_fr_cneg(blst_fr *ret, const blst_fr *a, bool flag); void blst_fr_eucl_inverse(blst_fr *ret, const blst_fr *a); void blst_fr_inverse(blst_fr *ret, const blst_fr *a); void blst_fr_from_uint64(blst_fr *ret, const uint64_t a[4]); void blst_uint64_from_fr(uint64_t ret[4], const blst_fr *a); void blst_fr_from_scalar(blst_fr *ret, const blst_scalar *a); void blst_scalar_from_fr(blst_scalar *ret, const blst_fr *a); /* * BLS12-381-specific Fp operations. */ void blst_fp_add(blst_fp *ret, const blst_fp *a, const blst_fp *b); void blst_fp_sub(blst_fp *ret, const blst_fp *a, const blst_fp *b); void blst_fp_mul_by_3(blst_fp *ret, const blst_fp *a); void blst_fp_mul_by_8(blst_fp *ret, const blst_fp *a); void blst_fp_lshift(blst_fp *ret, const blst_fp *a, size_t count); void blst_fp_mul(blst_fp *ret, const blst_fp *a, const blst_fp *b); void blst_fp_sqr(blst_fp *ret, const blst_fp *a); void blst_fp_cneg(blst_fp *ret, const blst_fp *a, bool flag); void blst_fp_eucl_inverse(blst_fp *ret, const blst_fp *a); void blst_fp_inverse(blst_fp *ret, const blst_fp *a); bool blst_fp_sqrt(blst_fp *ret, const blst_fp *a); void blst_fp_from_uint32(blst_fp *ret, const uint32_t a[12]); void blst_uint32_from_fp(uint32_t ret[12], const blst_fp *a); void blst_fp_from_uint64(blst_fp *ret, const uint64_t a[6]); void blst_uint64_from_fp(uint64_t ret[6], const blst_fp *a); void blst_fp_from_bendian(blst_fp *ret, const byte a[48]); void blst_bendian_from_fp(byte ret[48], const blst_fp *a); void blst_fp_from_lendian(blst_fp *ret, const byte a[48]); void blst_lendian_from_fp(byte ret[48], const blst_fp *a); /* * BLS12-381-specific Fp2 operations. */ void blst_fp2_add(blst_fp2 *ret, const blst_fp2 *a, const blst_fp2 *b); void blst_fp2_sub(blst_fp2 *ret, const blst_fp2 *a, const blst_fp2 *b); void blst_fp2_mul_by_3(blst_fp2 *ret, const blst_fp2 *a); void blst_fp2_mul_by_8(blst_fp2 *ret, const blst_fp2 *a); void blst_fp2_lshift(blst_fp2 *ret, const blst_fp2 *a, size_t count); void blst_fp2_mul(blst_fp2 *ret, const blst_fp2 *a, const blst_fp2 *b); void blst_fp2_sqr(blst_fp2 *ret, const blst_fp2 *a); void blst_fp2_cneg(blst_fp2 *ret, const blst_fp2 *a, bool flag); void blst_fp2_eucl_inverse(blst_fp2 *ret, const blst_fp2 *a); void blst_fp2_inverse(blst_fp2 *ret, const blst_fp2 *a); bool blst_fp2_sqrt(blst_fp2 *ret, const blst_fp2 *a); /* * BLS12-381-specific Fp12 operations. */ void blst_fp12_sqr(blst_fp12 *ret, const blst_fp12 *a); void blst_fp12_cyclotomic_sqr(blst_fp12 *ret, const blst_fp12 *a); void blst_fp12_mul(blst_fp12 *ret, const blst_fp12 *a, const blst_fp12 *b); void blst_fp12_mul_by_xy00z0(blst_fp12 *ret, const blst_fp12 *a, const blst_fp6 *xy00z0); void blst_fp12_conjugate(blst_fp12 *a); void blst_fp12_inverse(blst_fp12 *ret, const blst_fp12 *a); /* caveat lector! |n| has to be non-zero and not more than 3! 
*/ void blst_fp12_frobenius_map(blst_fp12 *ret, const blst_fp12 *a, size_t n); bool blst_fp12_is_equal(const blst_fp12 *a, const blst_fp12 *b); bool blst_fp12_is_one(const blst_fp12 *a); bool blst_fp12_in_group(const blst_fp12 *a); const blst_fp12 *blst_fp12_one(void); #endif // SWIG /* * BLS12-381-specific point operations. */ typedef struct { blst_fp x, y, z; } blst_p1; typedef struct { blst_fp x, y; } blst_p1_affine; void blst_p1_add(blst_p1 *out, const blst_p1 *a, const blst_p1 *b); void blst_p1_add_or_double(blst_p1 *out, const blst_p1 *a, const blst_p1 *b); void blst_p1_add_affine(blst_p1 *out, const blst_p1 *a, const blst_p1_affine *b); void blst_p1_add_or_double_affine(blst_p1 *out, const blst_p1 *a, const blst_p1_affine *b); void blst_p1_double(blst_p1 *out, const blst_p1 *a); void blst_p1_mult(blst_p1 *out, const blst_p1 *p, const byte *scalar, size_t nbits); void blst_p1_cneg(blst_p1 *p, bool cbit); void blst_p1_to_affine(blst_p1_affine *out, const blst_p1 *in); void blst_p1_from_affine(blst_p1 *out, const blst_p1_affine *in); bool blst_p1_on_curve(const blst_p1 *p); bool blst_p1_in_g1(const blst_p1 *p); bool blst_p1_is_equal(const blst_p1 *a, const blst_p1 *b); bool blst_p1_is_inf(const blst_p1 *a); const blst_p1 *blst_p1_generator(void); bool blst_p1_affine_on_curve(const blst_p1_affine *p); bool blst_p1_affine_in_g1(const blst_p1_affine *p); bool blst_p1_affine_is_equal(const blst_p1_affine *a, const blst_p1_affine *b); bool blst_p1_affine_is_inf(const blst_p1_affine *a); const blst_p1_affine *blst_p1_affine_generator(void); typedef struct { blst_fp2 x, y, z; } blst_p2; typedef struct { blst_fp2 x, y; } blst_p2_affine; void blst_p2_add(blst_p2 *out, const blst_p2 *a, const blst_p2 *b); void blst_p2_add_or_double(blst_p2 *out, const blst_p2 *a, const blst_p2 *b); void blst_p2_add_affine(blst_p2 *out, const blst_p2 *a, const blst_p2_affine *b); void blst_p2_add_or_double_affine(blst_p2 *out, const blst_p2 *a, const blst_p2_affine *b); void blst_p2_double(blst_p2 *out, const blst_p2 *a); void blst_p2_mult(blst_p2 *out, const blst_p2 *p, const byte *scalar, size_t nbits); void blst_p2_cneg(blst_p2 *p, bool cbit); void blst_p2_to_affine(blst_p2_affine *out, const blst_p2 *in); void blst_p2_from_affine(blst_p2 *out, const blst_p2_affine *in); bool blst_p2_on_curve(const blst_p2 *p); bool blst_p2_in_g2(const blst_p2 *p); bool blst_p2_is_equal(const blst_p2 *a, const blst_p2 *b); bool blst_p2_is_inf(const blst_p2 *a); const blst_p2 *blst_p2_generator(void); bool blst_p2_affine_on_curve(const blst_p2_affine *p); bool blst_p2_affine_in_g2(const blst_p2_affine *p); bool blst_p2_affine_is_equal(const blst_p2_affine *a, const blst_p2_affine *b); bool blst_p2_affine_is_inf(const blst_p2_affine *a); const blst_p2_affine *blst_p2_affine_generator(void); /* * Multi-scalar multiplications and other multi-point operations. 
*/ void blst_p1s_to_affine(blst_p1_affine dst[], const blst_p1 *const points[], size_t npoints); void blst_p1s_add(blst_p1 *ret, const blst_p1_affine *const points[], size_t npoints); size_t blst_p1s_mult_wbits_precompute_sizeof(size_t wbits, size_t npoints); void blst_p1s_mult_wbits_precompute(blst_p1_affine table[], size_t wbits, const blst_p1_affine *const points[], size_t npoints); size_t blst_p1s_mult_wbits_scratch_sizeof(size_t npoints); void blst_p1s_mult_wbits(blst_p1 *ret, const blst_p1_affine table[], size_t wbits, size_t npoints, const byte *const scalars[], size_t nbits, limb_t *scratch); size_t blst_p1s_mult_pippenger_scratch_sizeof(size_t npoints); void blst_p1s_mult_pippenger(blst_p1 *ret, const blst_p1_affine *const points[], size_t npoints, const byte *const scalars[], size_t nbits, limb_t *scratch); void blst_p1s_tile_pippenger(blst_p1 *ret, const blst_p1_affine *const points[], size_t npoints, const byte *const scalars[], size_t nbits, limb_t *scratch, size_t bit0, size_t window); void blst_p2s_to_affine(blst_p2_affine dst[], const blst_p2 *const points[], size_t npoints); void blst_p2s_add(blst_p2 *ret, const blst_p2_affine *const points[], size_t npoints); size_t blst_p2s_mult_wbits_precompute_sizeof(size_t wbits, size_t npoints); void blst_p2s_mult_wbits_precompute(blst_p2_affine table[], size_t wbits, const blst_p2_affine *const points[], size_t npoints); size_t blst_p2s_mult_wbits_scratch_sizeof(size_t npoints); void blst_p2s_mult_wbits(blst_p2 *ret, const blst_p2_affine table[], size_t wbits, size_t npoints, const byte *const scalars[], size_t nbits, limb_t *scratch); size_t blst_p2s_mult_pippenger_scratch_sizeof(size_t npoints); void blst_p2s_mult_pippenger(blst_p2 *ret, const blst_p2_affine *const points[], size_t npoints, const byte *const scalars[], size_t nbits, limb_t *scratch); void blst_p2s_tile_pippenger(blst_p2 *ret, const blst_p2_affine *const points[], size_t npoints, const byte *const scalars[], size_t nbits, limb_t *scratch, size_t bit0, size_t window); /* * Hash-to-curve operations. */ #ifndef SWIG void blst_map_to_g1(blst_p1 *out, const blst_fp *u, const blst_fp *v DEFNULL); void blst_map_to_g2(blst_p2 *out, const blst_fp2 *u, const blst_fp2 *v DEFNULL); #endif void blst_encode_to_g1(blst_p1 *out, const byte *msg, size_t msg_len, const byte *DST DEFNULL, size_t DST_len DEFNULL, const byte *aug DEFNULL, size_t aug_len DEFNULL); void blst_hash_to_g1(blst_p1 *out, const byte *msg, size_t msg_len, const byte *DST DEFNULL, size_t DST_len DEFNULL, const byte *aug DEFNULL, size_t aug_len DEFNULL); void blst_encode_to_g2(blst_p2 *out, const byte *msg, size_t msg_len, const byte *DST DEFNULL, size_t DST_len DEFNULL, const byte *aug DEFNULL, size_t aug_len DEFNULL); void blst_hash_to_g2(blst_p2 *out, const byte *msg, size_t msg_len, const byte *DST DEFNULL, size_t DST_len DEFNULL, const byte *aug DEFNULL, size_t aug_len DEFNULL); /* * Zcash-compatible serialization/deserialization. 
/*
 * Hash-to-curve operations.
 */
#ifndef SWIG
void blst_map_to_g1(blst_p1 *out, const blst_fp *u, const blst_fp *v DEFNULL);
void blst_map_to_g2(blst_p2 *out, const blst_fp2 *u, const blst_fp2 *v DEFNULL);
#endif
void blst_encode_to_g1(blst_p1 *out, const byte *msg, size_t msg_len, const byte *DST DEFNULL, size_t DST_len DEFNULL, const byte *aug DEFNULL, size_t aug_len DEFNULL);
void blst_hash_to_g1(blst_p1 *out, const byte *msg, size_t msg_len, const byte *DST DEFNULL, size_t DST_len DEFNULL, const byte *aug DEFNULL, size_t aug_len DEFNULL);
void blst_encode_to_g2(blst_p2 *out, const byte *msg, size_t msg_len, const byte *DST DEFNULL, size_t DST_len DEFNULL, const byte *aug DEFNULL, size_t aug_len DEFNULL);
void blst_hash_to_g2(blst_p2 *out, const byte *msg, size_t msg_len, const byte *DST DEFNULL, size_t DST_len DEFNULL, const byte *aug DEFNULL, size_t aug_len DEFNULL);

/*
 * Zcash-compatible serialization/deserialization.
 */
void blst_p1_serialize(byte out[96], const blst_p1 *in);
void blst_p1_compress(byte out[48], const blst_p1 *in);
void blst_p1_affine_serialize(byte out[96], const blst_p1_affine *in);
void blst_p1_affine_compress(byte out[48], const blst_p1_affine *in);
BLST_ERROR blst_p1_uncompress(blst_p1_affine *out, const byte in[48]);
BLST_ERROR blst_p1_deserialize(blst_p1_affine *out, const byte in[96]);

void blst_p2_serialize(byte out[192], const blst_p2 *in);
void blst_p2_compress(byte out[96], const blst_p2 *in);
void blst_p2_affine_serialize(byte out[192], const blst_p2_affine *in);
void blst_p2_affine_compress(byte out[96], const blst_p2_affine *in);
BLST_ERROR blst_p2_uncompress(blst_p2_affine *out, const byte in[96]);
BLST_ERROR blst_p2_deserialize(blst_p2_affine *out, const byte in[192]);

/*
 * Specification defines two variants, 'minimal-signature-size' and
 * 'minimal-pubkey-size'. To unify appearance we choose to distinguish
 * them by suffix referring to the public key type, more specifically
 * _pk_in_g1 corresponds to 'minimal-pubkey-size' and _pk_in_g2 - to
 * 'minimal-signature-size'. It might appear a bit counterintuitive
 * in sign call, but no matter how you twist it, something is bound to
 * turn a little odd.
 */
/*
 * Secret-key operations.
 */
void blst_keygen(blst_scalar *out_SK, const byte *IKM, size_t IKM_len, const byte *info DEFNULL, size_t info_len DEFNULL);
void blst_sk_to_pk_in_g1(blst_p1 *out_pk, const blst_scalar *SK);
void blst_sign_pk_in_g1(blst_p2 *out_sig, const blst_p2 *hash, const blst_scalar *SK);
void blst_sk_to_pk_in_g2(blst_p2 *out_pk, const blst_scalar *SK);
void blst_sign_pk_in_g2(blst_p1 *out_sig, const blst_p1 *hash, const blst_scalar *SK);

/*
 * Pairing interface.
 */
#ifndef SWIG
void blst_miller_loop(blst_fp12 *ret, const blst_p2_affine *Q, const blst_p1_affine *P);
void blst_miller_loop_n(blst_fp12 *ret, const blst_p2_affine *const Qs[], const blst_p1_affine *const Ps[], size_t n);
void blst_final_exp(blst_fp12 *ret, const blst_fp12 *f);
void blst_precompute_lines(blst_fp6 Qlines[68], const blst_p2_affine *Q);
void blst_miller_loop_lines(blst_fp12 *ret, const blst_fp6 Qlines[68], const blst_p1_affine *P);
bool blst_fp12_finalverify(const blst_fp12 *gt1, const blst_fp12 *gt2);
#endif

#ifdef __BLST_CGO__
typedef limb_t blst_pairing;
#elif defined(__BLST_RUST_BINDGEN__)
typedef struct {} blst_pairing;
#else
typedef struct blst_opaque blst_pairing;
#endif

size_t blst_pairing_sizeof(void);
void blst_pairing_init(blst_pairing *new_ctx, bool hash_or_encode, const byte *DST DEFNULL, size_t DST_len DEFNULL);
const byte *blst_pairing_get_dst(const blst_pairing *ctx);
void blst_pairing_commit(blst_pairing *ctx);
BLST_ERROR blst_pairing_aggregate_pk_in_g2(blst_pairing *ctx, const blst_p2_affine *PK, const blst_p1_affine *signature, const byte *msg, size_t msg_len, const byte *aug DEFNULL, size_t aug_len DEFNULL);
BLST_ERROR blst_pairing_chk_n_aggr_pk_in_g2(blst_pairing *ctx, const blst_p2_affine *PK, bool pk_grpchk, const blst_p1_affine *signature, bool sig_grpchk, const byte *msg, size_t msg_len, const byte *aug DEFNULL, size_t aug_len DEFNULL);
BLST_ERROR blst_pairing_mul_n_aggregate_pk_in_g2(blst_pairing *ctx, const blst_p2_affine *PK, const blst_p1_affine *sig, const byte *scalar, size_t nbits, const byte *msg, size_t msg_len, const byte *aug DEFNULL, size_t aug_len DEFNULL);
BLST_ERROR blst_pairing_chk_n_mul_n_aggr_pk_in_g2(blst_pairing *ctx, const blst_p2_affine *PK, bool pk_grpchk, const blst_p1_affine *sig, bool sig_grpchk, const byte *scalar, size_t nbits, const
byte *msg, size_t msg_len, const byte *aug DEFNULL, size_t aug_len DEFNULL);
BLST_ERROR blst_pairing_aggregate_pk_in_g1(blst_pairing *ctx, const blst_p1_affine *PK, const blst_p2_affine *signature, const byte *msg, size_t msg_len, const byte *aug DEFNULL, size_t aug_len DEFNULL);
BLST_ERROR blst_pairing_chk_n_aggr_pk_in_g1(blst_pairing *ctx, const blst_p1_affine *PK, bool pk_grpchk, const blst_p2_affine *signature, bool sig_grpchk, const byte *msg, size_t msg_len, const byte *aug DEFNULL, size_t aug_len DEFNULL);
BLST_ERROR blst_pairing_mul_n_aggregate_pk_in_g1(blst_pairing *ctx, const blst_p1_affine *PK, const blst_p2_affine *sig, const byte *scalar, size_t nbits, const byte *msg, size_t msg_len, const byte *aug DEFNULL, size_t aug_len DEFNULL);
BLST_ERROR blst_pairing_chk_n_mul_n_aggr_pk_in_g1(blst_pairing *ctx, const blst_p1_affine *PK, bool pk_grpchk, const blst_p2_affine *sig, bool sig_grpchk, const byte *scalar, size_t nbits, const byte *msg, size_t msg_len, const byte *aug DEFNULL, size_t aug_len DEFNULL);
BLST_ERROR blst_pairing_merge(blst_pairing *ctx, const blst_pairing *ctx1);
bool blst_pairing_finalverify(const blst_pairing *ctx, const blst_fp12 *gtsig DEFNULL);

/*
 * Customarily applications aggregate signatures separately. In which
 * case application would have to pass NULLs for |signature| to
 * blst_pairing_aggregate calls and pass aggregated signature collected
 * with these calls to blst_pairing_finalverify. Inputs are Zcash-
 * compatible "straight-from-wire" byte vectors, compressed or not.
 */
BLST_ERROR blst_aggregate_in_g1(blst_p1 *out, const blst_p1 *in, const byte *zwire);
BLST_ERROR blst_aggregate_in_g2(blst_p2 *out, const blst_p2 *in, const byte *zwire);
void blst_aggregated_in_g1(blst_fp12 *out, const blst_p1_affine *signature);
void blst_aggregated_in_g2(blst_fp12 *out, const blst_p2_affine *signature);

/*
 * "One-shot" CoreVerify entry points.
 */
BLST_ERROR blst_core_verify_pk_in_g1(const blst_p1_affine *pk, const blst_p2_affine *signature, bool hash_or_encode, const byte *msg, size_t msg_len, const byte *DST DEFNULL, size_t DST_len DEFNULL, const byte *aug DEFNULL, size_t aug_len DEFNULL);
BLST_ERROR blst_core_verify_pk_in_g2(const blst_p2_affine *pk, const blst_p1_affine *signature, bool hash_or_encode, const byte *msg, size_t msg_len, const byte *DST DEFNULL, size_t DST_len DEFNULL, const byte *aug DEFNULL, size_t aug_len DEFNULL);

extern const blst_p1_affine BLS12_381_G1;
extern const blst_p1_affine BLS12_381_NEG_G1;
extern const blst_p2_affine BLS12_381_G2;
extern const blst_p2_affine BLS12_381_NEG_G2;

#include "blst_aux.h"

#ifdef __cplusplus
}
#elif defined(__blst_h_bool__)
# undef __blst_h_bool__
# undef bool
#endif
#endif
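/*
 * Editor's note: a minimal end-to-end sketch of the 'minimal-pubkey-size'
 * flow declared above -- public key in G1, signature in G2. It is
 * illustrative only and not part of this header; |ikm| is a hypothetical
 * caller-supplied buffer holding at least 32 bytes of entropy, and |dst|
 * is the conventional ciphersuite tag for the hash-to-G2 basic scheme.
 */
#if 0
static int sign_and_verify_demo(const byte ikm[32], const byte *msg, size_t msg_len)
{
    static const byte dst[] = "BLS_SIG_BLS12381G2_XMD:SHA-256_SSWU_RO_NUL_";
    blst_scalar sk;
    blst_p1 pk;
    blst_p1_affine pk_aff;
    blst_p2 msg_hash, sig;
    blst_p2_affine sig_aff;

    blst_keygen(&sk, ikm, 32, NULL, 0);        /* HKDF-based key generation */
    blst_sk_to_pk_in_g1(&pk, &sk);             /* public key lives in G1 */
    blst_p1_to_affine(&pk_aff, &pk);

    blst_hash_to_g2(&msg_hash, msg, msg_len, dst, sizeof(dst)-1, NULL, 0);
    blst_sign_pk_in_g1(&sig, &msg_hash, &sk);  /* signature lives in G2 */
    blst_p2_to_affine(&sig_aff, &sig);

    /* |true| selects hash_to (as used above) rather than encode_to */
    return blst_core_verify_pk_in_g1(&pk_aff, &sig_aff, true,
                                     msg, msg_len, dst, sizeof(dst)-1,
                                     NULL, 0) == BLST_SUCCESS;
}
#endif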
================================================ FILE: bindings/blst.hpp ================================================
/*
 * Copyright Supranational LLC
 * Licensed under the Apache License, Version 2.0, see LICENSE for details.
 * SPDX-License-Identifier: Apache-2.0
 */
#ifndef __BLST_HPP__
#define __BLST_HPP__

#if !defined(SWIG) && __cplusplus < 201103L \
                   && (!defined(_MSVC_LANG) || _MSVC_LANG < 201103L)
# error "C++11 or later is required to compile /bindings/blst.hpp"
#endif

#include <memory>
#include <vector>
#include <string>
#include <cstring>

namespace blst {

#ifdef __clang__
# pragma GCC diagnostic push
# pragma GCC diagnostic ignored "-Wextern-c-compat"
#endif
#include "blst.h"
#ifdef __clang__
# pragma GCC diagnostic pop
#endif

struct bytes_t {
    const byte* ptr;
    size_t len;

    bytes_t() = default;
    bytes_t(const byte* p, size_t l) : ptr{p}, len{l} {}
    template<template<typename, typename...> class C, typename T>
    bytes_t(const C<T>& c)
    {   static_assert(sizeof(T) == 1, "unsupported type");
        ptr = reinterpret_cast<const byte*>(c.data());
        len = c.size();
    }
    template<template<typename, size_t> class C, typename T, size_t N>
    bytes_t(const C<T, N>& c)
    {   static_assert(sizeof(T) == 1, "unsupported type");
        ptr = reinterpret_cast<const byte*>(c.data());
        len = c.size();
    }
};

class P1_Affine;
class P1;
class P2_Affine;
class P2;
class Pairing;

inline const byte *C_bytes(const void *ptr)
{   return static_cast<const byte*>(ptr);   }

/*
 * As for SecretKey being struct and not class, and lack of constructors
 * with one accepting for example |IKM|. We can't make assumptions about
 * application's policy toward handling secret key material. Hence it's
 * argued that application is entitled for transparent structure, not
 * opaque or semi-opaque class. And in the context it's appropriate not
 * to "entice" developers with idiomatic constructors:-) Though this
 * doesn't really apply to SWIG-assisted interfaces...
 */
struct SecretKey {
#ifdef SWIG
private:
#endif
    blst_scalar key;
#ifdef SWIG
public:
#endif

#ifndef SWIG
    void keygen(const byte* IKM, size_t IKM_len, const std::string& info = "")
    {   blst_keygen(&key, IKM, IKM_len, C_bytes(info.data()), info.size());   }
    void keygen_v3(const byte* IKM, size_t IKM_len, const std::string& info = "")
    {   blst_keygen_v3(&key, IKM, IKM_len, C_bytes(info.data()), info.size());   }
    void keygen_v4_5(const byte* IKM, size_t IKM_len, const byte* salt, size_t salt_len, const std::string& info = "")
    {   blst_keygen_v4_5(&key, IKM, IKM_len, salt, salt_len, C_bytes(info.data()), info.size());   }
    void keygen_v5(const byte* IKM, size_t IKM_len, const byte* salt, size_t salt_len, const std::string& info = "")
    {   blst_keygen_v5(&key, IKM, IKM_len, salt, salt_len, C_bytes(info.data()), info.size());   }
#endif
    void keygen(bytes_t IKM, const std::string& info = "")
    {   keygen(IKM.ptr, IKM.len, info);   }
    void keygen_v3(bytes_t IKM, const std::string& info = "")
    {   keygen_v3(IKM.ptr, IKM.len, info);   }
    void keygen_v4_5(bytes_t IKM, bytes_t salt, const std::string& info = "")
    {   keygen_v4_5(IKM.ptr, IKM.len, salt.ptr, salt.len, info);   }
    void keygen_v5(bytes_t IKM, bytes_t salt, const std::string& info = "")
    {   keygen_v5(IKM.ptr, IKM.len, salt.ptr, salt.len, info);   }
    void derive_master_eip2333(const byte* IKM, size_t IKM_len)
    {   blst_derive_master_eip2333(&key, IKM, IKM_len);   }
    void derive_child_eip2333(const SecretKey& SK, unsigned int child_index)
    {   blst_derive_child_eip2333(&key, &SK.key, child_index);   }

    void from_bendian(const byte in[32]) {   blst_scalar_from_bendian(&key, in);   }
    void from_lendian(const byte in[32]) {   blst_scalar_from_lendian(&key, in);   }
    void to_bendian(byte out[32]) const {   blst_bendian_from_scalar(out, &key);   }
    void to_lendian(byte out[32]) const {   blst_lendian_from_scalar(out, &key);   }
};

class Scalar {
private:
    blst_scalar val;

public:
    Scalar() {   memset(&val, 0, sizeof(val));   }
    Scalar(const byte* scalar, size_t nbits) {   blst_scalar_from_le_bytes(&val,
scalar, (nbits+7)/8); } #ifndef SWIG Scalar(const byte *msg, size_t msg_len, const std::string& DST) { (void)hash_to(msg, msg_len, DST); } Scalar* hash_to(const byte *msg, size_t msg_len, const std::string& DST = "") { byte elem[48]; blst_expand_message_xmd(elem, sizeof(elem), msg, msg_len, C_bytes(DST.data()), DST.size()); blst_scalar_from_be_bytes(&val, elem, sizeof(elem)); return this; } #endif Scalar(bytes_t msg, const std::string& DST) { (void)hash_to(msg.ptr, msg.len, DST); } Scalar* hash_to(bytes_t msg, const std::string& DST = "") { return hash_to(msg.ptr, msg.len, DST); } Scalar dup() const { return *this; } Scalar* from_bendian(const byte *msg, size_t msg_len) { blst_scalar_from_be_bytes(&val, msg, msg_len); return this; } Scalar* from_lendian(const byte *msg, size_t msg_len) { blst_scalar_from_le_bytes(&val, msg, msg_len); return this; } void to_bendian(byte out[32]) const { blst_bendian_from_scalar(out, &val); } void to_lendian(byte out[32]) const { blst_lendian_from_scalar(out, &val); } Scalar* add(const Scalar& a) { if (!blst_sk_add_n_check(&val, &val, a)) throw BLST_BAD_SCALAR; return this; } Scalar* add(const SecretKey& a) { if (!blst_sk_add_n_check(&val, &val, &a.key)) throw BLST_BAD_SCALAR; return this; } Scalar* sub(const Scalar& a) { if (!blst_sk_sub_n_check(&val, &val, a)) throw BLST_BAD_SCALAR; return this; } Scalar* mul(const Scalar& a) { if (!blst_sk_mul_n_check(&val, &val, a)) throw BLST_BAD_SCALAR; return this; } Scalar* inverse() { blst_sk_inverse(&val, &val); return this; } private: friend class P1; friend class P2; operator const blst_scalar*() const { return &val; } operator const byte*() const { return val.b; } }; class P1_Affine { private: blst_p1_affine point; P1_Affine(const blst_p1_affine *cptr) { point = *cptr; } public: P1_Affine() { memset(&point, 0, sizeof(point)); } #ifndef SWIG P1_Affine(const byte *in) { BLST_ERROR err = blst_p1_deserialize(&point, in); if (err != BLST_SUCCESS) throw err; } #endif P1_Affine(const byte *in, size_t len) { if (len == 0 || len != (in[0]&0x80 ? 
48 : 96)) throw BLST_BAD_ENCODING; BLST_ERROR err = blst_p1_deserialize(&point, in); if (err != BLST_SUCCESS) throw err; } P1_Affine(const P1& jacobian); P1_Affine dup() const { return *this; } P1 to_jacobian() const; void serialize(byte out[96]) const { blst_p1_affine_serialize(out, &point); } void compress(byte out[48]) const { blst_p1_affine_compress(out, &point); } bool on_curve() const { return blst_p1_affine_on_curve(&point); } bool in_group() const { return blst_p1_affine_in_g1(&point); } bool is_inf() const { return blst_p1_affine_is_inf(&point); } bool is_equal(const P1_Affine& p) const { return blst_p1_affine_is_equal(&point, &p.point); } #ifndef SWIG BLST_ERROR core_verify(const P2_Affine& pk, bool hash_or_encode, const byte* msg, size_t msg_len, const std::string& DST = "", const byte* aug = nullptr, size_t aug_len = 0) const; #endif BLST_ERROR core_verify(const P2_Affine& pk, bool hash_or_encode, bytes_t msg, const std::string& DST = "", bytes_t aug = {nullptr, 0}) const { return core_verify(pk, hash_or_encode, msg.ptr, msg.len, DST, aug.ptr, aug.len); } static P1_Affine generator() { return P1_Affine(blst_p1_affine_generator()); } private: friend class Pairing; friend class P2_Affine; friend class PT; friend class P1; friend class P1_Affines; operator const blst_p1_affine*() const { return &point; } operator blst_p1_affine*() { return &point; } }; class P1 { private: blst_p1 point; P1(const blst_p1 *cptr) { point = *cptr; } public: P1() { memset(&point, 0, sizeof(point)); } P1(const SecretKey& sk) { blst_sk_to_pk_in_g1(&point, &sk.key); } #ifndef SWIG P1(const byte *in) { blst_p1_affine a; BLST_ERROR err = blst_p1_deserialize(&a, in); if (err != BLST_SUCCESS) throw err; blst_p1_from_affine(&point, &a); } #endif P1(const byte *in, size_t len) { if (len == 0 || len != (in[0]&0x80 ? 
48 : 96))
            throw BLST_BAD_ENCODING;
        blst_p1_affine a;
        BLST_ERROR err = blst_p1_deserialize(&a, in);
        if (err != BLST_SUCCESS)
            throw err;
        blst_p1_from_affine(&point, &a);
    }
    P1(const P1_Affine& affine) {   blst_p1_from_affine(&point, affine);   }

    P1 dup() const {   return *this;   }
    P1_Affine to_affine() const {   return P1_Affine(*this);   }
    void serialize(byte out[96]) const {   blst_p1_serialize(out, &point);   }
    void compress(byte out[48]) const {   blst_p1_compress(out, &point);   }
    bool on_curve() const {   return blst_p1_on_curve(&point);   }
    bool in_group() const {   return blst_p1_in_g1(&point);   }
    bool is_inf() const {   return blst_p1_is_inf(&point);   }
    bool is_equal(const P1& p) const {   return blst_p1_is_equal(&point, &p.point);   }
    void aggregate(const P1_Affine& in)
    {   if (blst_p1_affine_in_g1(in))
            blst_p1_add_or_double_affine(&point, &point, in);
        else
            throw BLST_POINT_NOT_IN_GROUP;
    }
    P1* sign_with(const SecretKey& sk)
    {   blst_sign_pk_in_g2(&point, &point, &sk.key); return this;   }
    P1* sign_with(const Scalar& scalar)
    {   blst_sign_pk_in_g2(&point, &point, scalar); return this;   }
    P1* hash_to(bytes_t msg, const std::string& DST = "", bytes_t aug = {nullptr, 0})
    {   blst_hash_to_g1(&point, msg.ptr, msg.len, C_bytes(DST.data()), DST.size(), aug.ptr, aug.len);
        return this;
    }
    P1* encode_to(bytes_t msg, const std::string& DST = "", bytes_t aug = {nullptr, 0})
    {   blst_encode_to_g1(&point, msg.ptr, msg.len, C_bytes(DST.data()), DST.size(), aug.ptr, aug.len);
        return this;
    }
#ifndef SWIG
    P1* hash_to(const byte* msg, size_t msg_len, const std::string& DST = "", const byte* aug = nullptr, size_t aug_len = 0)
    {   blst_hash_to_g1(&point, msg, msg_len, C_bytes(DST.data()), DST.size(), aug, aug_len);
        return this;
    }
    P1* encode_to(const byte* msg, size_t msg_len, const std::string& DST = "", const byte* aug = nullptr, size_t aug_len = 0)
    {   blst_encode_to_g1(&point, msg, msg_len, C_bytes(DST.data()), DST.size(), aug, aug_len);
        return this;
    }
#endif
    P1* mult(const byte* scalar, size_t nbits)
    {   blst_p1_mult(&point, &point, scalar, nbits); return this;   }
    P1* mult(const Scalar& scalar)
    {   blst_p1_mult(&point, &point, scalar, 255); return this;   }
    P1* cneg(bool flag) {   blst_p1_cneg(&point, flag); return this;   }
    P1* neg() {   blst_p1_cneg(&point, true); return this;   }
    P1* add(const P1& a) {   blst_p1_add_or_double(&point, &point, a); return this;   }
    P1* add(const P1_Affine &a) {   blst_p1_add_or_double_affine(&point, &point, a); return this;   }
    P1* dbl() {   blst_p1_double(&point, &point); return this;   }
#ifndef SWIG
    static P1 add(const P1& a, const P1& b)
    {   P1 ret; blst_p1_add_or_double(ret, a, b); return ret;   }
    static P1 add(const P1& a, const P1_Affine& b)
    {   P1 ret; blst_p1_add_or_double_affine(ret, a, b); return ret;   }
    static P1 dbl(const P1& a) {   P1 ret; blst_p1_double(ret, a); return ret;   }
#endif
    static P1 generator() {   return P1(blst_p1_generator());   }

private:
    friend class P1_Affine;
    friend class P1_Affines;
    operator const blst_p1*() const {   return &point;   }
    operator blst_p1*() {   return &point;   }
};
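/*
 * Editor's note: illustrative sketch (not part of this header) of the P1
 * wrapper above -- scalar-multiply the generator and round-trip the result
 * through its 48-byte compressed encoding.
 */
#if 0
static bool p1_roundtrip_demo()
{
    byte le_scalar[32] = {0};
    le_scalar[0] = 2;                  // little-endian scalar "2"

    P1 p = P1::generator();
    p.mult(le_scalar, 255);            // p = 2*G1

    byte comp[48];
    p.compress(comp);                  // Zcash-style compressed encoding

    P1 q(comp, sizeof(comp));          // throws BLST_ERROR on malformed input
    return p.is_equal(q) && q.in_group();
}
#endif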
class P1_Affines {
private:
    struct p1_affine_no_init {
        blst_p1_affine point;
        p1_affine_no_init() { }
        operator blst_p1_affine*() {   return &point;   }
        operator const blst_p1_affine*() const {   return &point;   }
    };
    std::vector<p1_affine_no_init> table;
    size_t wbits, npoints;

public:
#ifndef SWIG
    P1_Affines() {}
    P1_Affines(size_t wbits, const P1_Affine* const points[], size_t npoints)
    {   this->wbits = wbits;
        this->npoints = npoints;
        table.resize(npoints << (wbits-1));
        blst_p1s_mult_wbits_precompute(table.at(0), wbits,
                                       reinterpret_cast<const blst_p1_affine *const *>(points),
                                       npoints);
    }
    P1_Affines(size_t wbits, const P1_Affine points[], size_t npoints)
    {   const P1_Affine* const ptrs[2] = { points, nullptr };
        P1_Affines(wbits, ptrs, npoints);
    }
    P1_Affines(size_t wbits, const std::vector<P1_Affine>& points)
    {   P1_Affines(wbits, &points.at(0), points.size());   }
    P1_Affines(size_t wbits, const P1* const points[], size_t npoints)
    {   size_t cap = npoints << (wbits-1);
        this->wbits = wbits;
        this->npoints = npoints;
        table.resize(cap);
        blst_p1s_to_affine(table.at(cap-npoints),
                           reinterpret_cast<const blst_p1 *const *>(points), npoints);
        const blst_p1_affine* const ptrs[2] = { table[cap-npoints], nullptr };
        blst_p1s_mult_wbits_precompute(table[0], wbits, ptrs, npoints);
    }
    P1_Affines(size_t wbits, const P1 points[], size_t npoints)
    {   const P1* const ptrs[2] = { points, nullptr };
        P1_Affines(wbits, ptrs, npoints);
    }
    P1_Affines(size_t wbits, const std::vector<P1>& points)
    {   P1_Affines(wbits, &points.at(0), points.size());   }
    P1_Affines(const P1* const points[], size_t npoints)
    {   this->wbits = 0;
        this->npoints = npoints;
        table.resize(npoints);
        blst_p1s_to_affine(table.at(0),
                           reinterpret_cast<const blst_p1 *const *>(points), npoints);
    }
    P1_Affines(const P1 points[], size_t npoints)
    {   const P1* const ptrs[2] = { points, nullptr };
        P1_Affines(ptrs, npoints);
    }
    P1_Affines(const std::vector<P1>& points)
    {   P1_Affines(&points.at(0), points.size());   }

    P1 mult(const byte* const scalars[], size_t nbits) const
    {   P1 ret;
        if (wbits != 0) {
            size_t sz = blst_p1s_mult_wbits_scratch_sizeof(npoints);
            std::unique_ptr<limb_t[]> scratch{new limb_t[sz/sizeof(limb_t)]};
            blst_p1s_mult_wbits(ret, table.at(0), wbits, npoints, scalars, nbits, scratch.get());
        } else {
            size_t sz = blst_p1s_mult_pippenger_scratch_sizeof(npoints);
            std::unique_ptr<limb_t[]> scratch{new limb_t[sz/sizeof(limb_t)]};
            const blst_p1_affine* const ptrs[2] = { table.at(0), nullptr };
            blst_p1s_mult_pippenger(ret, ptrs, npoints, scalars, nbits, scratch.get());
        }
        return ret;
    }

    static std::vector<P1_Affine> from(const P1* const points[], size_t npoints)
    {   std::vector<P1_Affine> ret;
        ret.resize(npoints);
        blst_p1s_to_affine(reinterpret_cast<blst_p1_affine*>(&ret.at(0)),
                           reinterpret_cast<const blst_p1 *const *>(points), npoints);
        return ret;
    }
    static std::vector<P1_Affine> from(const P1 points[], size_t npoints)
    {   const P1* const ptrs[2] = { points, nullptr };
        return from(ptrs, npoints);
    }
    static std::vector<P1_Affine> from(const std::vector<P1>& points)
    {   return from(&points.at(0), points.size());   }
#endif
    static P1 mult_pippenger(const P1_Affine* const points[], size_t npoints, const byte* const scalars[], size_t nbits)
    {   P1 ret;
        size_t sz = blst_p1s_mult_pippenger_scratch_sizeof(npoints);
        std::unique_ptr<limb_t[]> scratch{new limb_t[sz/sizeof(limb_t)]};
        blst_p1s_mult_pippenger(ret, reinterpret_cast<const blst_p1_affine *const *>(points),
                                npoints, scalars, nbits, scratch.get());
        return ret;
    }
#ifndef SWIG
    static P1 mult_pippenger(const P1_Affine points[], size_t npoints, const byte* const scalars[], size_t nbits)
    {   const P1_Affine* const ptrs[2] = { points, nullptr };
        return mult_pippenger(ptrs, npoints, scalars, nbits);
    }
    static P1 mult_pippenger(const std::vector<P1_Affine>& points, const byte* const scalars[], size_t nbits)
    {   return mult_pippenger(&points.at(0), points.size(), scalars, nbits);   }
#endif
    static P1 add(const P1_Affine* const points[], size_t npoints)
    {   P1 ret;
        blst_p1s_add(ret, reinterpret_cast<const blst_p1_affine *const *>(points), npoints);
        return ret;
    }
#ifndef SWIG
    static P1 add(const P1_Affine points[], size_t npoints)
    {   const P1_Affine* const ptrs[2] = { points, nullptr };
        return add(ptrs, npoints);
    }
    static P1 add(const std::vector<P1_Affine>& points)
    {   return add(&points.at(0), points.size());   }
#endif
};
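/*
 * Editor's note: illustrative sketch (not part of this header) of the
 * P1_Affines helper above. A non-zero |wbits| makes the constructor build
 * a precomputed window table, so repeated multi-scalar multiplications
 * over the same base set skip the per-call setup; the nullptr-terminated
 * two-element pointer vector denotes one contiguous scalar buffer.
 */
#if 0
static P1 fixed_base_msm_demo(const P1_Affine* const bases[], size_t n,
                              const byte scalars_le[])   // n*32 bytes, little-endian
{
    P1_Affines precomp(/*wbits=*/8, bases, n);   // one-time table build; cache for reuse
    const byte* const scalar_ptrs[2] = { scalars_le, nullptr };
    return precomp.mult(scalar_ptrs, 255);
}
#endif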
class P2_Affine {
private:
    blst_p2_affine point;

    P2_Affine(const blst_p2_affine *cptr) {   point = *cptr;   }

public:
    P2_Affine() {   memset(&point, 0, sizeof(point));   }
#ifndef SWIG
    P2_Affine(const byte *in)
    {   BLST_ERROR err = blst_p2_deserialize(&point, in);
        if (err != BLST_SUCCESS)
            throw err;
    }
#endif
    P2_Affine(const byte *in, size_t len)
    {   if (len == 0 || len != (in[0]&0x80 ? 96 : 192))
            throw BLST_BAD_ENCODING;
        BLST_ERROR err = blst_p2_deserialize(&point, in);
        if (err != BLST_SUCCESS)
            throw err;
    }
    P2_Affine(const P2& jacobian);

    P2_Affine dup() const {   return *this;   }
    P2 to_jacobian() const;
    void serialize(byte out[192]) const {   blst_p2_affine_serialize(out, &point);   }
    void compress(byte out[96]) const {   blst_p2_affine_compress(out, &point);   }
    bool on_curve() const {   return blst_p2_affine_on_curve(&point);   }
    bool in_group() const {   return blst_p2_affine_in_g2(&point);   }
    bool is_inf() const {   return blst_p2_affine_is_inf(&point);   }
    bool is_equal(const P2_Affine& p) const {   return blst_p2_affine_is_equal(&point, &p.point);   }
#ifndef SWIG
    BLST_ERROR core_verify(const P1_Affine& pk, bool hash_or_encode, const byte* msg, size_t msg_len, const std::string& DST = "", const byte* aug = nullptr, size_t aug_len = 0) const;
#endif
    BLST_ERROR core_verify(const P1_Affine& pk, bool hash_or_encode, bytes_t msg, const std::string& DST = "", bytes_t aug = {nullptr, 0}) const
    {   return core_verify(pk, hash_or_encode, msg.ptr, msg.len, DST, aug.ptr, aug.len);   }

    static P2_Affine generator() {   return P2_Affine(blst_p2_affine_generator());   }

private:
    friend class Pairing;
    friend class P1_Affine;
    friend class PT;
    friend class P2;
    friend class P2_Affines;
    operator const blst_p2_affine*() const {   return &point;   }
    operator blst_p2_affine*() {   return &point;   }
};

class P2 {
private:
    blst_p2 point;

    P2(const blst_p2 *cptr) {   point = *cptr;   }

public:
    P2() {   memset(&point, 0, sizeof(point));   }
    P2(const SecretKey& sk) {   blst_sk_to_pk_in_g2(&point, &sk.key);   }
#ifndef SWIG
    P2(const byte *in)
    {   blst_p2_affine a;
        BLST_ERROR err = blst_p2_deserialize(&a, in);
        if (err != BLST_SUCCESS)
            throw err;
        blst_p2_from_affine(&point, &a);
    }
#endif
96 : 192)) throw BLST_BAD_ENCODING; blst_p2_affine a; BLST_ERROR err = blst_p2_deserialize(&a, in); if (err != BLST_SUCCESS) throw err; blst_p2_from_affine(&point, &a); } P2(const P2_Affine& affine) { blst_p2_from_affine(&point, affine); } P2 dup() const { return *this; } P2_Affine to_affine() const { return P2_Affine(*this); } void serialize(byte out[192]) const { blst_p2_serialize(out, &point); } void compress(byte out[96]) const { blst_p2_compress(out, &point); } bool on_curve() const { return blst_p2_on_curve(&point); } bool in_group() const { return blst_p2_in_g2(&point); } bool is_inf() const { return blst_p2_is_inf(&point); } bool is_equal(const P2& p) const { return blst_p2_is_equal(&point, &p.point); } void aggregate(const P2_Affine& in) { if (blst_p2_affine_in_g2(in)) blst_p2_add_or_double_affine(&point, &point, in); else throw BLST_POINT_NOT_IN_GROUP; } P2* sign_with(const SecretKey& sk) { blst_sign_pk_in_g1(&point, &point, &sk.key); return this; } P2* sign_with(const Scalar& scalar) { blst_sign_pk_in_g1(&point, &point, scalar); return this; } P2* hash_to(bytes_t msg, const std::string& DST = "", bytes_t aug = {nullptr, 0}) { blst_hash_to_g2(&point, msg.ptr, msg.len, C_bytes(DST.data()), DST.size(), aug.ptr, aug.len); return this; } P2* encode_to(bytes_t msg, const std::string& DST = "", bytes_t aug = {nullptr, 0}) { blst_encode_to_g2(&point, msg.ptr, msg.len, C_bytes(DST.data()), DST.size(), aug.ptr, aug.len); return this; } #ifndef SWIG P2* hash_to(const byte* msg, size_t msg_len, const std::string& DST = "", const byte* aug = nullptr, size_t aug_len = 0) { blst_hash_to_g2(&point, msg, msg_len, C_bytes(DST.data()), DST.size(), aug, aug_len); return this; } P2* encode_to(const byte* msg, size_t msg_len, const std::string& DST = "", const byte* aug = nullptr, size_t aug_len = 0) { blst_encode_to_g2(&point, msg, msg_len, C_bytes(DST.data()), DST.size(), aug, aug_len); return this; } #endif P2* mult(const byte* scalar, size_t nbits) { blst_p2_mult(&point, &point, scalar, nbits); return this; } P2* mult(const Scalar& scalar) { blst_p2_mult(&point, &point, scalar, 255); return this; } P2* cneg(bool flag) { blst_p2_cneg(&point, flag); return this; } P2* neg() { blst_p2_cneg(&point, true); return this; } P2* add(const P2& a) { blst_p2_add_or_double(&point, &point, a); return this; } P2* add(const P2_Affine &a) { blst_p2_add_or_double_affine(&point, &point, a); return this; } P2* dbl() { blst_p2_double(&point, &point); return this; } #ifndef SWIG static P2 add(const P2& a, const P2& b) { P2 ret; blst_p2_add_or_double(ret, a, b); return ret; } static P2 add(const P2& a, const P2_Affine& b) { P2 ret; blst_p2_add_or_double_affine(ret, a, b); return ret; } static P2 dbl(const P2& a) { P2 ret; blst_p2_double(ret, a); return ret; } #endif static P2 generator() { return P2(blst_p2_generator()); } private: friend class P2_Affine; friend class P2_Affines; operator const blst_p2*() const { return &point; } operator blst_p2*() { return &point; } }; class P2_Affines { private: struct p2_affine_no_init { blst_p2_affine point; p2_affine_no_init() { } operator blst_p2_affine*() { return &point; } operator const blst_p2_affine*() const { return &point; } }; std::vector table; size_t wbits, npoints; public: #ifndef SWIG P2_Affines() {} P2_Affines(size_t wbits, const P2_Affine* const points[], size_t npoints) { this->wbits = wbits; this->npoints = npoints; table.resize(npoints << (wbits-1)); blst_p2s_mult_wbits_precompute(table.at(0), wbits, reinterpret_cast(points), npoints); } P2_Affines(size_t wbits, 
class P2_Affines {
private:
    struct p2_affine_no_init {
        blst_p2_affine point;
        p2_affine_no_init() { }
        operator blst_p2_affine*() {   return &point;   }
        operator const blst_p2_affine*() const {   return &point;   }
    };
    std::vector<p2_affine_no_init> table;
    size_t wbits, npoints;

public:
#ifndef SWIG
    P2_Affines() {}
    P2_Affines(size_t wbits, const P2_Affine* const points[], size_t npoints)
    {   this->wbits = wbits;
        this->npoints = npoints;
        table.resize(npoints << (wbits-1));
        blst_p2s_mult_wbits_precompute(table.at(0), wbits,
                                       reinterpret_cast<const blst_p2_affine *const *>(points),
                                       npoints);
    }
    P2_Affines(size_t wbits, const P2_Affine points[], size_t npoints)
    {   const P2_Affine* const ptrs[2] = { points, nullptr };
        P2_Affines(wbits, ptrs, npoints);
    }
    P2_Affines(size_t wbits, const std::vector<P2_Affine>& points)
    {   P2_Affines(wbits, &points.at(0), points.size());   }
    P2_Affines(size_t wbits, const P2* const points[], size_t npoints)
    {   size_t cap = npoints << (wbits-1);
        this->wbits = wbits;
        this->npoints = npoints;
        table.resize(cap);
        blst_p2s_to_affine(table.at(cap-npoints),
                           reinterpret_cast<const blst_p2 *const *>(points), npoints);
        const blst_p2_affine* const ptrs[2] = { table[cap-npoints], nullptr };
        blst_p2s_mult_wbits_precompute(table[0], wbits, ptrs, npoints);
    }
    P2_Affines(size_t wbits, const P2 points[], size_t npoints)
    {   const P2* const ptrs[2] = { points, nullptr };
        P2_Affines(wbits, ptrs, npoints);
    }
    P2_Affines(size_t wbits, const std::vector<P2>& points)
    {   P2_Affines(wbits, &points.at(0), points.size());   }
    P2_Affines(const P2* const points[], size_t npoints)
    {   this->wbits = 0;
        this->npoints = npoints;
        table.resize(npoints);
        blst_p2s_to_affine(table.at(0),
                           reinterpret_cast<const blst_p2 *const *>(points), npoints);
    }
    P2_Affines(const P2 points[], size_t npoints)
    {   const P2* const ptrs[2] = { points, nullptr };
        P2_Affines(ptrs, npoints);
    }
    P2_Affines(const std::vector<P2>& points)
    {   P2_Affines(&points.at(0), points.size());   }

    P2 mult(const byte* const scalars[], size_t nbits) const
    {   P2 ret;
        if (wbits != 0) {
            size_t sz = blst_p2s_mult_wbits_scratch_sizeof(npoints);
            std::unique_ptr<limb_t[]> scratch{new limb_t[sz/sizeof(limb_t)]};
            blst_p2s_mult_wbits(ret, table.at(0), wbits, npoints, scalars, nbits, scratch.get());
        } else {
            size_t sz = blst_p2s_mult_pippenger_scratch_sizeof(npoints);
            std::unique_ptr<limb_t[]> scratch{new limb_t[sz/sizeof(limb_t)]};
            const blst_p2_affine* const ptrs[2] = { table.at(0), nullptr };
            blst_p2s_mult_pippenger(ret, ptrs, npoints, scalars, nbits, scratch.get());
        }
        return ret;
    }

    static std::vector<P2_Affine> from(const P2* const points[], size_t npoints)
    {   std::vector<P2_Affine> ret;
        ret.resize(npoints);
        blst_p2s_to_affine(reinterpret_cast<blst_p2_affine*>(&ret.at(0)),
                           reinterpret_cast<const blst_p2 *const *>(points), npoints);
        return ret;
    }
    static std::vector<P2_Affine> from(const P2 points[], size_t npoints)
    {   const P2* const ptrs[2] = { points, nullptr };
        return from(ptrs, npoints);
    }
    static std::vector<P2_Affine> from(const std::vector<P2>& points)
    {   return from(&points.at(0), points.size());   }
#endif
    static P2 mult_pippenger(const P2_Affine* const points[], size_t npoints, const byte* const scalars[], size_t nbits)
    {   P2 ret;
        size_t sz = blst_p2s_mult_pippenger_scratch_sizeof(npoints);
        std::unique_ptr<limb_t[]> scratch{new limb_t[sz/sizeof(limb_t)]};
        blst_p2s_mult_pippenger(ret, reinterpret_cast<const blst_p2_affine *const *>(points),
                                npoints, scalars, nbits, scratch.get());
        return ret;
    }
#ifndef SWIG
    static P2 mult_pippenger(const P2_Affine points[], size_t npoints, const byte* const scalars[], size_t nbits)
    {   const P2_Affine* const ptrs[2] = { points, nullptr };
        return mult_pippenger(ptrs, npoints, scalars, nbits);
    }
    static P2 mult_pippenger(const std::vector<P2_Affine>& points, const byte* const scalars[], size_t nbits)
    {   return mult_pippenger(&points.at(0), points.size(), scalars, nbits);   }
#endif
    static P2 add(const P2_Affine* const points[], size_t npoints)
    {   P2 ret;
        blst_p2s_add(ret, reinterpret_cast<const blst_p2_affine *const *>(points), npoints);
        return ret;
    }
#ifndef SWIG
    static P2 add(const P2_Affine points[], size_t npoints)
    {   const P2_Affine* const ptrs[2] = { points, nullptr };
        return add(ptrs, npoints);
    }
    static P2 add(const std::vector<P2_Affine>& points)
    {   return add(&points.at(0), points.size());   }
#endif
};
inline P1_Affine::P1_Affine(const P1& jacobian)
{   blst_p1_to_affine(&point, jacobian);   }
inline P2_Affine::P2_Affine(const P2& jacobian)
{   blst_p2_to_affine(&point, jacobian);   }

inline P1 P1_Affine::to_jacobian() const {   P1 ret(*this); return ret;   }
inline P2 P2_Affine::to_jacobian() const {   P2 ret(*this); return ret;   }

inline P1 G1() {   return P1::generator();   }
inline P2 G2() {   return P2::generator();   }

#ifndef SWIG
inline BLST_ERROR P1_Affine::core_verify(const P2_Affine& pk, bool hash_or_encode, const byte* msg, size_t msg_len, const std::string& DST, const byte* aug, size_t aug_len) const
{   return blst_core_verify_pk_in_g2(pk, &point, hash_or_encode, msg, msg_len, C_bytes(DST.data()), DST.size(), aug, aug_len);
}
inline BLST_ERROR P2_Affine::core_verify(const P1_Affine& pk, bool hash_or_encode, const byte* msg, size_t msg_len, const std::string& DST, const byte* aug, size_t aug_len) const
{   return blst_core_verify_pk_in_g1(pk, &point, hash_or_encode, msg, msg_len, C_bytes(DST.data()), DST.size(), aug, aug_len);
}
#endif

class PT {
private:
    blst_fp12 value;

    PT(const blst_fp12 *v) {   value = *v;   }

public:
    PT(const P1_Affine& p) {   blst_aggregated_in_g1(&value, p);   }
    PT(const P2_Affine& q) {   blst_aggregated_in_g2(&value, q);   }
    PT(const P2_Affine& q, const P1_Affine& p) {   blst_miller_loop(&value, q, p);   }
    PT(const P1_Affine& p, const P2_Affine& q) : PT(q, p) {}
    PT(const P2& q, const P1& p) {   blst_miller_loop(&value, P2_Affine(q), P1_Affine(p));   }
    PT(const P1& p, const P2& q) : PT(q, p) {}

    PT dup() const {   return *this;   }
    bool is_one() const {   return blst_fp12_is_one(&value);   }
    bool is_equal(const PT& p) const {   return blst_fp12_is_equal(&value, p);   }
    PT* sqr() {   blst_fp12_sqr(&value, &value); return this;   }
    PT* mul(const PT& p) {   blst_fp12_mul(&value, &value, p); return this;   }
    PT* final_exp() {   blst_final_exp(&value, &value); return this;   }
    bool in_group() const {   return blst_fp12_in_group(&value);   }
    void to_bendian(byte out[48*12]) const {   blst_bendian_from_fp12(out, &value);   }

    static bool finalverify(const PT& gt1, const PT& gt2)
    {   return blst_fp12_finalverify(gt1, gt2);   }
    static PT one() {   return PT(blst_fp12_one());   }

private:
    friend class Pairing;
    operator const blst_fp12*() const {   return &value;   }
};

class Pairing {
private:
    operator blst_pairing*() {   return reinterpret_cast<blst_pairing*>(this);   }
    operator const blst_pairing*() const {   return reinterpret_cast<const blst_pairing*>(this);   }

    void init(bool hash_or_encode, const byte* DST, size_t DST_len)
    {   // Copy DST to heap, std::string can be volatile, especially in SWIG:-(
        byte *dst = new byte[DST_len];
        memcpy(dst, DST, DST_len);
        blst_pairing_init(*this, hash_or_encode, dst, DST_len);
    }

public:
#ifndef SWIG
    void* operator new(size_t)
    {   return new uint64_t[blst_pairing_sizeof()/sizeof(uint64_t)];   }
    void operator delete(void *ptr)
    {   delete[] static_cast<uint64_t*>(ptr);   }

    Pairing(bool hash_or_encode, const std::string& DST)
    {   init(hash_or_encode, C_bytes(DST.data()), DST.size());   }
#endif
#ifndef SWIGJAVA
    Pairing(bool hash_or_encode, const byte* DST, size_t DST_len)
    {   init(hash_or_encode, DST, DST_len);   }
    ~Pairing() {   delete[] blst_pairing_get_dst(*this);   }
#endif

    BLST_ERROR aggregate(const P1_Affine* pk, const P2_Affine* sig, bytes_t msg, bytes_t aug = {nullptr, 0})
    {   return blst_pairing_aggregate_pk_in_g1(*this, *pk, *sig, msg.ptr, msg.len, aug.ptr, aug.len);   }
    BLST_ERROR aggregate(const P2_Affine* pk, const P1_Affine* sig, bytes_t msg, bytes_t aug = {nullptr, 0})
    {   return blst_pairing_aggregate_pk_in_g2(*this, *pk, *sig, msg.ptr, msg.len, aug.ptr, aug.len);   }
    BLST_ERROR mul_n_aggregate(const P1_Affine* pk, const P2_Affine* sig, const byte* scalar, size_t nbits, bytes_t msg, bytes_t aug = {nullptr, 0})
    {   return blst_pairing_mul_n_aggregate_pk_in_g1(*this, *pk, *sig, scalar, nbits, msg.ptr, msg.len, aug.ptr, aug.len);   }
    BLST_ERROR mul_n_aggregate(const P2_Affine* pk, const P1_Affine* sig, const byte* scalar, size_t nbits, bytes_t msg, bytes_t aug = {nullptr, 0})
    {   return blst_pairing_mul_n_aggregate_pk_in_g2(*this, *pk, *sig, scalar, nbits, msg.ptr, msg.len, aug.ptr, aug.len);   }
#ifndef SWIG
    BLST_ERROR aggregate(const P1_Affine* pk, const P2_Affine* sig, const byte* msg, size_t msg_len, const byte* aug = nullptr, size_t aug_len = 0)
    {   return blst_pairing_aggregate_pk_in_g1(*this, *pk, *sig, msg, msg_len, aug, aug_len);   }
    BLST_ERROR aggregate(const P2_Affine* pk, const P1_Affine* sig, const byte* msg, size_t msg_len, const byte* aug = nullptr, size_t aug_len = 0)
    {   return blst_pairing_aggregate_pk_in_g2(*this, *pk, *sig, msg, msg_len, aug, aug_len);   }
    BLST_ERROR mul_n_aggregate(const P1_Affine* pk, const P2_Affine* sig, const byte* scalar, size_t nbits, const byte* msg, size_t msg_len, const byte* aug = nullptr, size_t aug_len = 0)
    {   return blst_pairing_mul_n_aggregate_pk_in_g1(*this, *pk, *sig, scalar, nbits, msg, msg_len, aug, aug_len);   }
    BLST_ERROR mul_n_aggregate(const P2_Affine* pk, const P1_Affine* sig, const byte* scalar, size_t nbits, const byte* msg, size_t msg_len, const byte* aug = nullptr, size_t aug_len = 0)
    {   return blst_pairing_mul_n_aggregate_pk_in_g2(*this, *pk, *sig, scalar, nbits, msg, msg_len, aug, aug_len);   }
#endif
    void commit() {   blst_pairing_commit(*this);   }
    BLST_ERROR merge(const Pairing* ctx) {   return blst_pairing_merge(*this, *ctx);   }
    bool finalverify(const PT* sig = nullptr) const
    {   return sig == nullptr ? blst_pairing_finalverify(*this, nullptr)
                              : blst_pairing_finalverify(*this, *sig);
    }
    void raw_aggregate(const P2_Affine* q, const P1_Affine* p)
    {   blst_pairing_raw_aggregate(*this, *q, *p);   }
    PT as_fp12() {   return PT(blst_pairing_as_fp12(*this));   }
};
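/*
 * Editor's note: illustrative sketch (not part of this header) of the
 * Pairing context above, following the aggregate-separately workflow
 * described in bindings/blst.h: each aggregate() call gets a null |sig|,
 * and the collected aggregate signature is supplied to finalverify()
 * via PT. |dst| is a hypothetical caller-chosen ciphersuite tag.
 */
#if 0
static bool aggregate_verify_demo(const std::vector<P1_Affine>& pks,
                                  const std::vector<bytes_t>& msgs,
                                  const P2_Affine& agg_sig,
                                  const std::string& dst)
{
    std::unique_ptr<Pairing> ctx{new Pairing(true, dst)};  // true = hash_to, not encode_to
    for (size_t i = 0; i < pks.size(); i++)
        if (ctx->aggregate(&pks[i], static_cast<const P2_Affine*>(nullptr),
                           msgs[i]) != BLST_SUCCESS)
            return false;
    ctx->commit();
    PT gtsig(agg_sig);            // aggregate signature lifted to GT
    return ctx->finalverify(&gtsig);
}
#endif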
} // namespace blst

#endif

================================================ FILE: bindings/blst.swg ================================================
// Copyright Supranational LLC
// Licensed under the Apache License, Version 2.0, see LICENSE for details.
// SPDX-License-Identifier: Apache-2.0

%module blst
%rename("%(strip:[blst_])s") ""; // prefix is redundant in named module
%include "exception.i"
#ifdef __cplusplus
%include "std_string.i"
%typemap(out) SELF* OUTPUT = SWIGTYPE*; // to be overridden as required
#else
#warning consider using C++ interface
#endif
%include "stdint.i"

%apply const char* { const byte*, const byte[ANY] }
%apply (const char *STRING, size_t LENGTH) { (const byte *STRING, size_t LENGTH) }

#if defined(SWIGPYTHON)

%header %{
#if PY_VERSION_HEX<0x030d0000
/* Tailored polyfill, for example no need to handle |n_bytes| == 0 here */
static Py_ssize_t PyLong_AsNativeBytes(PyObject* v, void* buffer,
                                       Py_ssize_t n_bytes, int flags)
{   return _PyLong_AsByteArray((PyLongObject*)v, (unsigned char*)buffer,
                               n_bytes, flags&1, (flags&4) == 0) < 0 ? -1
                                                                     : n_bytes;
}
# define My_PYLONG_FLAGS (1 | 4 | 8)
#else
# define My_PYLONG_FLAGS (Py_ASNATIVEBYTES_LITTLE_ENDIAN | \
                          Py_ASNATIVEBYTES_UNSIGNED_BUFFER | \
                          Py_ASNATIVEBYTES_REJECT_NEGATIVE)
#endif
#if PY_VERSION_HEX<0x030e0000
static int PyLong_GetSign(PyObject *obj, int *sign)
{   if (!PyLong_Check(obj))
        return -1;
    *sign = _PyLong_Sign(obj);
    return 0;
}
#endif
%}

// some sorcery to allow assignments as output, e.g.
// hash = blst.encode_to_g1(b"foo") %typemap(in, numinputs=0) OBJECT *OUTPUT($1_basetype temp) %{ $1 = &temp; %} %typemap(argout) OBJECT *OUTPUT { PyObject *obj = SWIG_NewPointerObj(memcpy(malloc(sizeof($1_basetype)), $1,sizeof($1_basetype)), $descriptor, SWIG_POINTER_NEW); $result = SWIG_AppendOutput($result, obj); } %apply OBJECT *OUTPUT { blst_p1 *out, blst_p1 *out_pk, blst_p1 *out_sig, blst_p1_affine *out, blst_p1_affine *out_pk, blst_p1_affine *out_sig, blst_p2 *out, blst_p2 *out_pk, blst_p2 *out_sig, blst_p2_affine *out, blst_p2_affine *out_pk, blst_p2_affine *out_sig, blst_scalar *out, blst_scalar *out_SK, blst_fp12 *out } // accept 'bytes' and 'bytearray' as inputs... %typemap(in) const byte* %{ if ($input == Py_None) { $1 = NULL; } else if (PyBytes_Check($input)) { char *buf; Py_ssize_t nbytes; if (PyBytes_AsStringAndSize($input, &buf, &nbytes) < 0) SWIG_exception_fail(SWIG_TypeError, "in method '$symname'"); $1 = ($1_ltype)buf; } else if (PyByteArray_Check($input)) { $1 = ($1_ltype)PyByteArray_AsString($input); } else { SWIG_exception_fail(SWIG_TypeError, "in method '$symname', " "expecting 'bytes' or 'bytearray'"); } %} %typemap(freearg) const byte* "" %typemap(in) const byte[ANY] %{ if (PyBytes_Check($input)) { char *buf; Py_ssize_t nbytes; if (PyBytes_AsStringAndSize($input, &buf, &nbytes) < 0) SWIG_exception_fail(SWIG_TypeError, "in method '$symname'"); if (nbytes != $1_dim0) SWIG_exception_fail(SWIG_ValueError, "in method '$symname', " "expecting $1_dim0 bytes"); $1 = ($1_ltype)buf; } else if (PyByteArray_Check($input)) { if (PyByteArray_Size($input) != $1_dim0) SWIG_exception_fail(SWIG_ValueError, "in method '$symname', " "expecting $1_dim0 bytes"); $1 = ($1_ltype)PyByteArray_AsString($input); } else { SWIG_exception_fail(SWIG_TypeError, "in method '$symname', " "expecting 'bytes' or 'bytearray'"); } %} %typemap(freearg) const byte[ANY] "" %typemap(in) (const byte *STRING, size_t LENGTH) %{ if ($input == Py_None) { $1 = NULL; $2 = 0; } else if (PyBytes_Check($input)) { char *buf; Py_ssize_t nbytes; if (PyBytes_AsStringAndSize($input, &buf, &nbytes) < 0) SWIG_exception_fail(SWIG_ValueError, "in method '$symname'"); $1 = ($1_ltype)buf; $2 = nbytes; } else if (PyByteArray_Check($input)) { $1 = ($1_ltype)PyByteArray_AsString($input); $2 = PyByteArray_Size($input); #ifdef Py_USING_UNICODE } else if (PyUnicode_Check($input)) { char *buf; Py_ssize_t nbytes; PyObject *obj = PyUnicode_AsUTF8String($input); if (obj == NULL || PyBytes_AsStringAndSize(obj, &buf, &nbytes) < 0) SWIG_exception_fail(SWIG_ValueError, "in method '$symname'"); $1 = ($1_ltype)alloca($2 = nbytes); memcpy($1, buf, $2); Py_DECREF(obj); #endif } else { SWIG_exception_fail(SWIG_TypeError, "in method '$symname', " "expecting 'bytes' or 'bytearray'"); } %} %typemap(freearg) (const byte *STRING, size_t LENGTH) "" %typemap(in) blst::bytes_t %{ if ($input == Py_None) { $1.ptr = NULL; $1.len = 0; } else if (PyBytes_Check($input)) { char *buf; Py_ssize_t nbytes; if (PyBytes_AsStringAndSize($input, &buf, &nbytes) < 0) SWIG_exception_fail(SWIG_ValueError, "in method '$symname'"); $1.ptr = (byte*)buf; $1.len = nbytes; } else if (PyByteArray_Check($input)) { $1.ptr = (byte*)PyByteArray_AsString($input); $1.len = PyByteArray_Size($input); #ifdef Py_USING_UNICODE } else if (PyUnicode_Check($input)) { char *buf; Py_ssize_t nbytes; PyObject *obj = PyUnicode_AsUTF8String($input); if (obj == NULL || PyBytes_AsStringAndSize(obj, &buf, &nbytes) < 0) SWIG_exception_fail(SWIG_ValueError, "in method '$symname'"); auto ptr = 
alloca(nbytes); memcpy(ptr, buf, nbytes); $1.ptr = (byte*)ptr; $1.len = nbytes; Py_DECREF(obj); #endif } else { SWIG_exception_fail(SWIG_TypeError, "in method '$symname', " "expecting 'bytes' or 'bytearray'"); } %} %typemap(freearg) blst::bytes_t "" %typemap(typecheck) blst::bytes_t "" // let users use Python 'int', 'bytes' and 'bytearray' as scalars %typemap(in) (const byte* scalar, size_t nbits) %{ if (PyBytes_Check($input)) { char *scalar; Py_ssize_t nbytes; if (PyBytes_AsStringAndSize($input, &scalar, &nbytes) < 0) SWIG_exception_fail(SWIG_TypeError, "in method '$symname'"); $1 = ($1_ltype)scalar; $2 = 8 * nbytes; } else if (PyByteArray_Check($input)) { $1 = ($1_ltype)PyByteArray_AsString($input); $2 = 8 * PyByteArray_Size($input); } else if (PyLong_Check($input)) { size_t nbytes; $2 = _PyLong_NumBits($input); $1 = ($1_ltype)alloca(nbytes = ($2 + 7)/8); if (PyLong_AsNativeBytes($input, $1, nbytes, My_PYLONG_FLAGS) < 0) SWIG_exception_fail(SWIG_OverflowError, "in method '$symname'"); } else { SWIG_exception_fail(SWIG_TypeError, "in method '$symname', " "expecting 'int', 'bytes' " "or 'bytearray'"); } %} #ifdef __cplusplus %typemap(in) (const POINT* points[], size_t npoints) (std::unique_ptr<$*1_ltype[]> points, size_t _global_npoints) %{ if (PyList_Check($input)) { _global_npoints = PyList_Size($input); points = std::unique_ptr<$*1_ltype[]>(new $*1_ltype[_global_npoints]); PyObject* obj = PyList_GET_ITEM($input, 0); // check the type of the 1st element if (SWIG_ConvertPtr(obj, (void**)&points[0], $*1_descriptor, 0) != SWIG_OK) SWIG_exception_fail(SWIG_TypeError, "in method '$symname', " "expecting 'list' of '$*1_ltype'"); for (size_t i = 1; i < _global_npoints; i++) { obj = PyList_GET_ITEM($input, i); points[i] = ($*1_ltype)SWIG_Python_GetSwigThis(obj)->ptr; } $1 = points.get(); $2 = _global_npoints; } else if (PyBytes_Check($input)) { char *bytes; Py_ssize_t nbytes; if (PyBytes_AsStringAndSize($input, &bytes, &nbytes) < 0) SWIG_exception_fail(SWIG_TypeError, "in method '$symname'"); points = std::unique_ptr<$*1_ltype[]>(new $*1_ltype[2]); points[0] = ($*1_ltype)bytes; points[1] = nullptr; $1 = points.get(); $2 = _global_npoints = nbytes / sizeof(points[0][0]); } else if (PyMemoryView_Check($input)) { // output from to_affine() Py_buffer *buf = PyMemoryView_GET_BUFFER($input); if (!PyBytes_Check(buf->obj)) SWIG_exception_fail(SWIG_TypeError, "in method '$symname', " "expecting 'bytes' in " "'memoryview' for points[]"); points = std::unique_ptr<$*1_ltype[]>(new $*1_ltype[2]); points[0] = ($*1_ltype)buf->buf; points[1] = nullptr; $1 = points.get(); $2 = _global_npoints = buf->len / sizeof(points[0][0]); } else { SWIG_exception_fail(SWIG_TypeError, "in method '$symname', expecting " "'list', 'bytes' or 'memoryview' " "for points[]"); } %} %apply (const POINT* points[], size_t npoints) { (const blst::P1_Affine* const points[], size_t npoints), (const blst::P2_Affine* const points[], size_t npoints), (const blst::P1* const points[], size_t npoints), (const blst::P2* const points[], size_t npoints) } %typemap(in, numinputs=0) POINT points[] (PyObject *obj) "" %typemap(check) POINT points[] { char *bytes; Py_ssize_t size = sizeof($1[0]) * _global_npoints; obj$argnum = PyBytes_FromStringAndSize(NULL, size); if (obj$argnum == NULL) SWIG_fail; PyBytes_AsStringAndSize(obj$argnum, &bytes, &size); $1 = ($1_ltype)bytes; } %typemap(argout) POINT points[] %{ $result = PyMemoryView_FromObject(obj$argnum); if ($result != NULL) { // .itemsize to return size of point, and len() - amount of points 
        PyMemoryView_GET_BUFFER($result)->itemsize = sizeof($1[0]);
        PyMemoryView_GET_BUFFER($result)->shape[0] /= sizeof($1[0]);
    } else {
        Py_DECREF(obj$argnum);
    }
%}
%apply POINT points[] { blst_p1_affine dst[], blst_p2_affine dst[] }

%extend blst::P1_Affines {
    static PyObject* as_memory(blst_p1_affine dst[], const blst::P1* const points[], size_t npoints)
    {   blst_p1s_to_affine(dst, (const blst_p1 *const*)points, npoints);
        return Py_None; // ignored by 'argout' typemap above
    }
}
%extend blst::P2_Affines {
    static PyObject* as_memory(blst_p2_affine dst[], const blst::P2* const points[], size_t npoints)
    {   blst_p2s_to_affine(dst, (const blst_p2 *const*)points, npoints);
        return Py_None; // ignored by 'argout' typemap above
    }
}
%nodefault blst::P1_Affines;
%nodefault blst::P2_Affines;

%typemap(in) (const byte* const scalars[], size_t nbits)
    (std::unique_ptr<byte[]> bytes, byte *scalars[2]) %{
    if (PyList_Check($input)) {
        if ((size_t)PyList_Size($input) != _global_npoints)
            SWIG_exception_fail(SWIG_IndexError, "in method '$symname', 'list' "
                                                 "length mismatch for scalars[]");
        PyObject *obj = PyList_GET_ITEM($input, 0);
        if (PyLong_Check(obj)) {
            $2 = _PyLong_NumBits(obj);
            for (size_t i = 1; i < _global_npoints; i++) {
                size_t nbits;
                int sign;
                obj = PyList_GET_ITEM($input, i);
                if (PyLong_GetSign(obj, &sign) < 0 || sign < 0)
                    SWIG_exception_fail(SWIG_TypeError, "in method '$symname', "
                                                        "expecting all 'long's");
                nbits = _PyLong_NumBits(obj);
                if (nbits > $2) $2 = nbits;
            }
            size_t nbytes = ($2 + 7)/8;
            bytes = std::unique_ptr<byte[]>(new byte[_global_npoints*nbytes]);
            byte* scalar = bytes.get();
            for (size_t i = 0; i < _global_npoints; i++, scalar += nbytes)
                PyLong_AsNativeBytes(PyList_GET_ITEM($input, i), scalar,
                                     nbytes, My_PYLONG_FLAGS);
            scalars[0] = bytes.get();
            scalars[1] = nullptr;
            $1 = scalars;
        } else {
            SWIG_exception_fail(SWIG_TypeError, "in method '$symname', "
                                                "expecting 'list' of 'long's "
                                                "for scalars[]");
        }
    } else if (PyBytes_Check($input)) {
        char *bytes;
        Py_ssize_t nbytes;
        if (PyBytes_AsStringAndSize($input, &bytes, &nbytes) < 0)
            SWIG_exception_fail(SWIG_TypeError, "in method '$symname'");
        scalars[0] = ($*1_ltype)bytes;
        scalars[1] = nullptr;
        $1 = scalars;
        $2 = 8 * (nbytes / _global_npoints);
    } else if (PyByteArray_Check($input)) {
        scalars[0] = ($*1_ltype)PyByteArray_AsString($input);
        scalars[1] = nullptr;
        $1 = scalars;
        $2 = 8 * (PyByteArray_Size($input) / _global_npoints);
    } else if (PyMemoryView_Check($input)) {
        Py_buffer *buf = PyMemoryView_GET_BUFFER($input);
        if (!PyBytes_Check(buf->obj) && !PyByteArray_Check(buf->obj))
            SWIG_exception_fail(SWIG_TypeError, "in method '$symname', "
                                                "expecting 'bytes' in "
                                                "'memoryview' for points[]");
        scalars[0] = ($*1_ltype)buf->buf;
        scalars[1] = nullptr;
        $1 = scalars;
        $2 = 8 * (buf->len / _global_npoints);
    } else {
        SWIG_exception_fail(SWIG_TypeError, "in method '$symname', expecting "
                                            "'list', 'bytes', 'bytearray' "
                                            "or 'memoryview' for scalars[]");
    }
%}

%typemap(out) BLST_ERROR %{
    if ($1 != BLST_SUCCESS) {
        SWIG_exception(SWIG_ValueError, BLST_ERROR_str[$1]);
        SWIG_fail;
    }
    $result = SWIG_From_int($1);
%}

// return |this|
%typemap(out) SELF* OUTPUT %{ (void)$1; Py_INCREF($result = swig_obj[0]); %}

#endif
#elif defined(SWIGJAVA)

%header %{
#ifdef __cplusplus
# define JCALL(func, ...) jenv->func(__VA_ARGS__)
#else
# define JCALL(func, ...) (*jenv)->func(jenv, __VA_ARGS__)
#endif
%}
%include "enums.swg"
%include "arrays_java.i"
%javaconst(1);
#if SWIG_VERSION < 0x040000
%apply (char *STRING, size_t LENGTH) { (const byte *STRING, size_t LENGTH) }
#endif

%pragma(java) jniclassimports=%{
import java.io.*;
import java.nio.file.*;
%}
%pragma(java) jniclasscode=%{
    final static String libName = System.mapLibraryName("$module");
    final static String resName = System.getProperty("os.name").replaceFirst(" .*","") + "/"
                                + System.getProperty("os.arch") + "/"
                                + libName;
    static {
        Class<?> imClazz = $imclassname.class;
        InputStream res = imClazz.getResourceAsStream(
            System.getProperty(imClazz.getPackageName() + ".jniResource", resName));
        if (res == null) {
            try {
                System.loadLibrary("$module");
            } catch (UnsatisfiedLinkError e) {
                String[] cmd = System.getProperty("sun.java.command").split("/");
                if (!"$imclassname".equals(cmd[cmd.length-1]))
                    // suppress exception if 'main' below is executed
                    throw new RuntimeException(e.getMessage());
            }
        } else {
            // unpack shared library into a temporary directory and load it
            try {
                Path tmpdir = Files.createTempDirectory("$module@");
                tmpdir.toFile().deleteOnExit();
                Path tmpdll = Paths.get(tmpdir.toString(), libName);
                tmpdll.toFile().deleteOnExit();
                Files.copy(res, tmpdll, StandardCopyOption.REPLACE_EXISTING);
                res.close();
                System.load(tmpdll.toString());
            } catch (IOException e) {
                throw new RuntimeException(e.getMessage());
            }
        }
    }
    public static void main(String argv[]) {
        System.out.println(resName);
    }
%}

#ifdef __cplusplus
// Extensive sorcery to shift memory management to JVM GC. General idea is
// to use Java long[] as opaque storage for blst data. Methods that return
// new objects allocate suitably sized long[] arrays from JVM heap,
// references to which are then assigned to |swigCPtr| on the Java side.
// And when passed back to JNI, |swigCPtr|s are dereferenced with
// GetLongArrayElements... And no destructors!
%nodefaultdtor;
%typemap(javafinalize) SWIGTYPE ""
%typemap(javadestruct) SWIGTYPE ""

%typemap(javabody) SWIGTYPE %{
  private transient long[] swigCPtr;

  protected $javaclassname(long[] cPtr) { swigCPtr = cPtr; }

  protected static long[] getCPtr($javaclassname obj) {
    return obj != null ? obj.swigCPtr : null;
  }

  public $javaclassname dup() { return new $javaclassname(swigCPtr.clone()); }
%}
%ignore dup;
%typemap(javaconstruct) SWIGTYPE { this($imcall); }
%typemap(jni) SWIGTYPE, SWIGTYPE&, SWIGTYPE* "jlongArray"
%typemap(jtype) SWIGTYPE, SWIGTYPE&, SWIGTYPE* "long[]"
%typemap(javaout) SWIGTYPE, SWIGTYPE&, SWIGTYPE* { return new $javaclassname($jnicall); }

%typemap(in) SWIGTYPE&, SWIGTYPE* %{
    $1 = ($1_ltype)JCALL(GetLongArrayElements, $input, 0);
%}
%typemap(in) const SWIGTYPE&, const SWIGTYPE* %{ $1 = $input ?
($1_ltype)JCALL(GetLongArrayElements, $input, 0) : NULL; %} %typemap(out) SWIGTYPE&, SWIGTYPE* %{ if ($1 != $null) { size_t sz = (sizeof($1_basetype) + sizeof(jlong) - 1)/sizeof(jlong); $result = JCALL(NewLongArray, sz); if ($result != $null) JCALL(SetLongArrayRegion, $result, 0, sz, (const jlong *)$1); } %} %typemap(out) SWIGTYPE { size_t sz = (sizeof($1_basetype) + sizeof(jlong) - 1)/sizeof(jlong); $result = JCALL(NewLongArray, sz); if ($result != $null) JCALL(SetLongArrayRegion, $result, 0, sz, (const jlong *)&$1); } %typemap(newfree) SWIGTYPE* "delete $1;" %typemap(freearg) SWIGTYPE&, SWIGTYPE* %{ JCALL(ReleaseLongArrayElements, $input, (jlong *)$1, 0); %} %typemap(freearg) const SWIGTYPE&, const SWIGTYPE* %{ if ($input) JCALL(ReleaseLongArrayElements, $input, (jlong *)$1, JNI_ABORT); %} %typemap(freearg) const std::string& "" // I wish |jenv| was available in the constructor, so that NewLongArray // could be called at once, without having to resort to matching // %typemap(out)... %extend blst::Pairing { Pairing(bool hash_or_encode, const std::string& DST) { size_t sz = blst_pairing_sizeof(); size_t SZ = (sz + DST.size() + sizeof(jlong) - 1)/sizeof(jlong); blst_pairing *ret = (blst_pairing *)malloc(SZ*sizeof(jlong)); if (DST.size() != 0) { byte *dst = (byte *)ret + sz; memcpy(dst, DST.data(), DST.size()); blst_pairing_init(ret, hash_or_encode, dst, DST.size()); } else { blst_pairing_init(ret, hash_or_encode, NULL, 0); } return (Pairing *)ret; } } %typemap(out) blst::Pairing* { size_t sz = blst_pairing_sizeof(); size_t SZ = (sz + arg2->size() + sizeof(jlong) - 1)/sizeof(jlong); $result = JCALL(NewLongArray, SZ); if ($result != $null) JCALL(SetLongArrayRegion, $result, 0, SZ, (const jlong *)$1); } %typemap(newfree) blst::Pairing* "free($1);" %typemap(javaout) SELF* OUTPUT { $jnicall; return this; } %typemap(out) SELF* OUTPUT "(void)$1;" %typemap(jni) SELF* OUTPUT "void" %typemap(jtype) SELF* OUTPUT "void" #endif %typemap(throws) BLST_ERROR %{ SWIG_JavaThrowException(jenv, SWIG_JavaRuntimeException, BLST_ERROR_str[$1]); %} // handle input const byte[] more efficiently... %apply signed char[] { const byte* } %typemap(in) const byte* %{ $1 = $input ? ($1_ltype)JCALL(GetByteArrayElements, $input, 0) : NULL; %} %typemap(argout) const byte* "" %typemap(freearg) const byte* %{ if ($input) JCALL(ReleaseByteArrayElements, $input, (jbyte *)$1, JNI_ABORT); %} %apply const byte* { const byte[ANY] } %typemap(in) const byte[ANY] { size_t sz = JCALL(GetArrayLength, $input); if (sz != $1_dim0) { SWIG_JavaThrowException(jenv, SWIG_JavaIndexOutOfBoundsException, "BLST_ERROR: input size mismatch"); return $null; } $1 = ($1_ltype)JCALL(GetByteArrayElements, $input, 0); } // let users use 'java.math.BigInteger' as scalars %typemap(in) (const byte* scalar, size_t nbits) %{ $2 = JCALL(GetArrayLength, $input); $1 = ($1_ltype)alloca($2); JCALL(GetByteArrayRegion, $input, 0, $2, (jbyte*)$1); if (*(jbyte*)$1 < 0) { SWIG_JavaThrowException(jenv, SWIG_JavaIllegalArgumentException, "expecting unsigned value"); return $null; } { // BigInteger.toByteArray() emits big-endian, flip the order... 
size_t i, j; for(i=0, j=$2-1; i<$2/2; i++, j--) { $*1_ltype t=$1[i]; $1[i]=$1[j]; $1[j]=t; } } if ($1[$2-1] == 0) $2--; $2 *= 8; %} %typemap(jni) (const byte* scalar, size_t nbits) "jbyteArray" %typemap(jtype) (const byte* scalar, size_t nbits) "byte[]" %typemap(jstype) (const byte* scalar, size_t nbits) "java.math.BigInteger" %typemap(javain) (const byte* scalar, size_t nbits) "$javainput.toByteArray()" %typemap(jni) (const byte *STRING, size_t LENGTH) "jbyteArray" %typemap(jtype) (const byte *STRING, size_t LENGTH) "byte[]" %typemap(jstype) (const byte *STRING, size_t LENGTH) "byte[]" %typemap(javain) (const byte *STRING, size_t LENGTH) "$javainput" %typemap(freearg)(const byte *STRING, size_t LENGTH) "" %typemap(jni) blst::bytes_t "jbyteArray" %typemap(jtype) blst::bytes_t "byte[]" %typemap(jstype) blst::bytes_t "byte[]" %typemap(javain) blst::bytes_t "$javainput" %typemap(freearg)blst::bytes_t "" %typemap(in) blst::bytes_t %{ $1.ptr = (const byte*)JCALL(GetByteArrayElements, $input, 0); $1.len = JCALL(GetArrayLength, $input); %} %typemap(argout) blst::bytes_t %{ JCALL(ReleaseByteArrayElements, $input, (jbyte *)$1.ptr, JNI_ABORT); %}
#elif defined(SWIGJAVASCRIPT) && defined(SWIG_JAVASCRIPT_V8) %header %{ #if V8_MAJOR_VERSION >= 8 # define GetData() GetBackingStore()->Data() #else # define GetData() GetContents().Data() #endif %} %typemap(throws) BLST_ERROR %{ SWIG_V8_Raise(BLST_ERROR_str[$1]); SWIG_fail; %} %typemap(in) const byte* %{ if ($input->IsArrayBufferView()) { auto av = v8::Local<v8::ArrayBufferView>::Cast($input); auto buf = av->Buffer(); $1 = ($1_ltype)buf->GetData() + av->ByteOffset(); } else if ($input->IsNull()) { $1 = nullptr; } else { SWIG_exception_fail(SWIG_TypeError, "in method '$symname', " "expecting <ArrayBufferView>"); } %} %typemap(argout) const byte* "" %typemap(freearg) const byte* "" %apply const byte* { const byte[ANY] } %typemap(in) const byte[ANY] %{ if ($input->IsArrayBufferView()) { auto av = v8::Local<v8::ArrayBufferView>::Cast($input); if (av->ByteLength() != $1_dim0) SWIG_exception_fail(SWIG_IndexError, "in method '$symname', " "expecting $1_dim0 bytes"); auto buf = av->Buffer(); $1 = ($1_ltype)buf->GetData() + av->ByteOffset(); } else { SWIG_exception_fail(SWIG_TypeError, "in method '$symname', " "expecting <ArrayBufferView>"); } %} // let users use JavaScript <ArrayBufferView> and <BigInt> as scalars %typemap(in) (const byte* scalar, size_t nbits) %{ if ($input->IsArrayBufferView()) { auto av = v8::Local<v8::ArrayBufferView>::Cast($input); auto buf = av->Buffer(); $1 = ($1_ltype)buf->GetData() + av->ByteOffset(); $2 = 8*av->ByteLength(); #if V8_MAJOR_VERSION >=6 && V8_MINOR_VERSION >= 8 } else if ($input->IsBigInt()) { auto bi = v8::Local<v8::BigInt>::Cast($input); int sign, word_count = bi->WordCount(); uint64_t* words = (uint64_t*)alloca($2 = word_count*sizeof(uint64_t)); bi->ToWordsArray(&sign, &word_count, words); if (sign) SWIG_exception_fail(SWIG_TypeError, "in method '$symname', " "expecting unsigned value"); $1 = ($1_ltype)words; $2 *= 8; const union { long one; char little; } is_endian = { 1 }; if (!is_endian.little) { byte* p = $1; for (int i = 0; i < word_count; i++) { uint64_t val = words[i]; for (size_t j = 0; j < sizeof(val); j++, val >>= 8) *p++ = (byte)val; } } #endif } else { SWIG_exception_fail(SWIG_TypeError, "in method '$symname', " "expecting <ArrayBufferView> or <BigInt>"); } %} %typemap(in) (const byte *STRING, size_t LENGTH) %{ if ($input->IsArrayBufferView()) { auto av = v8::Local<v8::ArrayBufferView>::Cast($input); auto buf = av->Buffer(); $1 = ($1_ltype)buf->GetData() + av->ByteOffset(); $2 = av->ByteLength(); } else if ($input->IsString()) { auto str = v8::Local<v8::String>::Cast($input); $2 = SWIGV8_UTF8_LENGTH(str); $1 = ($1_ltype)alloca($2); SWIGV8_WRITE_UTF8(str, (char *)$1, $2); } else if ($input->IsNull()) { $1 = nullptr; $2 = 0; } else { SWIG_exception_fail(SWIG_TypeError, "in method '$symname', " "expecting <ArrayBufferView> or <String>"); } %} %typemap(freearg) (const byte *STRING, size_t LENGTH) "" %typemap(in) blst::bytes_t %{ if ($input->IsArrayBufferView()) { auto av = v8::Local<v8::ArrayBufferView>::Cast($input); auto buf = av->Buffer(); $1.ptr = (byte*)buf->GetData() + av->ByteOffset(); $1.len = av->ByteLength(); } else if ($input->IsString()) { auto str = v8::Local<v8::String>::Cast($input); $1.len = SWIGV8_UTF8_LENGTH(str); $1.ptr = (byte*)alloca($1.len); SWIGV8_WRITE_UTF8(str, (char *)$1.ptr, $1.len); } else if ($input->IsNull()) { $1.ptr = nullptr; $1.len = 0; } else { SWIG_exception_fail(SWIG_TypeError, "in method '$symname', " "expecting <ArrayBufferView> or <String>"); } %} %typemap(freearg) blst::bytes_t "" // return |this| %typemap(out) SELF* OUTPUT %{ (void)$1; $result = args.Holder(); %}
#elif defined(SWIGPERL) // let users use byte[] as scalars %apply (const char *STRING, size_t LENGTH) { (const byte* scalar, size_t nbits) } %typemap(check) (const byte* scalar, size_t nbits) %{ $2 *= 8; %} #ifdef __cplusplus // return |this| %typemap(out) SELF* OUTPUT %{ (void)$1; argvi++; %} #endif #endif // SWIG // everybody has a way to bundle pointer and buffer size, but C:-( %apply (const byte *STRING, size_t LENGTH) { (const byte *msg, size_t msg_len), (const byte *DST, size_t DST_len), (const byte *aug, size_t aug_len), (const byte *IKM, size_t IKM_len), (const byte *info, size_t info_len), (const byte *salt, size_t salt_len), (const byte *in, size_t len) } // some sorcery to return byte[] from serialization methods %typemap(in, numinputs=0) byte out[ANY] (byte temp[$1_dim0]) %{ $1 = temp; %} %typemap(argout) byte out[ANY] { #if defined(SWIGPYTHON) PyObject *obj = SWIG_FromCharPtrAndSize((char *)$1, $1_dim0); $result = SWIG_AppendOutput($result, obj); #elif defined(SWIGJAVA) $result = JCALL(NewByteArray, $1_dim0); if ($result != $null) { JCALL(SetByteArrayRegion, $result, 0, $1_dim0, (const jbyte *)$1); } #elif defined(SWIGJAVASCRIPT) && defined(SWIG_JAVASCRIPT_V8) auto ab = v8::ArrayBuffer::New(v8::Isolate::GetCurrent(), $1_dim0); memcpy(ab->GetData(), $1, $1_dim0); $result = v8::Uint8Array::New(ab, 0, $1_dim0); #elif defined(SWIGPERL) $result = SWIG_FromCharPtrAndSize((char *)$1, $1_dim0); argvi++; #else // TODO: figure out more language-specific ways to return multi-values...
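// (default fallback: only hand back the serialized bytes when the
// wrapped call produced no result of its own, instead of appending
// them to a multi-value return as the branches above do)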
if ($result == NULL) $result = SWIG_FromCharPtrAndSize((char *)$1, $1_dim0); #endif } %typemap(freearg) byte out[ANY] "" #ifdef SWIGJAVA %typemap(jni) byte out[ANY] "jbyteArray" %typemap(jtype) byte out[ANY] "byte[]" %typemap(jstype) byte out[ANY] "byte[]" %typemap(javaout) byte out[ANY] { return $jnicall; } #endif %apply byte out[ANY] { void to_bendian, void blst_bendian_from_scalar, void to_lendian, void blst_lendian_from_scalar, void serialize, void blst_p1_serialize, void blst_p1_affine_serialize, void blst_p2_serialize, void blst_p2_affine_serialize, void compress, void blst_p1_compress, void blst_p1_affine_compress, void blst_p2_compress, void blst_p2_affine_compress, void blst_sk_to_pk2_in_g1, void blst_sign_pk2_in_g1, void blst_sk_to_pk2_in_g2, void blst_sign_pk2_in_g2 } #ifdef __cplusplus %apply const std::string& { const std::string* } #pragma SWIG nowarn=509,516 #if !defined(SWIGPYTHON) %ignore P1_Affines; %ignore P2_Affines; #endif %ignore nullptr; %ignore None; %ignore C_bytes; %ignore bytes_t; %feature("novaluewrapper") bytes_t; %catches(BLST_ERROR) P1(const byte* in, size_t len); %catches(BLST_ERROR) P1_Affine(const byte* in, size_t len); %catches(BLST_ERROR) aggregate(const P1_Affine& in); %catches(BLST_ERROR) P2(const byte* in, size_t len); %catches(BLST_ERROR) P2_Affine(const byte* in, size_t len); %catches(BLST_ERROR) aggregate(const P2_Affine& in); %catches(BLST_ERROR) blst::Scalar::add; %catches(BLST_ERROR) blst::Scalar::sub; %catches(BLST_ERROR) blst::Scalar::mul; // methods returning |this| %apply SELF* OUTPUT { blst::P1* sign_with, blst::P2* sign_with, blst::P1* hash_to, blst::P2* hash_to, blst::P1* encode_to, blst::P2* encode_to, blst::P1* mult, blst::P2* mult, blst::P1* cneg, blst::P2* cneg, blst::P1* neg, blst::P2* neg, blst::P1* add, blst::P2* add, blst::P1* dbl, blst::P2* dbl, blst::PT* mul, blst::PT* sqr, blst::PT* final_exp, blst::Scalar* from_bendian, blst::Scalar* from_lendian, blst::Scalar* add, blst::Scalar* sub, blst::Scalar* mul, blst::Scalar* inverse } typedef enum { BLST_SUCCESS = 0, BLST_BAD_ENCODING, BLST_POINT_NOT_ON_CURVE, BLST_POINT_NOT_IN_GROUP, BLST_AGGR_TYPE_MISMATCH, BLST_VERIFY_FAIL, BLST_PK_IS_INFINITY, } BLST_ERROR; %include "blst.hpp" extern const blst::P1_Affine BLS12_381_G1; extern const blst::P1_Affine BLS12_381_NEG_G1; extern const blst::P2_Affine BLS12_381_G2; extern const blst::P2_Affine BLS12_381_NEG_G2; #else %ignore blst_fr; %ignore blst_fp; %ignore blst_fp2; %ignore blst_fp6; %ignore blst_scalar_from_uint32; %ignore blst_scalar_from_uint64; %ignore blst_uint32_from_scalar; %ignore blst_uint64_from_scalar; %ignore blst_pairing_init; %ignore blst_pairing_get_dst; %include "blst.h" %include "blst_aux.h" %extend blst_pairing { blst_pairing(bool hash_or_encode, const byte *DST DEFNULL, size_t DST_len DEFNULL) { void *ret = malloc(blst_pairing_sizeof()); if (DST_len != 0) { void *dst = malloc(DST_len); memcpy(dst, DST, DST_len); blst_pairing_init(ret, hash_or_encode, dst, DST_len); } else { blst_pairing_init(ret, hash_or_encode, NULL, 0); } return ret; } ~blst_pairing() { void *dst = (void *)blst_pairing_get_dst($self); if (dst != NULL) free(dst); free($self); } } #endif %begin %{ #ifdef __cplusplus # include <string.h> # include "blst.hpp" using namespace blst; #else # include "blst.h" #endif static const char *const BLST_ERROR_str [] = { "BLST_ERROR: success", "BLST_ERROR: bad point encoding", "BLST_ERROR: point is not on curve", "BLST_ERROR: point is not in group", "BLST_ERROR: context type mismatch", "BLST_ERROR: verify failed", "BLST_ERROR: public key is infinite", }; #define SWIG_PYTHON_STRICT_BYTE_CHAR #if defined(__GNUC__) # ifndef alloca # define alloca(s) __builtin_alloca(s) # endif #elif defined(__sun) # include <alloca.h> #elif defined(_WIN32) # include <malloc.h> # ifndef alloca # define alloca(s) _alloca(s) # endif #endif %} #if defined(SWIGPYTHON) || defined(SWIGPERL) %include "cdata.i" #endif #if SWIG_VERSION < 0x040100 && defined(SWIGJAVASCRIPT) %wrapper %{ #ifdef NODE_MODULE # undef NODE_MODULE # define NODE_MODULE NODE_MODULE_CONTEXT_AWARE // actually error-prone and not exactly suitable for production, but // sufficient for development purposes till SWIG 4.1.0 is released... #endif %} #endif #if SWIG_VERSION < 0x040100 && defined(SWIGJAVA) /* SWIG versions prior to 4.1 were crossing the MinGW's ways on the path * to JNI 'jlong' type */ %begin %{ #if defined(__MINGW32__) && defined(__int64) # undef __int64 #endif %} #endif
================================================ FILE: bindings/blst_aux.h ================================================ /* * Copyright Supranational LLC * Licensed under the Apache License, Version 2.0, see LICENSE for details. * SPDX-License-Identifier: Apache-2.0 */ #ifndef __BLST_AUX_H__ #define __BLST_AUX_H__ /* * This file lists interfaces that might be promoted to blst.h or removed, * depending on their proven/unproven worthiness. */ void blst_fr_ct_bfly(blst_fr *x0, blst_fr *x1, const blst_fr *twiddle); void blst_fr_gs_bfly(blst_fr *x0, blst_fr *x1, const blst_fr *twiddle); void blst_fr_to(blst_fr *ret, const blst_fr *a); void blst_fr_from(blst_fr *ret, const blst_fr *a); #ifdef BLST_FR_PENTAROOT void blst_fr_pentaroot(blst_fr *ret, const blst_fr *a); void blst_fr_pentapow(blst_fr *ret, const blst_fr *a); #endif void blst_fp_to(blst_fp *ret, const blst_fp *a); void blst_fp_from(blst_fp *ret, const blst_fp *a); bool blst_fp_is_square(const blst_fp *a); bool blst_fp2_is_square(const blst_fp2 *a); void blst_p1_from_jacobian(blst_p1 *out, const blst_p1 *in); void blst_p2_from_jacobian(blst_p2 *out, const blst_p2 *in); /* * Below functions produce both point and deserialized outcome of * SkToPk and Sign. However, deserialized outputs are pre-decorated * with sign and infinity bits. This means that you have to bring the * output into compliance prior to returning to application. If you want * compressed point value, then do [equivalent of] * * byte temp[96]; * blst_sk_to_pk2_in_g1(temp, out_pk, SK); * temp[0] |= 0x80; * memcpy(out, temp, 48); * * Otherwise do * * blst_sk_to_pk2_in_g1(out, out_pk, SK); * out[0] &= ~0x20; * * Either |out| or |out_<point>| can be NULL.
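 * (for reference: in the standard BLS12-381 serialization that blst
 * follows, 0x80 in the leading byte is the compression flag and 0x20
 * is the sign bit that is only meaningful together with compression,
 * hence the masking in the two recipes above)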
*/ void blst_sk_to_pk2_in_g1(byte out[96], blst_p1_affine *out_pk, const blst_scalar *SK); void blst_sign_pk2_in_g1(byte out[192], blst_p2_affine *out_sig, const blst_p2 *hash, const blst_scalar *SK); void blst_sk_to_pk2_in_g2(byte out[192], blst_p2_affine *out_pk, const blst_scalar *SK); void blst_sign_pk2_in_g2(byte out[96], blst_p1_affine *out_sig, const blst_p1 *hash, const blst_scalar *SK); #ifdef __BLST_RUST_BINDGEN__ typedef struct {} blst_uniq; #else typedef struct blst_opaque blst_uniq; #endif size_t blst_uniq_sizeof(size_t n_nodes); void blst_uniq_init(blst_uniq *tree); bool blst_uniq_test(blst_uniq *tree, const byte *msg, size_t len); #ifdef expand_message_xmd void expand_message_xmd(unsigned char *bytes, size_t len_in_bytes, const unsigned char *aug, size_t aug_len, const unsigned char *msg, size_t msg_len, const unsigned char *DST, size_t DST_len); #else void blst_expand_message_xmd(byte *out, size_t out_len, const byte *msg, size_t msg_len, const byte *DST, size_t DST_len); #endif void blst_p1_unchecked_mult(blst_p1 *out, const blst_p1 *p, const byte *scalar, size_t nbits); void blst_p2_unchecked_mult(blst_p2 *out, const blst_p2 *p, const byte *scalar, size_t nbits); void blst_pairing_raw_aggregate(blst_pairing *ctx, const blst_p2_affine *q, const blst_p1_affine *p); blst_fp12 *blst_pairing_as_fp12(blst_pairing *ctx); void blst_bendian_from_fp12(byte out[48*12], const blst_fp12 *a); void blst_keygen_v3(blst_scalar *out_SK, const byte *IKM, size_t IKM_len, const byte *info DEFNULL, size_t info_len DEFNULL); void blst_keygen_v4_5(blst_scalar *out_SK, const byte *IKM, size_t IKM_len, const byte *salt, size_t salt_len, const byte *info DEFNULL, size_t info_len DEFNULL); void blst_keygen_v5(blst_scalar *out_SK, const byte *IKM, size_t IKM_len, const byte *salt, size_t salt_len, const byte *info DEFNULL, size_t info_len DEFNULL); void blst_derive_master_eip2333(blst_scalar *out_SK, const byte *IKM, size_t IKM_len); void blst_derive_child_eip2333(blst_scalar *out_SK, const blst_scalar *SK, uint32_t child_index); void blst_scalar_from_hexascii(blst_scalar *out, const byte *hex); void blst_fr_from_hexascii(blst_fr *ret, const byte *hex); void blst_fp_from_hexascii(blst_fp *ret, const byte *hex); size_t blst_p1_sizeof(void); size_t blst_p1_affine_sizeof(void); size_t blst_p2_sizeof(void); size_t blst_p2_affine_sizeof(void); size_t blst_fp12_sizeof(void); void blst_fp_from_le_bytes(blst_fp *ret, const byte *in, size_t len); void blst_fp_from_be_bytes(blst_fp *ret, const byte *in, size_t len); /* * Single-shot SHA-256 hash function. */ void blst_sha256(byte out[32], const byte *msg, size_t msg_len); #endif ================================================ FILE: bindings/c#/poc.cs ================================================ using System; using System.Text; using supranational; class PoC { private static void Main(string[] args) { var msg = Encoding.UTF8.GetBytes("assertion"); var DST = "MY-DST"; var SK = new blst.SecretKey(); SK.keygen(Encoding.UTF8.GetBytes(new string('*', 32))); // generate public key and serialize it... var pk_for_wire = new blst.P1(SK).serialize(); // sign |msg| and serialize the signature... var sig_for_wire = new blst.P2().hash_to(msg, DST, pk_for_wire) .sign_with(SK) .serialize(); // now on "receiving" side, start with deserialization... 
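// (note that |pk_for_wire| does double duty below: it is both
// deserialized into the verification key and passed again as the
// augmentation argument, mirroring the hash_to(msg, DST, pk_for_wire)
// call on the signing side)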
var _sig = new blst.P2_Affine(sig_for_wire); var _pk = new blst.P1_Affine(pk_for_wire); if (!_pk.in_group()) throw new blst.Exception(blst.ERROR.POINT_NOT_IN_GROUP); var ctx = new blst.Pairing(true, DST); var err = ctx.aggregate(_pk, _sig, msg, pk_for_wire); if (err != blst.ERROR.SUCCESS) throw new blst.Exception(err); ctx.commit(); if (!ctx.finalverify()) throw new blst.Exception(blst.ERROR.VERIFY_FAIL); Console.WriteLine("OK"); // exercise .as_fp12 by performing equivalent of ctx.finalverify above var C1 = new blst.PT(_sig); var C2 = ctx.as_fp12(); if (!blst.PT.finalverify(C1, C2)) throw new blst.Exception(blst.ERROR.VERIFY_FAIL); // test integers as scalar multiplicands var p = blst.G1(); var q = p.dup().dbl().dbl().add(p); if (!p.mult(5).is_equal(q)) throw new ApplicationException("disaster"); if (!blst.G1().mult(-5).is_equal(q.neg())) throw new ApplicationException("disaster"); // low-order sanity check var p11 = new blst.P1(fromHexString("80803f0d09fec09a95f2ee7495323c15c162270c7cceaffa8566e941c66bcf206e72955d58b3b32e564de3209d672ca5")); if (p11.in_group()) throw new ApplicationException("disaster"); if (!p11.mult(11).is_inf()) throw new ApplicationException("disaster"); } private static int fromHexChar(char c) { if (c>='0' && c<='9') return c - '0'; else if (c>='a' && c<='f') return c - 'a' + 10; else if (c>='A' && c<='F') return c - 'A' + 10; throw new ArgumentOutOfRangeException("non-hex character"); } private static byte[] fromHexString(string str) { if (str.Length%2 != 0) throw new ArgumentException("odd number of characters in hex string"); char[] hex = str.ToCharArray(); byte[] ret = new byte[hex.Length/2]; for (int i=0; i<hex.Length; i+=2) ret[i/2] = (byte)(fromHexChar(hex[i])<<4 | fromHexChar(hex[i+1])); return ret; } }
================================================ FILE: bindings/c#/poc.csproj ================================================ <Project Sdk="Microsoft.NET.Sdk"> <PropertyGroup> <OutputType>Exe</OutputType> <TargetFramework>net8.0</TargetFramework> <NoWarn>CS8981</NoWarn> </PropertyGroup> </Project>
================================================ FILE: bindings/c#/run.me ================================================ #!/usr/bin/env python3 # Copyright Supranational LLC # Licensed under the Apache License, Version 2.0, see LICENSE for details. # SPDX-License-Identifier: Apache-2.0 import os import re import sys import glob import subprocess top = """ using System; using System.Text; using System.Numerics; using System.Runtime.InteropServices; using size_t = System.UIntPtr; #if NET5_0_OR_GREATER using System.Runtime.Loader; using System.Reflection; using System.IO; #endif namespace supranational { public static class blst { #if NET5_0_OR_GREATER private static readonly string dll; static blst() { if (String.IsNullOrEmpty(dll)) { var name = RuntimeInformation.IsOSPlatform(OSPlatform.Windows) ? "blst.dll" : RuntimeInformation.IsOSPlatform(OSPlatform.OSX) ? "libblst.dll.dylib" : "libblst.dll.so"; var dir = Path.GetDirectoryName(Assembly.GetExecutingAssembly().Location); var arch = RuntimeInformation.ProcessArchitecture switch { Architecture.X64 => "x64", Architecture.Arm64 => "arm64", _ => "unsupported" }; #if NET8_0_OR_GREATER // RuntimeInformation.RuntimeIdentifier changed between .NET 7 and 8 // and only aligns to the nuget layout in 8+ var rid = RuntimeInformation.RuntimeIdentifier; #else // Mimic pre-8 RuntimeInformation.RuntimeIdentifier as // "win-x64", "linux-x64", "linux-arm64", "osx-x64", etc. var os = RuntimeInformation.IsOSPlatform(OSPlatform.Windows) ? "win" : RuntimeInformation.IsOSPlatform(OSPlatform.OSX) ? "osx" : RuntimeInformation.IsOSPlatform(OSPlatform.FreeBSD) ?
"freebsd" : "linux"; var rid = $"{os}-{arch}"; #endif // first look for the file in the standard locations for a nuget installed native lib dll = Path.Combine(dir, "runtimes", rid, "native", name); if (!File.Exists(dll)) dll = Path.Combine(dir, arch, name); // try the original non-standard location if (!File.Exists(dll)) dll = Path.Combine(Environment.CurrentDirectory, name); if (File.Exists(dll)) { AssemblyLoadContext.Default.ResolvingUnmanagedDll += (asm, needs) => (needs == "blst.dll" ? NativeLibrary.Load(dll) : IntPtr.Zero); } } } #endif public enum ERROR { SUCCESS = 0, BAD_ENCODING, POINT_NOT_ON_CURVE, POINT_NOT_IN_GROUP, AGGR_TYPE_MISMATCH, VERIFY_FAIL, PK_IS_INFINITY, BAD_SCALAR, } public class Exception : ApplicationException { private readonly ERROR code; public Exception(ERROR err) { code = err; } public override string Message { get { switch(code) { case ERROR.BAD_ENCODING: return "bad encoding"; case ERROR.POINT_NOT_ON_CURVE: return "point not on curve"; case ERROR.POINT_NOT_IN_GROUP: return "point not in group"; case ERROR.AGGR_TYPE_MISMATCH: return "aggregate type mismatch"; case ERROR.VERIFY_FAIL: return "verify failure"; case ERROR.PK_IS_INFINITY: return "public key is infinity"; case ERROR.BAD_SCALAR: return "bad scalar"; default: return null; } } } } public enum ByteOrder { BigEndian, LittleEndian, } [DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] static extern void blst_keygen([Out] byte[] key, [In] byte[] IKM, size_t IKM_len, [In] byte[] info, size_t info_len); [DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] static extern void blst_keygen_v3([Out] byte[] key, [In] byte[] IKM, size_t IKM_len, [In] byte[] info, size_t info_len); [DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] static extern void blst_keygen_v4_5([Out] byte[] key, [In] byte[] IKM, size_t IKM_len, [In] byte[] salt, size_t salt_len, [In] byte[] info, size_t info_len); [DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] static extern void blst_keygen_v5([Out] byte[] key, [In] byte[] IKM, size_t IKM_len, [In] byte[] salt, size_t salt_len, [In] byte[] info, size_t info_len); [DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] static extern void blst_derive_master_eip2333([Out] byte[] key, [In] byte[] IKM, size_t IKM_len); [DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] static extern void blst_derive_child_eip2333([Out] byte[] key, [In] byte[] master, uint child_index); [DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] static extern void blst_scalar_from_bendian([Out] byte[] ret, [In] byte[] key); [DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] static extern void blst_bendian_from_scalar([Out] byte[] ret, [In] byte[] key); [DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] static extern bool blst_sk_check([In] byte[] key); [DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] static extern void blst_scalar_from_lendian([Out] byte[] key, [In] byte[] inp); [DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] static extern void blst_lendian_from_scalar([Out] byte[] key, [In] byte[] inp); public struct SecretKey { internal byte[] key; //public SecretKey() { key = new byte[32]; } public SecretKey(byte[] IKM, string info) { key = new byte[32]; keygen(IKM, info); } public SecretKey(byte[] inp, ByteOrder order=ByteOrder.BigEndian) { key = new byte[32]; switch(order) { case ByteOrder.BigEndian: from_bendian(inp); 
break; case ByteOrder.LittleEndian: from_lendian(inp); break; } } public void keygen(byte[] IKM, string info="") { if (key == null) key = new byte[32]; byte[] info_bytes = Encoding.UTF8.GetBytes(info); blst_keygen(key, IKM, (size_t)IKM.Length, info_bytes, (size_t)info_bytes.Length); } public void keygen_v3(byte[] IKM, string info="") { if (key == null) key = new byte[32]; byte[] info_bytes = Encoding.UTF8.GetBytes(info); blst_keygen_v3(key, IKM, (size_t)IKM.Length, info_bytes, (size_t)info_bytes.Length); } public void keygen_v4_5(byte[] IKM, string salt, string info="") { if (key == null) key = new byte[32]; byte[] salt_bytes = Encoding.UTF8.GetBytes(salt); byte[] info_bytes = Encoding.UTF8.GetBytes(info); blst_keygen_v4_5(key, IKM, (size_t)IKM.Length, salt_bytes, (size_t)salt_bytes.Length, info_bytes, (size_t)info_bytes.Length); } public void keygen_v5(byte[] IKM, byte[] salt, string info="") { if (key == null) key = new byte[32]; byte[] info_bytes = Encoding.UTF8.GetBytes(info); blst_keygen_v5(key, IKM, (size_t)IKM.Length, salt, (size_t)salt.Length, info_bytes, (size_t)info_bytes.Length); } public void keygen_v5(byte[] IKM, string salt, string info="") { keygen_v5(IKM, Encoding.UTF8.GetBytes(salt), info); } public void derive_master_eip2333(byte[] IKM) { if (key == null) key = new byte[32]; blst_derive_master_eip2333(key, IKM, (size_t)IKM.Length); } public SecretKey(SecretKey master, uint child_index) { key = new byte[32]; blst_derive_child_eip2333(key, master.key, child_index); } public void from_bendian(byte[] inp) { if (inp.Length != 32) throw new Exception(ERROR.BAD_ENCODING); if (key == null) key = new byte[32]; blst_scalar_from_bendian(key, inp); if (!blst_sk_check(key)) throw new Exception(ERROR.BAD_ENCODING); } public void from_lendian(byte[] inp) { if (inp.Length != 32) throw new Exception(ERROR.BAD_ENCODING); if (key == null) key = new byte[32]; blst_scalar_from_lendian(key, inp); if (!blst_sk_check(key)) throw new Exception(ERROR.BAD_ENCODING); } public byte[] to_bendian() { byte[] ret = new byte[32]; blst_bendian_from_scalar(ret, key); return ret; } public byte[] to_lendian() { byte[] ret = new byte[32]; blst_lendian_from_scalar(ret, key); return ret; } } [DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] static extern void blst_scalar_from_be_bytes([Out] byte[] ret, [In] byte[] inp, size_t inp_len); [DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] static extern void blst_scalar_from_le_bytes([Out] byte[] ret, [In] byte[] inp, size_t inp_len); [DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] static extern bool blst_sk_add_n_check([Out] byte[] ret, [In] byte[] a, [In] byte[] b); [DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] static extern bool blst_sk_sub_n_check([Out] byte[] ret, [In] byte[] a, [In] byte[] b); [DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] static extern bool blst_sk_mul_n_check([Out] byte[] ret, [In] byte[] a, [In] byte[] b); [DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] static extern void blst_sk_inverse([Out] byte[] ret, [In] byte[] a); public struct Scalar { internal byte[] val; //public Scalar() { val = new byte[32]; } public Scalar(byte[] inp, ByteOrder order=ByteOrder.BigEndian) { val = new byte[32]; switch(order) { case ByteOrder.BigEndian: from_bendian(inp); break; case ByteOrder.LittleEndian: from_lendian(inp); break; } } private Scalar(bool _) { val = new byte[32]; } private Scalar(Scalar orig) { val = 
(byte[])orig.val.Clone(); } public Scalar dup() { return new Scalar(this); } public void from_bendian(byte[] inp) { if (val == null) val = new byte[32]; blst_scalar_from_be_bytes(val, inp, (size_t)inp.Length); } public void from_lendian(byte[] inp) { if (val == null) val = new byte[32]; blst_scalar_from_le_bytes(val, inp, (size_t)inp.Length); } public byte[] to_bendian() { byte[] ret = new byte[32]; blst_bendian_from_scalar(ret, val); return ret; } public byte[] to_lendian() { byte[] ret = new byte[32]; blst_lendian_from_scalar(ret, val); return ret; } public Scalar add(SecretKey a) { if (!blst_sk_add_n_check(val, val, a.key)) throw new Exception(ERROR.BAD_SCALAR); return this; } public Scalar add(Scalar a) { if (!blst_sk_add_n_check(val, val, a.val)) throw new Exception(ERROR.BAD_SCALAR); return this; } public Scalar sub(Scalar a) { if (!blst_sk_sub_n_check(val, val, a.val)) throw new Exception(ERROR.BAD_SCALAR); return this; } public Scalar mul(Scalar a) { if (!blst_sk_mul_n_check(val, val, a.val)) throw new Exception(ERROR.BAD_SCALAR); return this; } public Scalar inverse() { blst_sk_inverse(val, val); return this; } public static Scalar operator+(Scalar a, Scalar b) { return a.dup().add(b); } public static Scalar operator-(Scalar a, Scalar b) { return a.dup().sub(b); } public static Scalar operator*(Scalar a, Scalar b) { return a.dup().mul(b); } public static Scalar operator/(Scalar a, Scalar b) { return b.dup().inverse().mul(a); } } private const int P1_COMPRESSED_SZ = 384/8; private const int P2_COMPRESSED_SZ = 2*P1_COMPRESSED_SZ; """ middle = """ [DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] static extern size_t blst_p1_affine_sizeof(); [DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] static extern ERROR blst_p1_deserialize([Out] long[] ret, [In] byte[] inp); [DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] static extern void blst_p1_affine_serialize([Out] byte[] ret, [In] long[] inp); [DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] static extern void blst_p1_affine_compress([Out] byte[] ret, [In] long[] inp); [DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] static extern void blst_p1_to_affine([Out] long[] ret, [In] long[] inp); [DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] static extern bool blst_p1_affine_on_curve([In] long[] point); [DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] static extern bool blst_p1_affine_in_g1([In] long[] point); [DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] static extern bool blst_p1_affine_is_inf([In] long[] point); [DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] static extern bool blst_p1_affine_is_equal([In] long[] a, [In] long[] b); [DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] static extern IntPtr blst_p1_generator(); [DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] static extern ERROR blst_core_verify_pk_in_g2([In] long[] pk, [In] long[] sig, bool hash_or_encode, [In] byte[] msg, size_t msg_len, [In] byte[] dst, size_t dst_len, [In] byte[] aug, size_t aug_len); public struct P1_Affine { internal readonly long[] point; private static readonly int sz = (int)blst_p1_affine_sizeof()/sizeof(long); //public P1_Affine() { point = new long[sz]; } private P1_Affine(bool _) { point = new long[sz]; } private P1_Affine(P1_Affine p) { point = (long[])p.point.Clone(); } public P1_Affine(byte[] inp) : this(true) { int len = 
inp.Length; if (len == 0 || len != ((inp[0]&0x80) == 0x80 ? P1_COMPRESSED_SZ : 2*P1_COMPRESSED_SZ)) throw new Exception(ERROR.BAD_ENCODING); ERROR err = blst_p1_deserialize(point, inp); if (err != ERROR.SUCCESS) throw new Exception(err); } public P1_Affine(P1 jacobian) : this(true) { blst_p1_to_affine(point, jacobian.point); } public P1_Affine dup() { return new P1_Affine(this); } public P1 to_jacobian() { return new P1(this); } public byte[] serialize() { byte[] ret = new byte[2*P1_COMPRESSED_SZ]; blst_p1_affine_serialize(ret, point); return ret; } public byte[] compress() { byte[] ret = new byte[P1_COMPRESSED_SZ]; blst_p1_affine_compress(ret, point); return ret; } public bool on_curve() { return blst_p1_affine_on_curve(point); } public bool in_group() { return blst_p1_affine_in_g1(point); } public bool is_inf() { return blst_p1_affine_is_inf(point); } public bool is_equal(P1_Affine p) { return blst_p1_affine_is_equal(point, p.point); } ERROR core_verify(P2_Affine pk, bool hash_or_encode, byte[] msg, string DST = "", byte[] aug = null) { byte[] dst = Encoding.UTF8.GetBytes(DST); return blst_core_verify_pk_in_g2(pk.point, point, hash_or_encode, msg, (size_t)msg.Length, dst, (size_t)dst.Length, aug, (size_t)(aug!=null ? aug.Length : 0)); } public static P1_Affine generator() { var ret = new P1_Affine(true); Marshal.Copy(blst_p1_generator(), ret.point, 0, ret.point.Length); return ret; } } [DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] static extern size_t blst_p1_sizeof(); [DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] static extern void blst_p1_serialize([Out] byte[] ret, [In] long[] inp); [DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] static extern void blst_p1_compress([Out] byte[] ret, [In] long[] inp); [DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] static extern void blst_p1_from_affine([Out] long[] ret, [In] long[] inp); [DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] static extern bool blst_p1_on_curve([In] long[] point); [DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] static extern bool blst_p1_in_g1([In] long[] point); [DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] static extern bool blst_p1_is_inf([In] long[] point); [DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] static extern bool blst_p1_is_equal([In] long[] a, [In] long[] b); [DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] static extern void blst_sk_to_pk_in_g1([Out] long[] ret, [In] byte[] SK); [DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] static extern void blst_encode_to_g1([Out] long[] ret, [In] byte[] msg, size_t msg_len, [In] byte[] dst, size_t dst_len, [In] byte[] aug, size_t aug_len); [DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] static extern void blst_hash_to_g1([Out] long[] ret, [In] byte[] msg, size_t msg_len, [In] byte[] dst, size_t dst_len, [In] byte[] aug, size_t aug_len); [DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] static extern void blst_sign_pk_in_g2([Out] long[] ret, [In] long[] hash, [In] byte[] SK); [DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] static extern void blst_p1_mult([Out] long[] ret, [In] long[] a, [In] byte[] scalar, size_t nbits); [DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] static extern void blst_p1_cneg([Out] long[] ret, bool cbit); [DllImport("blst.dll", CallingConvention = 
CallingConvention.Cdecl)] static extern void blst_p1_add_or_double([Out] long[] ret, [In] long[] a, [In] long[] b); [DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] static extern void blst_p1_add_or_double_affine([Out] long[] ret, [In] long[] a, [In] long[] b); [DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] static extern void blst_p1_double([Out] long[] ret, [In] long[] a); public struct P1 { internal long[] point; private static readonly int sz = (int)blst_p1_sizeof()/sizeof(long); //public P1() { point = new long[sz]; } private P1(bool _) { point = new long[sz]; } private P1(P1 p) { point = (long[])p.point.Clone(); } private long[] self() { if (point==null) { point = new long[sz]; } return point; } public P1(SecretKey sk) : this(true) { blst_sk_to_pk_in_g1(point, sk.key); } public P1(byte[] inp) : this(true) { int len = inp.Length; if (len == 0 || len != ((inp[0]&0x80) == 0x80 ? P1_COMPRESSED_SZ : 2*P1_COMPRESSED_SZ)) throw new Exception(ERROR.BAD_ENCODING); ERROR err = blst_p1_deserialize(point, inp); if (err != ERROR.SUCCESS) throw new Exception(err); blst_p1_from_affine(point, point); } public P1(P1_Affine affine) : this(true) { blst_p1_from_affine(point, affine.point); } public P1 dup() { return new P1(this); } public P1_Affine to_affine() { return new P1_Affine(this); } public byte[] serialize() { byte[] ret = new byte[2*P1_COMPRESSED_SZ]; blst_p1_serialize(ret, point); return ret; } public byte[] compress() { byte[] ret = new byte[P1_COMPRESSED_SZ]; blst_p1_compress(ret, point); return ret; } public bool on_curve() { return blst_p1_on_curve(point); } public bool in_group() { return blst_p1_in_g1(point); } public bool is_inf() { return blst_p1_is_inf(point); } public bool is_equal(P1 p) { return blst_p1_is_equal(point, p.point); } public P1 hash_to(byte[] msg, string DST="", byte[] aug=null) { byte[] dst = Encoding.UTF8.GetBytes(DST); blst_hash_to_g1(self(), msg, (size_t)msg.Length, dst, (size_t)dst.Length, aug, (size_t)(aug!=null ? aug.Length : 0)); return this; } public P1 encode_to(byte[] msg, string DST="", byte[] aug=null) { byte[] dst = Encoding.UTF8.GetBytes(DST); blst_encode_to_g1(self(), msg, (size_t)msg.Length, dst, (size_t)dst.Length, aug, (size_t)(aug!=null ? 
aug.Length : 0)); return this; } public P1 sign_with(SecretKey sk) { blst_sign_pk_in_g2(point, point, sk.key); return this; } public P1 sign_with(Scalar scalar) { blst_sign_pk_in_g2(point, point, scalar.val); return this; } public void aggregate(P1_Affine inp) { if (blst_p1_affine_in_g1(inp.point)) blst_p1_add_or_double_affine(point, point, inp.point); else throw new Exception(ERROR.POINT_NOT_IN_GROUP); } public P1 mult(byte[] scalar) { blst_p1_mult(point, point, scalar, (size_t)(scalar.Length*8)); return this; } public P1 mult(Scalar scalar) { blst_p1_mult(point, point, scalar.val, (size_t)255); return this; } public P1 mult(BigInteger scalar) { byte[] val; if (scalar.Sign < 0) { val = BigInteger.Negate(scalar).ToByteArray(); blst_p1_cneg(point, true); } else { val = scalar.ToByteArray(); } int len = val.Length; if (val[len-1]==0) len--; blst_p1_mult(point, point, val, (size_t)(len*8)); return this; } public P1 cneg(bool flag) { blst_p1_cneg(point, flag); return this; } public P1 neg() { blst_p1_cneg(point, true); return this; } public P1 add(P1 a) { blst_p1_add_or_double(point, point, a.point); return this; } public P1 add(P1_Affine a) { blst_p1_add_or_double_affine(point, point, a.point); return this; } public P1 dbl() { blst_p1_double(point, point); return this; } public static P1 generator() { var ret = new P1(true); Marshal.Copy(blst_p1_generator(), ret.point, 0, ret.point.Length); return ret; } } public static P1 G1() { return P1.generator(); } [DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] static extern void blst_aggregated_in_g1([Out] long[] fp12, [In] long[] p); [DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] static extern ERROR blst_pairing_aggregate_pk_in_g1([In, Out] long[] fp12, [In] long[] pk, [In] long[] sig, [In] byte[] msg, size_t msg_len, [In] byte[] aug, size_t aug_len); [DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] static extern ERROR blst_pairing_mul_n_aggregate_pk_in_g1([In, Out] long[] fp12, [In] long[] pk, [In] long[] sig, [In] byte[] scalar, size_t nbits, [In] byte[] msg, size_t msg_len, [In] byte[] aug, size_t aug_len); """ bottom = """ [DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] static extern size_t blst_fp12_sizeof(); [DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] static extern void blst_miller_loop([Out] long[] fp12, [In] long[] q, [In] long[] p); [DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] static extern bool blst_fp12_is_one([In] long[] fp12); [DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] static extern bool blst_fp12_is_equal([In] long[] a, [In] long[] b); [DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] static extern void blst_fp12_sqr([Out] long[] ret, [In] long[] a); [DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] static extern void blst_fp12_mul([Out] long[] ret, [In] long[] a, [In] long[] b); [DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] static extern void blst_final_exp([Out] long[] ret, [In] long[] a); [DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] static extern bool blst_fp12_finalverify([In] long[] a, [In] long[] b); [DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] static extern IntPtr blst_fp12_one(); [DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] static extern bool blst_fp12_in_group([In] long[] a); [DllImport("blst.dll", CallingConvention = 
CallingConvention.Cdecl)] static extern void blst_bendian_from_fp12([Out] byte[] ret, [In] long[] a); public struct PT { internal readonly long[] fp12; private static readonly int sz = (int)blst_fp12_sizeof()/sizeof(long); internal PT(bool _) { fp12 = new long[sz]; } private PT(PT orig) { fp12 = (long[])orig.fp12.Clone(); } public PT(P1_Affine p) : this(true) { blst_aggregated_in_g1(fp12, p.point); } public PT(P1 p) : this(true) { blst_aggregated_in_g1(fp12, (new P1_Affine(p)).point); } public PT(P2_Affine q) : this(true) { blst_aggregated_in_g2(fp12, q.point); } public PT(P2 q) : this(true) { blst_aggregated_in_g2(fp12, (new P2_Affine(q)).point); } public PT(P2_Affine q, P1_Affine p) : this(true) { blst_miller_loop(fp12, q.point, p.point); } public PT(P1_Affine p, P2_Affine q) : this(q, p) {} public PT(P2 q, P1 p) : this(true) { blst_miller_loop(fp12, (new P2_Affine(q)).point, (new P1_Affine(p)).point); } public PT(P1 p, P2 q) : this(q, p) {} public PT dup() { return new PT(this); } public bool is_one() { return blst_fp12_is_one(fp12); } public bool is_equal(PT p) { return blst_fp12_is_equal(fp12, p.fp12); } public PT sqr() { blst_fp12_sqr(fp12, fp12); return this; } public PT mul(PT p) { blst_fp12_mul(fp12, fp12, p.fp12); return this; } public PT final_exp() { blst_final_exp(fp12, fp12); return this; } public bool in_group() { return blst_fp12_in_group(fp12); } public byte[] to_bendian() { byte[] ret = new byte[12*P1_COMPRESSED_SZ]; blst_bendian_from_fp12(ret, fp12); return ret; } public static bool finalverify(PT gt1, PT gt2) { return blst_fp12_finalverify(gt1.fp12, gt2.fp12); } public static PT one() { var ret = new PT(true); Marshal.Copy(blst_fp12_one(), ret.fp12, 0, ret.fp12.Length); return ret; } } [DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] static extern size_t blst_pairing_sizeof(); [DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] static extern void blst_pairing_init([In, Out] long[] ctx, bool hash_or_encode, [In] ref long dst, size_t dst_len); [DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] static extern void blst_pairing_commit([In, Out] long[] ctx); [DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] static extern ERROR blst_pairing_merge([In, Out] long[] ctx, [In] long[] ctx1); [DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] static extern bool blst_pairing_finalverify([In] long[] ctx, [In] long[] sig); [DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] static extern void blst_pairing_raw_aggregate([In, Out] long[] ctx, [In] long[] q, [In] long[] p); [DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] static extern IntPtr blst_pairing_as_fp12([In] long[] ctx); public struct Pairing { private readonly long[] ctx; private static readonly int sz = (int)blst_pairing_sizeof()/sizeof(long); public Pairing(bool hash_or_encode=false, string DST="") { byte[] dst = Encoding.UTF8.GetBytes(DST); int dst_len = dst.Length; int add_len = dst_len!=0 ? (dst_len+sizeof(long)-1)/sizeof(long) : 1; Array.Resize(ref dst, add_len*sizeof(long)); ctx = new long[sz+add_len]; for (int i=0; i<add_len; i++) ctx[sz+i] = BitConverter.ToInt64(dst, i*sizeof(long)); GCHandle h = GCHandle.Alloc(ctx, GCHandleType.Pinned); blst_pairing_init(ctx, hash_or_encode, ref ctx[sz], (size_t)dst_len); h.Free(); } public ERROR aggregate(P1_Affine pk, Nullable<P2_Affine> sig, byte[] msg, byte[] aug=null) { return blst_pairing_aggregate_pk_in_g1(ctx, pk.point, sig.HasValue ? sig.Value.point : null, msg, (size_t)msg.Length, aug, (size_t)(aug!=null ? aug.Length : 0)); } public ERROR aggregate(P2_Affine pk, Nullable<P1_Affine> sig, byte[] msg, byte[] aug=null) { return blst_pairing_aggregate_pk_in_g2(ctx, pk.point, sig.HasValue ? sig.Value.point : null, msg, (size_t)msg.Length, aug, (size_t)(aug!=null ? aug.Length : 0)); } public ERROR mul_n_aggregate(P2_Affine pk, P1_Affine sig, byte[] scalar, int nbits, byte[] msg, byte[] aug=null) { return blst_pairing_mul_n_aggregate_pk_in_g2(ctx, pk.point, sig.point, scalar, (size_t)nbits, msg, (size_t)msg.Length, aug, (size_t)(aug!=null ? aug.Length : 0)); } public ERROR mul_n_aggregate(P1_Affine pk, P2_Affine sig, byte[] scalar, int nbits, byte[] msg, byte[] aug=null) { return blst_pairing_mul_n_aggregate_pk_in_g1(ctx, pk.point, sig.point, scalar, (size_t)nbits, msg, (size_t)msg.Length, aug, (size_t)(aug!=null ? aug.Length : 0)); } public void commit() { blst_pairing_commit(ctx); } public void merge(Pairing a) { var err = blst_pairing_merge(ctx, a.ctx); if (err != ERROR.SUCCESS) throw new Exception(err); } public bool finalverify(PT sig=new PT()) { return blst_pairing_finalverify(ctx, sig.fp12); } public void raw_aggregate(P2_Affine q, P1_Affine p) { blst_pairing_raw_aggregate(ctx, q.point, p.point); } public void raw_aggregate(P1_Affine p, P2_Affine q) { raw_aggregate(q, p); } public void raw_aggregate(P2 q, P1 p) { blst_pairing_raw_aggregate(ctx, (new P2_Affine(q)).point, (new P1_Affine(p)).point); } public void raw_aggregate(P1 p, P2 q) { raw_aggregate(q, p); } public PT as_fp12() { var ret = new PT(true); GCHandle h = GCHandle.Alloc(ctx, GCHandleType.Pinned); Marshal.Copy(blst_pairing_as_fp12(ctx), ret.fp12, 0, ret.fp12.Length); h.Free(); return ret; } } }}""" here = re.split(r'[/\\](?=[^/\\]*$)', sys.argv[0]) if len(here) > 1: os.chdir(here[0]) def xchg_1vs2(matchobj): if matchobj.group(2) == '1': return matchobj.group(1) + '2' else: return matchobj.group(1) + '1' def newer(files): if len(files) == 1: return True rh = files[-1] if not os.path.exists(rh): return True for lh in files[:-1]: if os.stat(lh).st_ctime > os.stat(rh).st_ctime: return True return False fname = "supranational.blst.cs" if newer([here[-1], fname]): fd = open(fname, "w") print("//!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!", file=fd) print("// DO NOT EDIT THIS FILE!!!", file=fd) print("// The file is auto-generated by " + here[-1], file=fd) print("//!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!", file=fd) print("\n\n", file=fd) print(top, file=fd) print(middle, file=fd) print(re.sub(r'((?<!f)[pPgG_])([12])(?![0-9])', xchg_1vs2, middle), file=fd) # swap 1<->2 in point/group names to emit the G2 counterparts print(bottom, file=fd) fd.close()
================================================ FILE: bindings/c#/supranational.blst.cs ================================================ //!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! // DO NOT EDIT THIS FILE!!! // The file is auto-generated by run.me //!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! using System; using System.Text; using System.Numerics; using System.Runtime.InteropServices; using size_t = System.UIntPtr; #if NET5_0_OR_GREATER using System.Runtime.Loader; using System.Reflection; using System.IO; #endif namespace supranational { public static class blst { #if NET5_0_OR_GREATER private static readonly string dll; static blst() { if (String.IsNullOrEmpty(dll)) { var name = RuntimeInformation.IsOSPlatform(OSPlatform.Windows) ? "blst.dll" : RuntimeInformation.IsOSPlatform(OSPlatform.OSX) ? "libblst.dll.dylib" : "libblst.dll.so"; var dir = Path.GetDirectoryName(Assembly.GetExecutingAssembly().Location); var arch = RuntimeInformation.ProcessArchitecture switch { Architecture.X64 => "x64", Architecture.Arm64 => "arm64", _ => "unsupported" }; #if NET8_0_OR_GREATER // RuntimeInformation.RuntimeIdentifier changed between .NET 7 and 8 // and only aligns to the nuget layout in 8+ var rid = RuntimeInformation.RuntimeIdentifier; #else // Mimic pre-8 RuntimeInformation.RuntimeIdentifier as // "win-x64", "linux-x64", "linux-arm64", "osx-x64", etc. var os = RuntimeInformation.IsOSPlatform(OSPlatform.Windows) ? "win" : RuntimeInformation.IsOSPlatform(OSPlatform.OSX) ? "osx" : RuntimeInformation.IsOSPlatform(OSPlatform.FreeBSD) ? "freebsd" : "linux"; var rid = $"{os}-{arch}"; #endif // first look for the file in the standard locations for a nuget installed native lib dll = Path.Combine(dir, "runtimes", rid, "native", name); if (!File.Exists(dll)) dll = Path.Combine(dir, arch, name); // try the original non-standard location if (!File.Exists(dll)) dll = Path.Combine(Environment.CurrentDirectory, name); if (File.Exists(dll)) { AssemblyLoadContext.Default.ResolvingUnmanagedDll += (asm, needs) => (needs == "blst.dll" ?
NativeLibrary.Load(dll) : IntPtr.Zero); } } } #endif public enum ERROR { SUCCESS = 0, BAD_ENCODING, POINT_NOT_ON_CURVE, POINT_NOT_IN_GROUP, AGGR_TYPE_MISMATCH, VERIFY_FAIL, PK_IS_INFINITY, BAD_SCALAR, } public class Exception : ApplicationException { private readonly ERROR code; public Exception(ERROR err) { code = err; } public override string Message { get { switch(code) { case ERROR.BAD_ENCODING: return "bad encoding"; case ERROR.POINT_NOT_ON_CURVE: return "point not on curve"; case ERROR.POINT_NOT_IN_GROUP: return "point not in group"; case ERROR.AGGR_TYPE_MISMATCH: return "aggregate type mismatch"; case ERROR.VERIFY_FAIL: return "verify failure"; case ERROR.PK_IS_INFINITY: return "public key is infinity"; case ERROR.BAD_SCALAR: return "bad scalar"; default: return null; } } } } public enum ByteOrder { BigEndian, LittleEndian, } [DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] static extern void blst_keygen([Out] byte[] key, [In] byte[] IKM, size_t IKM_len, [In] byte[] info, size_t info_len); [DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] static extern void blst_keygen_v3([Out] byte[] key, [In] byte[] IKM, size_t IKM_len, [In] byte[] info, size_t info_len); [DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] static extern void blst_keygen_v4_5([Out] byte[] key, [In] byte[] IKM, size_t IKM_len, [In] byte[] salt, size_t salt_len, [In] byte[] info, size_t info_len); [DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] static extern void blst_keygen_v5([Out] byte[] key, [In] byte[] IKM, size_t IKM_len, [In] byte[] salt, size_t salt_len, [In] byte[] info, size_t info_len); [DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] static extern void blst_derive_master_eip2333([Out] byte[] key, [In] byte[] IKM, size_t IKM_len); [DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] static extern void blst_derive_child_eip2333([Out] byte[] key, [In] byte[] master, uint child_index); [DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] static extern void blst_scalar_from_bendian([Out] byte[] ret, [In] byte[] key); [DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] static extern void blst_bendian_from_scalar([Out] byte[] ret, [In] byte[] key); [DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] static extern bool blst_sk_check([In] byte[] key); [DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] static extern void blst_scalar_from_lendian([Out] byte[] key, [In] byte[] inp); [DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] static extern void blst_lendian_from_scalar([Out] byte[] key, [In] byte[] inp); public struct SecretKey { internal byte[] key; //public SecretKey() { key = new byte[32]; } public SecretKey(byte[] IKM, string info) { key = new byte[32]; keygen(IKM, info); } public SecretKey(byte[] inp, ByteOrder order=ByteOrder.BigEndian) { key = new byte[32]; switch(order) { case ByteOrder.BigEndian: from_bendian(inp); break; case ByteOrder.LittleEndian: from_lendian(inp); break; } } public void keygen(byte[] IKM, string info="") { if (key == null) key = new byte[32]; byte[] info_bytes = Encoding.UTF8.GetBytes(info); blst_keygen(key, IKM, (size_t)IKM.Length, info_bytes, (size_t)info_bytes.Length); } public void keygen_v3(byte[] IKM, string info="") { if (key == null) key = new byte[32]; byte[] info_bytes = Encoding.UTF8.GetBytes(info); blst_keygen_v3(key, IKM, (size_t)IKM.Length, info_bytes, 
(size_t)info_bytes.Length); } public void keygen_v4_5(byte[] IKM, string salt, string info="") { if (key == null) key = new byte[32]; byte[] salt_bytes = Encoding.UTF8.GetBytes(salt); byte[] info_bytes = Encoding.UTF8.GetBytes(info); blst_keygen_v4_5(key, IKM, (size_t)IKM.Length, salt_bytes, (size_t)salt_bytes.Length, info_bytes, (size_t)info_bytes.Length); } public void keygen_v5(byte[] IKM, byte[] salt, string info="") { if (key == null) key = new byte[32]; byte[] info_bytes = Encoding.UTF8.GetBytes(info); blst_keygen_v5(key, IKM, (size_t)IKM.Length, salt, (size_t)salt.Length, info_bytes, (size_t)info_bytes.Length); } public void keygen_v5(byte[] IKM, string salt, string info="") { keygen_v5(IKM, Encoding.UTF8.GetBytes(salt), info); } public void derive_master_eip2333(byte[] IKM) { if (key == null) key = new byte[32]; blst_derive_master_eip2333(key, IKM, (size_t)IKM.Length); } public SecretKey(SecretKey master, uint child_index) { key = new byte[32]; blst_derive_child_eip2333(key, master.key, child_index); } public void from_bendian(byte[] inp) { if (inp.Length != 32) throw new Exception(ERROR.BAD_ENCODING); if (key == null) key = new byte[32]; blst_scalar_from_bendian(key, inp); if (!blst_sk_check(key)) throw new Exception(ERROR.BAD_ENCODING); } public void from_lendian(byte[] inp) { if (inp.Length != 32) throw new Exception(ERROR.BAD_ENCODING); if (key == null) key = new byte[32]; blst_scalar_from_lendian(key, inp); if (!blst_sk_check(key)) throw new Exception(ERROR.BAD_ENCODING); } public byte[] to_bendian() { byte[] ret = new byte[32]; blst_bendian_from_scalar(ret, key); return ret; } public byte[] to_lendian() { byte[] ret = new byte[32]; blst_lendian_from_scalar(ret, key); return ret; } } [DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] static extern void blst_scalar_from_be_bytes([Out] byte[] ret, [In] byte[] inp, size_t inp_len); [DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] static extern void blst_scalar_from_le_bytes([Out] byte[] ret, [In] byte[] inp, size_t inp_len); [DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] static extern bool blst_sk_add_n_check([Out] byte[] ret, [In] byte[] a, [In] byte[] b); [DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] static extern bool blst_sk_sub_n_check([Out] byte[] ret, [In] byte[] a, [In] byte[] b); [DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] static extern bool blst_sk_mul_n_check([Out] byte[] ret, [In] byte[] a, [In] byte[] b); [DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] static extern void blst_sk_inverse([Out] byte[] ret, [In] byte[] a); public struct Scalar { internal byte[] val; //public Scalar() { val = new byte[32]; } public Scalar(byte[] inp, ByteOrder order=ByteOrder.BigEndian) { val = new byte[32]; switch(order) { case ByteOrder.BigEndian: from_bendian(inp); break; case ByteOrder.LittleEndian: from_lendian(inp); break; } } private Scalar(bool _) { val = new byte[32]; } private Scalar(Scalar orig) { val = (byte[])orig.val.Clone(); } public Scalar dup() { return new Scalar(this); } public void from_bendian(byte[] inp) { if (val == null) val = new byte[32]; blst_scalar_from_be_bytes(val, inp, (size_t)inp.Length); } public void from_lendian(byte[] inp) { if (val == null) val = new byte[32]; blst_scalar_from_le_bytes(val, inp, (size_t)inp.Length); } public byte[] to_bendian() { byte[] ret = new byte[32]; blst_bendian_from_scalar(ret, val); return ret; } public byte[] to_lendian() { byte[] ret = new 
byte[32]; blst_lendian_from_scalar(ret, val); return ret; } public Scalar add(SecretKey a) { if (!blst_sk_add_n_check(val, val, a.key)) throw new Exception(ERROR.BAD_SCALAR); return this; } public Scalar add(Scalar a) { if (!blst_sk_add_n_check(val, val, a.val)) throw new Exception(ERROR.BAD_SCALAR); return this; } public Scalar sub(Scalar a) { if (!blst_sk_sub_n_check(val, val, a.val)) throw new Exception(ERROR.BAD_SCALAR); return this; } public Scalar mul(Scalar a) { if (!blst_sk_mul_n_check(val, val, a.val)) throw new Exception(ERROR.BAD_SCALAR); return this; } public Scalar inverse() { blst_sk_inverse(val, val); return this; } public static Scalar operator+(Scalar a, Scalar b) { return a.dup().add(b); } public static Scalar operator-(Scalar a, Scalar b) { return a.dup().sub(b); } public static Scalar operator*(Scalar a, Scalar b) { return a.dup().mul(b); } public static Scalar operator/(Scalar a, Scalar b) { return b.dup().inverse().mul(a); } } private const int P1_COMPRESSED_SZ = 384/8; private const int P2_COMPRESSED_SZ = 2*P1_COMPRESSED_SZ; [DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] static extern size_t blst_p1_affine_sizeof(); [DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] static extern ERROR blst_p1_deserialize([Out] long[] ret, [In] byte[] inp); [DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] static extern void blst_p1_affine_serialize([Out] byte[] ret, [In] long[] inp); [DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] static extern void blst_p1_affine_compress([Out] byte[] ret, [In] long[] inp); [DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] static extern void blst_p1_to_affine([Out] long[] ret, [In] long[] inp); [DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] static extern bool blst_p1_affine_on_curve([In] long[] point); [DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] static extern bool blst_p1_affine_in_g1([In] long[] point); [DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] static extern bool blst_p1_affine_is_inf([In] long[] point); [DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] static extern bool blst_p1_affine_is_equal([In] long[] a, [In] long[] b); [DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] static extern IntPtr blst_p1_generator(); [DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] static extern ERROR blst_core_verify_pk_in_g2([In] long[] pk, [In] long[] sig, bool hash_or_encode, [In] byte[] msg, size_t msg_len, [In] byte[] dst, size_t dst_len, [In] byte[] aug, size_t aug_len); public struct P1_Affine { internal readonly long[] point; private static readonly int sz = (int)blst_p1_affine_sizeof()/sizeof(long); //public P1_Affine() { point = new long[sz]; } private P1_Affine(bool _) { point = new long[sz]; } private P1_Affine(P1_Affine p) { point = (long[])p.point.Clone(); } public P1_Affine(byte[] inp) : this(true) { int len = inp.Length; if (len == 0 || len != ((inp[0]&0x80) == 0x80 ? 
P1_COMPRESSED_SZ : 2*P1_COMPRESSED_SZ)) throw new Exception(ERROR.BAD_ENCODING); ERROR err = blst_p1_deserialize(point, inp); if (err != ERROR.SUCCESS) throw new Exception(err); } public P1_Affine(P1 jacobian) : this(true) { blst_p1_to_affine(point, jacobian.point); } public P1_Affine dup() { return new P1_Affine(this); } public P1 to_jacobian() { return new P1(this); } public byte[] serialize() { byte[] ret = new byte[2*P1_COMPRESSED_SZ]; blst_p1_affine_serialize(ret, point); return ret; } public byte[] compress() { byte[] ret = new byte[P1_COMPRESSED_SZ]; blst_p1_affine_compress(ret, point); return ret; } public bool on_curve() { return blst_p1_affine_on_curve(point); } public bool in_group() { return blst_p1_affine_in_g1(point); } public bool is_inf() { return blst_p1_affine_is_inf(point); } public bool is_equal(P1_Affine p) { return blst_p1_affine_is_equal(point, p.point); } ERROR core_verify(P2_Affine pk, bool hash_or_encode, byte[] msg, string DST = "", byte[] aug = null) { byte[] dst = Encoding.UTF8.GetBytes(DST); return blst_core_verify_pk_in_g2(pk.point, point, hash_or_encode, msg, (size_t)msg.Length, dst, (size_t)dst.Length, aug, (size_t)(aug!=null ? aug.Length : 0)); } public static P1_Affine generator() { var ret = new P1_Affine(true); Marshal.Copy(blst_p1_generator(), ret.point, 0, ret.point.Length); return ret; } } [DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] static extern size_t blst_p1_sizeof(); [DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] static extern void blst_p1_serialize([Out] byte[] ret, [In] long[] inp); [DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] static extern void blst_p1_compress([Out] byte[] ret, [In] long[] inp); [DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] static extern void blst_p1_from_affine([Out] long[] ret, [In] long[] inp); [DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] static extern bool blst_p1_on_curve([In] long[] point); [DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] static extern bool blst_p1_in_g1([In] long[] point); [DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] static extern bool blst_p1_is_inf([In] long[] point); [DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] static extern bool blst_p1_is_equal([In] long[] a, [In] long[] b); [DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] static extern void blst_sk_to_pk_in_g1([Out] long[] ret, [In] byte[] SK); [DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] static extern void blst_encode_to_g1([Out] long[] ret, [In] byte[] msg, size_t msg_len, [In] byte[] dst, size_t dst_len, [In] byte[] aug, size_t aug_len); [DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] static extern void blst_hash_to_g1([Out] long[] ret, [In] byte[] msg, size_t msg_len, [In] byte[] dst, size_t dst_len, [In] byte[] aug, size_t aug_len); [DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] static extern void blst_sign_pk_in_g2([Out] long[] ret, [In] long[] hash, [In] byte[] SK); [DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] static extern void blst_p1_mult([Out] long[] ret, [In] long[] a, [In] byte[] scalar, size_t nbits); [DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] static extern void blst_p1_cneg([Out] long[] ret, bool cbit); [DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] static extern void 
blst_p1_add_or_double([Out] long[] ret, [In] long[] a, [In] long[] b); [DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] static extern void blst_p1_add_or_double_affine([Out] long[] ret, [In] long[] a, [In] long[] b); [DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] static extern void blst_p1_double([Out] long[] ret, [In] long[] a); public struct P1 { internal long[] point; private static readonly int sz = (int)blst_p1_sizeof()/sizeof(long); //public P1() { point = new long[sz]; } private P1(bool _) { point = new long[sz]; } private P1(P1 p) { point = (long[])p.point.Clone(); } private long[] self() { if (point==null) { point = new long[sz]; } return point; } public P1(SecretKey sk) : this(true) { blst_sk_to_pk_in_g1(point, sk.key); } public P1(byte[] inp) : this(true) { int len = inp.Length; if (len == 0 || len != ((inp[0]&0x80) == 0x80 ? P1_COMPRESSED_SZ : 2*P1_COMPRESSED_SZ)) throw new Exception(ERROR.BAD_ENCODING); ERROR err = blst_p1_deserialize(point, inp); if (err != ERROR.SUCCESS) throw new Exception(err); blst_p1_from_affine(point, point); } public P1(P1_Affine affine) : this(true) { blst_p1_from_affine(point, affine.point); } public P1 dup() { return new P1(this); } public P1_Affine to_affine() { return new P1_Affine(this); } public byte[] serialize() { byte[] ret = new byte[2*P1_COMPRESSED_SZ]; blst_p1_serialize(ret, point); return ret; } public byte[] compress() { byte[] ret = new byte[P1_COMPRESSED_SZ]; blst_p1_compress(ret, point); return ret; } public bool on_curve() { return blst_p1_on_curve(point); } public bool in_group() { return blst_p1_in_g1(point); } public bool is_inf() { return blst_p1_is_inf(point); } public bool is_equal(P1 p) { return blst_p1_is_equal(point, p.point); } public P1 hash_to(byte[] msg, string DST="", byte[] aug=null) { byte[] dst = Encoding.UTF8.GetBytes(DST); blst_hash_to_g1(self(), msg, (size_t)msg.Length, dst, (size_t)dst.Length, aug, (size_t)(aug!=null ? aug.Length : 0)); return this; } public P1 encode_to(byte[] msg, string DST="", byte[] aug=null) { byte[] dst = Encoding.UTF8.GetBytes(DST); blst_encode_to_g1(self(), msg, (size_t)msg.Length, dst, (size_t)dst.Length, aug, (size_t)(aug!=null ? 
aug.Length : 0)); return this; } public P1 sign_with(SecretKey sk) { blst_sign_pk_in_g2(point, point, sk.key); return this; } public P1 sign_with(Scalar scalar) { blst_sign_pk_in_g2(point, point, scalar.val); return this; } public void aggregate(P1_Affine inp) { if (blst_p1_affine_in_g1(inp.point)) blst_p1_add_or_double_affine(point, point, inp.point); else throw new Exception(ERROR.POINT_NOT_IN_GROUP); } public P1 mult(byte[] scalar) { blst_p1_mult(point, point, scalar, (size_t)(scalar.Length*8)); return this; } public P1 mult(Scalar scalar) { blst_p1_mult(point, point, scalar.val, (size_t)255); return this; } public P1 mult(BigInteger scalar) { byte[] val; if (scalar.Sign < 0) { val = BigInteger.Negate(scalar).ToByteArray(); blst_p1_cneg(point, true); } else { val = scalar.ToByteArray(); } int len = val.Length; if (val[len-1]==0) len--; blst_p1_mult(point, point, val, (size_t)(len*8)); return this; } public P1 cneg(bool flag) { blst_p1_cneg(point, flag); return this; } public P1 neg() { blst_p1_cneg(point, true); return this; } public P1 add(P1 a) { blst_p1_add_or_double(point, point, a.point); return this; } public P1 add(P1_Affine a) { blst_p1_add_or_double_affine(point, point, a.point); return this; } public P1 dbl() { blst_p1_double(point, point); return this; } public static P1 generator() { var ret = new P1(true); Marshal.Copy(blst_p1_generator(), ret.point, 0, ret.point.Length); return ret; } } public static P1 G1() { return P1.generator(); } [DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] static extern void blst_aggregated_in_g1([Out] long[] fp12, [In] long[] p); [DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] static extern ERROR blst_pairing_aggregate_pk_in_g1([In, Out] long[] fp12, [In] long[] pk, [In] long[] sig, [In] byte[] msg, size_t msg_len, [In] byte[] aug, size_t aug_len); [DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] static extern ERROR blst_pairing_mul_n_aggregate_pk_in_g1([In, Out] long[] fp12, [In] long[] pk, [In] long[] sig, [In] byte[] scalar, size_t nbits, [In] byte[] msg, size_t msg_len, [In] byte[] aug, size_t aug_len); [DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] static extern size_t blst_p2_affine_sizeof(); [DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] static extern ERROR blst_p2_deserialize([Out] long[] ret, [In] byte[] inp); [DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] static extern void blst_p2_affine_serialize([Out] byte[] ret, [In] long[] inp); [DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] static extern void blst_p2_affine_compress([Out] byte[] ret, [In] long[] inp); [DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] static extern void blst_p2_to_affine([Out] long[] ret, [In] long[] inp); [DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] static extern bool blst_p2_affine_on_curve([In] long[] point); [DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] static extern bool blst_p2_affine_in_g2([In] long[] point); [DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] static extern bool blst_p2_affine_is_inf([In] long[] point); [DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] static extern bool blst_p2_affine_is_equal([In] long[] a, [In] long[] b); [DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] static extern IntPtr blst_p2_generator(); [DllImport("blst.dll", CallingConvention = 
CallingConvention.Cdecl)] static extern ERROR blst_core_verify_pk_in_g1([In] long[] pk, [In] long[] sig, bool hash_or_encode, [In] byte[] msg, size_t msg_len, [In] byte[] dst, size_t dst_len, [In] byte[] aug, size_t aug_len); public struct P2_Affine { internal readonly long[] point; private static readonly int sz = (int)blst_p2_affine_sizeof()/sizeof(long); //public P2_Affine() { point = new long[sz]; } private P2_Affine(bool _) { point = new long[sz]; } private P2_Affine(P2_Affine p) { point = (long[])p.point.Clone(); } public P2_Affine(byte[] inp) : this(true) { int len = inp.Length; if (len == 0 || len != ((inp[0]&0x80) == 0x80 ? P2_COMPRESSED_SZ : 2*P2_COMPRESSED_SZ)) throw new Exception(ERROR.BAD_ENCODING); ERROR err = blst_p2_deserialize(point, inp); if (err != ERROR.SUCCESS) throw new Exception(err); } public P2_Affine(P2 jacobian) : this(true) { blst_p2_to_affine(point, jacobian.point); } public P2_Affine dup() { return new P2_Affine(this); } public P2 to_jacobian() { return new P2(this); } public byte[] serialize() { byte[] ret = new byte[2*P2_COMPRESSED_SZ]; blst_p2_affine_serialize(ret, point); return ret; } public byte[] compress() { byte[] ret = new byte[P2_COMPRESSED_SZ]; blst_p2_affine_compress(ret, point); return ret; } public bool on_curve() { return blst_p2_affine_on_curve(point); } public bool in_group() { return blst_p2_affine_in_g2(point); } public bool is_inf() { return blst_p2_affine_is_inf(point); } public bool is_equal(P2_Affine p) { return blst_p2_affine_is_equal(point, p.point); } ERROR core_verify(P1_Affine pk, bool hash_or_encode, byte[] msg, string DST = "", byte[] aug = null) { byte[] dst = Encoding.UTF8.GetBytes(DST); return blst_core_verify_pk_in_g1(pk.point, point, hash_or_encode, msg, (size_t)msg.Length, dst, (size_t)dst.Length, aug, (size_t)(aug!=null ? 
aug.Length : 0)); } public static P2_Affine generator() { var ret = new P2_Affine(true); Marshal.Copy(blst_p2_generator(), ret.point, 0, ret.point.Length); return ret; } } [DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] static extern size_t blst_p2_sizeof(); [DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] static extern void blst_p2_serialize([Out] byte[] ret, [In] long[] inp); [DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] static extern void blst_p2_compress([Out] byte[] ret, [In] long[] inp); [DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] static extern void blst_p2_from_affine([Out] long[] ret, [In] long[] inp); [DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] static extern bool blst_p2_on_curve([In] long[] point); [DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] static extern bool blst_p2_in_g2([In] long[] point); [DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] static extern bool blst_p2_is_inf([In] long[] point); [DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] static extern bool blst_p2_is_equal([In] long[] a, [In] long[] b); [DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] static extern void blst_sk_to_pk_in_g2([Out] long[] ret, [In] byte[] SK); [DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] static extern void blst_encode_to_g2([Out] long[] ret, [In] byte[] msg, size_t msg_len, [In] byte[] dst, size_t dst_len, [In] byte[] aug, size_t aug_len); [DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] static extern void blst_hash_to_g2([Out] long[] ret, [In] byte[] msg, size_t msg_len, [In] byte[] dst, size_t dst_len, [In] byte[] aug, size_t aug_len); [DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] static extern void blst_sign_pk_in_g1([Out] long[] ret, [In] long[] hash, [In] byte[] SK); [DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] static extern void blst_p2_mult([Out] long[] ret, [In] long[] a, [In] byte[] scalar, size_t nbits); [DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] static extern void blst_p2_cneg([Out] long[] ret, bool cbit); [DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] static extern void blst_p2_add_or_double([Out] long[] ret, [In] long[] a, [In] long[] b); [DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] static extern void blst_p2_add_or_double_affine([Out] long[] ret, [In] long[] a, [In] long[] b); [DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] static extern void blst_p2_double([Out] long[] ret, [In] long[] a); public struct P2 { internal long[] point; private static readonly int sz = (int)blst_p2_sizeof()/sizeof(long); //public P2() { point = new long[sz]; } private P2(bool _) { point = new long[sz]; } private P2(P2 p) { point = (long[])p.point.Clone(); } private long[] self() { if (point==null) { point = new long[sz]; } return point; } public P2(SecretKey sk) : this(true) { blst_sk_to_pk_in_g2(point, sk.key); } public P2(byte[] inp) : this(true) { int len = inp.Length; if (len == 0 || len != ((inp[0]&0x80) == 0x80 ? 
P2_COMPRESSED_SZ : 2*P2_COMPRESSED_SZ)) throw new Exception(ERROR.BAD_ENCODING); ERROR err = blst_p2_deserialize(point, inp); if (err != ERROR.SUCCESS) throw new Exception(err); blst_p2_from_affine(point, point); } public P2(P2_Affine affine) : this(true) { blst_p2_from_affine(point, affine.point); } public P2 dup() { return new P2(this); } public P2_Affine to_affine() { return new P2_Affine(this); } public byte[] serialize() { byte[] ret = new byte[2*P2_COMPRESSED_SZ]; blst_p2_serialize(ret, point); return ret; } public byte[] compress() { byte[] ret = new byte[P2_COMPRESSED_SZ]; blst_p2_compress(ret, point); return ret; } public bool on_curve() { return blst_p2_on_curve(point); } public bool in_group() { return blst_p2_in_g2(point); } public bool is_inf() { return blst_p2_is_inf(point); } public bool is_equal(P2 p) { return blst_p2_is_equal(point, p.point); } public P2 hash_to(byte[] msg, string DST="", byte[] aug=null) { byte[] dst = Encoding.UTF8.GetBytes(DST); blst_hash_to_g2(self(), msg, (size_t)msg.Length, dst, (size_t)dst.Length, aug, (size_t)(aug!=null ? aug.Length : 0)); return this; } public P2 encode_to(byte[] msg, string DST="", byte[] aug=null) { byte[] dst = Encoding.UTF8.GetBytes(DST); blst_encode_to_g2(self(), msg, (size_t)msg.Length, dst, (size_t)dst.Length, aug, (size_t)(aug!=null ? aug.Length : 0)); return this; } public P2 sign_with(SecretKey sk) { blst_sign_pk_in_g1(point, point, sk.key); return this; } public P2 sign_with(Scalar scalar) { blst_sign_pk_in_g1(point, point, scalar.val); return this; } public void aggregate(P2_Affine inp) { if (blst_p2_affine_in_g2(inp.point)) blst_p2_add_or_double_affine(point, point, inp.point); else throw new Exception(ERROR.POINT_NOT_IN_GROUP); } public P2 mult(byte[] scalar) { blst_p2_mult(point, point, scalar, (size_t)(scalar.Length*8)); return this; } public P2 mult(Scalar scalar) { blst_p2_mult(point, point, scalar.val, (size_t)255); return this; } public P2 mult(BigInteger scalar) { byte[] val; if (scalar.Sign < 0) { val = BigInteger.Negate(scalar).ToByteArray(); blst_p2_cneg(point, true); } else { val = scalar.ToByteArray(); } int len = val.Length; if (val[len-1]==0) len--; blst_p2_mult(point, point, val, (size_t)(len*8)); return this; } public P2 cneg(bool flag) { blst_p2_cneg(point, flag); return this; } public P2 neg() { blst_p2_cneg(point, true); return this; } public P2 add(P2 a) { blst_p2_add_or_double(point, point, a.point); return this; } public P2 add(P2_Affine a) { blst_p2_add_or_double_affine(point, point, a.point); return this; } public P2 dbl() { blst_p2_double(point, point); return this; } public static P2 generator() { var ret = new P2(true); Marshal.Copy(blst_p2_generator(), ret.point, 0, ret.point.Length); return ret; } } public static P2 G2() { return P2.generator(); } [DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] static extern void blst_aggregated_in_g2([Out] long[] fp12, [In] long[] p); [DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] static extern ERROR blst_pairing_aggregate_pk_in_g2([In, Out] long[] fp12, [In] long[] pk, [In] long[] sig, [In] byte[] msg, size_t msg_len, [In] byte[] aug, size_t aug_len); [DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] static extern ERROR blst_pairing_mul_n_aggregate_pk_in_g2([In, Out] long[] fp12, [In] long[] pk, [In] long[] sig, [In] byte[] scalar, size_t nbits, [In] byte[] msg, size_t msg_len, [In] byte[] aug, size_t aug_len); [DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] static extern 
size_t blst_fp12_sizeof(); [DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] static extern void blst_miller_loop([Out] long[] fp12, [In] long[] q, [In] long[] p); [DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] static extern bool blst_fp12_is_one([In] long[] fp12); [DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] static extern bool blst_fp12_is_equal([In] long[] a, [In] long[] b); [DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] static extern void blst_fp12_sqr([Out] long[] ret, [In] long[] a); [DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] static extern void blst_fp12_mul([Out] long[] ret, [In] long[] a, [In] long[] b); [DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] static extern void blst_final_exp([Out] long[] ret, [In] long[] a); [DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] static extern bool blst_fp12_finalverify([In] long[] a, [In] long[] b); [DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] static extern IntPtr blst_fp12_one(); [DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] static extern bool blst_fp12_in_group([In] long[] a); [DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] static extern void blst_bendian_from_fp12([Out] byte[] ret, [In] long[] a); public struct PT { internal readonly long[] fp12; private static readonly int sz = (int)blst_fp12_sizeof()/sizeof(long); internal PT(bool _) { fp12 = new long[sz]; } private PT(PT orig) { fp12 = (long[])orig.fp12.Clone(); } public PT(P1_Affine p) : this(true) { blst_aggregated_in_g1(fp12, p.point); } public PT(P1 p) : this(true) { blst_aggregated_in_g1(fp12, (new P1_Affine(p)).point); } public PT(P2_Affine q) : this(true) { blst_aggregated_in_g2(fp12, q.point); } public PT(P2 q) : this(true) { blst_aggregated_in_g2(fp12, (new P2_Affine(q)).point); } public PT(P2_Affine q, P1_Affine p) : this(true) { blst_miller_loop(fp12, q.point, p.point); } public PT(P1_Affine p, P2_Affine q) : this(q, p) {} public PT(P2 q, P1 p) : this(true) { blst_miller_loop(fp12, (new P2_Affine(q)).point, (new P1_Affine(p)).point); } public PT(P1 p, P2 q) : this(q, p) {} public PT dup() { return new PT(this); } public bool is_one() { return blst_fp12_is_one(fp12); } public bool is_equal(PT p) { return blst_fp12_is_equal(fp12, p.fp12); } public PT sqr() { blst_fp12_sqr(fp12, fp12); return this; } public PT mul(PT p) { blst_fp12_mul(fp12, fp12, p.fp12); return this; } public PT final_exp() { blst_final_exp(fp12, fp12); return this; } public bool in_group() { return blst_fp12_in_group(fp12); } public byte[] to_bendian() { byte[] ret = new byte[12*P1_COMPRESSED_SZ]; blst_bendian_from_fp12(ret, fp12); return ret; } public static bool finalverify(PT gt1, PT gt2) { return blst_fp12_finalverify(gt1.fp12, gt2.fp12); } public static PT one() { var ret = new PT(true); Marshal.Copy(blst_fp12_one(), ret.fp12, 0, ret.fp12.Length); return ret; } } [DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] static extern size_t blst_pairing_sizeof(); [DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] static extern void blst_pairing_init([In, Out] long[] ctx, bool hash_or_encode, [In] ref long dst, size_t dst_len); [DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] static extern void blst_pairing_commit([In, Out] long[] ctx); [DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] static extern ERROR 
blst_pairing_merge([In, Out] long[] ctx, [In] long[] ctx1); [DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] static extern bool blst_pairing_finalverify([In] long[] ctx, [In] long[] sig); [DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] static extern void blst_pairing_raw_aggregate([In, Out] long[] ctx, [In] long[] q, [In] long[] p); [DllImport("blst.dll", CallingConvention = CallingConvention.Cdecl)] static extern IntPtr blst_pairing_as_fp12([In] long[] ctx); public struct Pairing { private readonly long[] ctx; private static readonly int sz = (int)blst_pairing_sizeof()/sizeof(long); public Pairing(bool hash_or_encode=false, string DST="") { byte[] dst = Encoding.UTF8.GetBytes(DST); int dst_len = dst.Length; int add_len = dst_len!=0 ? (dst_len+sizeof(long)-1)/sizeof(long) : 1; Array.Resize(ref dst, add_len*sizeof(long)); ctx = new long[sz+add_len]; for (int i=0; i<add_len; i++) ctx[sz+i] = BitConverter.ToInt64(dst, i*sizeof(long)); GCHandle h = GCHandle.Alloc(ctx, GCHandleType.Pinned); blst_pairing_init(ctx, hash_or_encode, ref ctx[sz], (size_t)dst_len); h.Free(); } public ERROR aggregate(P1_Affine pk, Nullable<P2_Affine> sig, byte[] msg, byte[] aug=null) { return blst_pairing_aggregate_pk_in_g1(ctx, pk.point, sig.HasValue ? sig.Value.point : null, msg, (size_t)msg.Length, aug, (size_t)(aug!=null ? aug.Length : 0)); } public ERROR aggregate(P2_Affine pk, Nullable<P1_Affine> sig, byte[] msg, byte[] aug=null) { return blst_pairing_aggregate_pk_in_g2(ctx, pk.point, sig.HasValue ? sig.Value.point : null, msg, (size_t)msg.Length, aug, (size_t)(aug!=null ? aug.Length : 0)); } public ERROR mul_n_aggregate(P2_Affine pk, P1_Affine sig, byte[] scalar, int nbits, byte[] msg, byte[] aug=null) { return blst_pairing_mul_n_aggregate_pk_in_g2(ctx, pk.point, sig.point, scalar, (size_t)nbits, msg, (size_t)msg.Length, aug, (size_t)(aug!=null ? aug.Length : 0)); } public ERROR mul_n_aggregate(P1_Affine pk, P2_Affine sig, byte[] scalar, int nbits, byte[] msg, byte[] aug=null) { return blst_pairing_mul_n_aggregate_pk_in_g1(ctx, pk.point, sig.point, scalar, (size_t)nbits, msg, (size_t)msg.Length, aug, (size_t)(aug!=null ? aug.Length : 0)); } public void commit() { blst_pairing_commit(ctx); } public void merge(Pairing a) { var err = blst_pairing_merge(ctx, a.ctx); if (err != ERROR.SUCCESS) throw new Exception(err); } public bool finalverify(PT sig=new PT()) { return blst_pairing_finalverify(ctx, sig.fp12); } public void raw_aggregate(P2_Affine q, P1_Affine p) { blst_pairing_raw_aggregate(ctx, q.point, p.point); } public void raw_aggregate(P1_Affine p, P2_Affine q) { raw_aggregate(q, p); } public void raw_aggregate(P2 q, P1 p) { blst_pairing_raw_aggregate(ctx, (new P2_Affine(q)).point, (new P1_Affine(p)).point); } public void raw_aggregate(P1 p, P2 q) { raw_aggregate(q, p); } public PT as_fp12() { var ret = new PT(true); GCHandle h = GCHandle.Alloc(ctx, GCHandleType.Pinned); Marshal.Copy(blst_pairing_as_fp12(ctx), ret.fp12, 0, ret.fp12.Length); h.Free(); return ret; } } }} ================================================ FILE: bindings/go/README.md ================================================ # blst [![Lint Status](https://github.com/supranational/blst/workflows/golang-lint/badge.svg)](https://github.com/supranational/blst/actions/workflows/golang-lint.yml) The `blst` package provides a Go interface to the blst BLS12-381 signature library. ## Build The build process consists of two steps, code generation followed by compilation. ``` ./generate.py # Optional - only required if making code changes go build go test ``` The generate.py script is used to generate both min-pk and min-sig variants of the binding from a common code base.
It consumes the `*.tgo` files along with `blst_minpk_test.go` and produces `blst.go` and `blst_minsig_test.go`. The .tgo files can be treated as if they were .go files, including the use of gofmt and goimports. The generate script will filter out extra imports while processing and automatically run goimports on the final blst.go file. After running generate.py, `go build` and `go test` can be run as usual. Cgo will compile `cgo_server.c`, which includes the required C implementation files, and `cgo_assembly.S`, which includes appropriate pre-generated assembly code for the platform. #### Caveats If the test or target application crashes with an "illegal instruction" exception [after copying to an older system], rebuild with `CGO_CFLAGS` environment variable set to `-O2 -D__BLST_PORTABLE__`. Don't forget `-O2`! On Windows the C compiler invoked by cgo, one denoted in `go env CC` output, has to target [MinGW](https://www.mingw-w64.org/). Verify with `<compiler> -dM -E -x c nul: | findstr "MINGW64"`, where `<compiler>` is the one reported by `go env CC`. If you're cross-compiling, you have to set the `CC` environment variable to the target C cross-compiler and `CGO_ENABLED` to 1. For example, to compile the test program for ARM: ``` env GOARCH=arm CC=arm-linux-gnueabi-gcc CGO_ENABLED=1 go test -c ``` ## Usage There are two primary modes of operation that can be chosen based on type definitions in the application. For minimal-pubkey-size operations the application would define core types as: ``` type PublicKey = blst.P1Affine type Signature = blst.P2Affine type AggregateSignature = blst.P2Aggregate type AggregatePublicKey = blst.P1Aggregate ``` For minimal-signature-size operations: ``` type PublicKey = blst.P2Affine type Signature = blst.P1Affine type AggregateSignature = blst.P1Aggregate type AggregatePublicKey = blst.P2Aggregate ``` A complete example for generating a key, signing a message, and verifying the message: ``` package main import ( "crypto/rand" "fmt" blst "github.com/supranational/blst/bindings/go" ) type PublicKey = blst.P1Affine type Signature = blst.P2Affine type AggregateSignature = blst.P2Aggregate type AggregatePublicKey = blst.P1Aggregate func main() { var ikm [32]byte _, _ = rand.Read(ikm[:]) sk := blst.KeyGen(ikm[:]) pk := new(PublicKey).From(sk) var dst = []byte("BLS_SIG_BLS12381G2_XMD:SHA-256_SSWU_RO_NUL_") msg := []byte("hello foo") sig := new(Signature).Sign(sk, msg, dst) if !sig.Verify(true, pk, true, msg, dst) { fmt.Println("ERROR: Invalid!") } else { fmt.Println("Valid!") } } ``` See the tests for further examples of usage.
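For aggregated signatures over a single message, the flow is: aggregate, convert to affine, then fast-verify. The following sketch is illustrative rather than lifted from the test suite; the three-signer setup and the proof-of-possession ciphersuite DST are assumptions made for the example, while every call follows the min-pk API documented below.

```
package main

import (
	"crypto/rand"
	"fmt"

	blst "github.com/supranational/blst/bindings/go"
)

type PublicKey = blst.P1Affine
type Signature = blst.P2Affine
type AggregateSignature = blst.P2Aggregate

func main() {
	var dst = []byte("BLS_SIG_BLS12381G2_XMD:SHA-256_SSWU_RO_POP_")
	msg := []byte("hello foo")

	// Three independent signers, all signing the same message.
	pks := make([]*PublicKey, 3)
	sigs := make([]*Signature, 3)
	for i := range pks {
		var ikm [32]byte
		_, _ = rand.Read(ikm[:])
		sk := blst.KeyGen(ikm[:])
		pks[i] = new(PublicKey).From(sk)
		sigs[i] = new(Signature).Sign(sk, msg, dst)
	}

	// Aggregate the individual signatures (with group-check) and
	// convert the accumulated point back to affine form.
	agg := new(AggregateSignature)
	if !agg.Aggregate(sigs, true) {
		panic("aggregation failed")
	}
	aggSig := agg.ToAffine()

	// All signers signed the same message, so FastAggregateVerify applies.
	if aggSig.FastAggregateVerify(true, pks, msg, dst) {
		fmt.Println("Valid!")
	} else {
		fmt.Println("ERROR: Invalid!")
	}
}
```

Note that `FastAggregateVerify` assumes the public keys were verified for proof of possession; without that assumption, use `AggregateVerify` with per-signer messages instead.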
## Core Methods ### SecretKey Methods - `KeyGen(ikm []byte, optional ...[]byte) *SecretKey` - Derive the secret key scalar from secret input key material, with optional application-specific info - `Serialize() []byte` - Serialize the secret key to bytes - `Deserialize(data []byte) *SecretKey` - Deserialize secret key from bytes - `Zeroize()` - Securely zero out the secret key ### PublicKey (P1Affine in minimal-pubkey-size) Methods - `From(sk *SecretKey) *PublicKey` - Derive public key from secret key - `Compress() []byte` - Serialize public key to compressed format - `Uncompress(data []byte) *PublicKey` - Decompress public key from bytes - `Serialize() []byte` - Serialize public key to uncompressed format - `Deserialize(data []byte) *PublicKey` - Deserialize public key from bytes ### Signature (P2Affine in minimal-pubkey-size) Methods - `Sign(sk *SecretKey, msg []byte, dst []byte, ...interface{}) *Signature` - Sign a message - `Compress() []byte` - Serialize signature to compressed format - `Uncompress(data []byte) *Signature` - Decompress signature from bytes - `BatchUncompress(compressedSigs [][]byte) []*Signature` - Efficiently uncompress multiple signatures - `Serialize() []byte` - Serialize signature to uncompressed format - `Deserialize(data []byte) *Signature` - Deserialize signature from bytes - `Verify(sigCheck bool, pk *PublicKey, pkCheck bool, msg []byte, dst []byte, ...interface{}) bool` - Verify a signature - `VerifyCompressed(sig []byte, sigCheck bool, pk []byte, pkCheck bool, msg []byte, dst []byte, ...interface{}) bool` - Verify a serialized signature in compressed format - `AggregateVerify(sigCheck bool, pks []*PublicKey, pkCheck bool, msgs [][]byte, dst []byte) bool` - Verify an aggregated signature for multiple messages - `AggregateVerifyCompressed(sig []byte, sigCheck bool, pks [][]byte, pkCheck bool, msgs [][]byte, dst []byte) bool` - Verify an aggregated serialized signature in compressed format - `FastAggregateVerify(sigCheck bool, pks []*PublicKey, msg []byte, dst []byte) bool` - Fast verify for same message - `MultipleAggregateVerify(sigs []*Signature, sigCheck bool, pks []*PublicKey, pkCheck bool, msgs [][]byte, dst []byte, randFn func(*Scalar), randBits int) bool` - Verify multiple signatures ### Aggregate Methods - `AggregatePublicKey.Aggregate(pks []*PublicKey, check bool)` - Aggregate multiple public keys - `AggregateSignature.Aggregate(sigs []*Signature, check bool)` - Aggregate multiple signatures - `AggregateSignature.AggregateCompressed(compressedSigs [][]byte, check bool)` - Aggregate multiple serialized signatures in compressed format - `AggregatePublicKey.ToAffine() *PublicKey` - Convert aggregate to affine form - `AggregateSignature.ToAffine() *Signature` - Convert aggregate to affine form ## Utility Functions - `HashToG1(msg []byte, dst []byte, optional... []byte) *P1` - Hash message [with optional augmentation] to G1 point - `HashToG2(msg []byte, dst []byte, optional... []byte) *P2` - Hash message [with optional augmentation] to G2 point - `P1Generator() *P1` - Get G1 generator point - `P2Generator() *P2` - Get G2 generator point - `Uniq(msgs [][]byte) bool` - Check messages for uniqueness - `SetMaxProcs(procs int)` - Set maximum number of threads for parallel operations ================================================ FILE: bindings/go/blst.go ================================================ // !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! // DO NOT MODIFY THIS FILE!!
// The file is generated from *.tgo by generate.py // !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! /* * Copyright Supranational LLC * Licensed under the Apache License, Version 2.0, see LICENSE for details. * SPDX-License-Identifier: Apache-2.0 */ package blst // #cgo CFLAGS: -I${SRCDIR}/.. -I${SRCDIR}/../../build -I${SRCDIR}/../../src -D__BLST_CGO__ -fno-builtin-memcpy -fno-builtin-memset // #cgo amd64 CFLAGS: -D__ADX__ -mno-avx // // no-asm 64-bit platforms from https://go.dev/doc/install/source // #cgo loong64 mips64 mips64le ppc64 ppc64le riscv64 s390x CFLAGS: -D__BLST_NO_ASM__ // // #include "blst.h" // // #if defined(__x86_64__) && (defined(__unix__) || defined(__APPLE__)) // # include <signal.h> // # include <unistd.h> // static void handler(int signum) // { ssize_t n = write(2, "Caught SIGILL in blst_cgo_init, " // "consult <blst>/bindings/go/README.md.\n", 70); // _exit(128+SIGILL); // (void)n; // } // __attribute__((constructor)) static void blst_cgo_init() // { blst_fp temp = { 0 }; // struct sigaction act = { handler }, oact; // sigaction(SIGILL, &act, &oact); // blst_fp_sqr(&temp, &temp); // sigaction(SIGILL, &oact, NULL); // } // #endif // // static void go_pairing_init(blst_pairing *new_ctx, bool hash_or_encode, // const byte *DST, size_t DST_len) // { if (DST != NULL) { // byte *dst = (byte*)new_ctx + blst_pairing_sizeof(); // for(size_t i = 0; i < DST_len; i++) dst[i] = DST[i]; // DST = dst; // } // blst_pairing_init(new_ctx, hash_or_encode, DST, DST_len); // } // static void go_pairing_as_fp12(blst_fp12 *pt, blst_pairing *ctx) // { *pt = *blst_pairing_as_fp12(ctx); } // // static void go_p1slice_to_affine(blst_p1_affine dst[], // const blst_p1 points[], size_t npoints) // { const blst_p1 *ppoints[2] = { points, NULL }; // blst_p1s_to_affine(dst, ppoints, npoints); // } // static void go_p1slice_add(blst_p1 *dst, const blst_p1_affine points[], // size_t npoints) // { const blst_p1_affine *ppoints[2] = { points, NULL }; // blst_p1s_add(dst, ppoints, npoints); // } // static void go_p2slice_to_affine(blst_p2_affine dst[], // const blst_p2 points[], size_t npoints) // { const blst_p2 *ppoints[2] = { points, NULL }; // blst_p2s_to_affine(dst, ppoints, npoints); // } // static void go_p2slice_add(blst_p2 *dst, const blst_p2_affine points[], // size_t npoints) // { const blst_p2_affine *ppoints[2] = { points, NULL }; // blst_p2s_add(dst, ppoints, npoints); // } // // static void go_p1_mult_n_acc(blst_p1 *acc, const blst_fp *x, bool affine, // const byte *scalar, size_t nbits) // { blst_p1 m[1]; // const void *p = x; // if (p == NULL) // p = blst_p1_generator(); // else if (affine) // blst_p1_from_affine(m, p), p = m; // blst_p1_mult(m, p, scalar, nbits); // blst_p1_add_or_double(acc, acc, m); // } // static void go_p2_mult_n_acc(blst_p2 *acc, const blst_fp2 *x, bool affine, // const byte *scalar, size_t nbits) // { blst_p2 m[1]; // const void *p = x; // if (p == NULL) // p = blst_p2_generator(); // else if (affine) // blst_p2_from_affine(m, p), p = m; // blst_p2_mult(m, p, scalar, nbits); // blst_p2_add_or_double(acc, acc, m); // } // // static void go_p1_sub_assign(blst_p1 *a, const blst_fp *x, bool affine) // { blst_p1 minus_b; // if (affine) // blst_p1_from_affine(&minus_b, (const blst_p1_affine*)x); // else // minus_b = *(const blst_p1*)x; // blst_p1_cneg(&minus_b, 1); // blst_p1_add_or_double(a, a, &minus_b); // } // // static void go_p2_sub_assign(blst_p2 *a, const blst_fp2 *x, bool affine) // { blst_p2 minus_b; // if (affine) // blst_p2_from_affine(&minus_b, (const blst_p2_affine*)x); // else // 
minus_b = *(const blst_p2*)x; // blst_p2_cneg(&minus_b, 1); // blst_p2_add_or_double(a, a, &minus_b); // } // // static bool go_scalar_from_bendian(blst_scalar *ret, const byte *in) // { blst_scalar_from_bendian(ret, in); // return blst_sk_check(ret); // } // static bool go_hash_to_scalar(blst_scalar *ret, // const byte *msg, size_t msg_len, // const byte *DST, size_t DST_len) // { byte elem[48]; // blst_expand_message_xmd(elem, sizeof(elem), msg, msg_len, DST, DST_len); // return blst_scalar_from_be_bytes(ret, elem, sizeof(elem)); // } // static void go_miller_loop_n(blst_fp12 *dst, const blst_p2_affine Q[], // const blst_p1_affine P[], // size_t npoints, bool acc) // { const blst_p2_affine *Qs[2] = { Q, NULL }; // const blst_p1_affine *Ps[2] = { P, NULL }; // if (acc) { // blst_fp12 tmp; // blst_miller_loop_n(&tmp, Qs, Ps, npoints); // blst_fp12_mul(dst, dst, &tmp); // } else { // blst_miller_loop_n(dst, Qs, Ps, npoints); // } // } // static void go_fp12slice_mul(blst_fp12 *dst, const blst_fp12 in[], size_t n) // { size_t i; // blst_fp12_mul(dst, &in[0], &in[1]); // for (i = 2; i < n; i++) // blst_fp12_mul(dst, dst, &in[i]); // } // static bool go_p1_affine_validate(const blst_p1_affine *p, bool infcheck) // { if (infcheck && blst_p1_affine_is_inf(p)) // return 0; // return blst_p1_affine_in_g1(p); // } // static bool go_p2_affine_validate(const blst_p2_affine *p, bool infcheck) // { if (infcheck && blst_p2_affine_is_inf(p)) // return 0; // return blst_p2_affine_in_g2(p); // } import "C" import ( "fmt" "math/bits" "runtime" "sync" "sync/atomic" "unsafe" ) const BLST_SCALAR_BYTES = 256 / 8 const BLST_FP_BYTES = 384 / 8 const BLST_P1_COMPRESS_BYTES = BLST_FP_BYTES const BLST_P1_SERIALIZE_BYTES = BLST_FP_BYTES * 2 const BLST_P2_COMPRESS_BYTES = BLST_FP_BYTES * 2 const BLST_P2_SERIALIZE_BYTES = BLST_FP_BYTES * 4 type Scalar struct{ cgo C.blst_scalar } type Fp struct{ cgo C.blst_fp } type Fp2 struct{ cgo C.blst_fp2 } type Fp6 = C.blst_fp6 type Fp12 struct{ cgo C.blst_fp12 } type P1 struct{ cgo C.blst_p1 } type P2 struct{ cgo C.blst_p2 } type P1Affine struct{ cgo C.blst_p1_affine } type P2Affine struct{ cgo C.blst_p2_affine } type Message = []byte type Pairing = []C.blst_pairing type SecretKey = Scalar type P1s []P1 type P2s []P2 type P1Affines []P1Affine type P2Affines []P2Affine // // Configuration // var maxProcs = initMaxProcs() func initMaxProcs() int { maxProcs := runtime.GOMAXPROCS(0) var version float32 _, err := fmt.Sscanf(runtime.Version(), "go%f", &version) if err != nil || version < 1.14 { // be cooperative and leave one processor for the application maxProcs -= 1 } if maxProcs <= 0 { maxProcs = 1 } return maxProcs } func SetMaxProcs(procs int) { if procs <= 0 { procs = 1 } maxProcs = procs } func numThreads(maxThreads int) int { numThreads := maxProcs // take into consideration the possibility that application reduced // GOMAXPROCS after |maxProcs| was initialized numProcs := runtime.GOMAXPROCS(0) if maxProcs > numProcs { numThreads = numProcs } if maxThreads > 0 && numThreads > maxThreads { return maxThreads } return numThreads } var cgo_pairingSizeOf = C.blst_pairing_sizeof() var cgo_p1Generator = P1{*C.blst_p1_generator()} var cgo_p2Generator = P2{*C.blst_p2_generator()} var cgo_fp12One = Fp12{*C.blst_fp12_one()} // Secret key func (sk *SecretKey) Zeroize() { var zero SecretKey *sk = zero } func KeyGen(ikm []byte, optional ...[]byte) *SecretKey { var sk SecretKey var info []byte if len(optional) > 0 { info = optional[0] } if len(ikm) < 32 { return nil } C.blst_keygen(&sk.cgo, 
(*C.byte)(&ikm[0]), C.size_t(len(ikm)), ptrOrNil(info), C.size_t(len(info))) // Postponing secret key zeroing till garbage collection can be too // late to be effective, but every little bit helps... runtime.SetFinalizer(&sk, func(sk *SecretKey) { sk.Zeroize() }) return &sk } func KeyGenV3(ikm []byte, optional ...[]byte) *SecretKey { if len(ikm) < 32 { return nil } var sk SecretKey var info []byte if len(optional) > 0 { info = optional[0] } C.blst_keygen_v3(&sk.cgo, (*C.byte)(&ikm[0]), C.size_t(len(ikm)), ptrOrNil(info), C.size_t(len(info))) // Postponing secret key zeroing till garbage collection can be too // late to be effective, but every little bit helps... runtime.SetFinalizer(&sk, func(sk *SecretKey) { sk.Zeroize() }) return &sk } func KeyGenV45(ikm []byte, salt []byte, optional ...[]byte) *SecretKey { if len(ikm) < 32 { return nil } var sk SecretKey var info []byte if len(optional) > 0 { info = optional[0] } C.blst_keygen_v4_5(&sk.cgo, (*C.byte)(&ikm[0]), C.size_t(len(ikm)), (*C.byte)(&salt[0]), C.size_t(len(salt)), ptrOrNil(info), C.size_t(len(info))) // Postponing secret key zeroing till garbage collection can be too // late to be effective, but every little bit helps... runtime.SetFinalizer(&sk, func(sk *SecretKey) { sk.Zeroize() }) return &sk } func KeyGenV5(ikm []byte, salt []byte, optional ...[]byte) *SecretKey { if len(ikm) < 32 { return nil } var sk SecretKey var info []byte if len(optional) > 0 { info = optional[0] } saltLen := len(salt) if saltLen == 0 { salt = []byte{0} } C.blst_keygen_v5(&sk.cgo, (*C.byte)(&ikm[0]), C.size_t(len(ikm)), (*C.byte)(&salt[0]), C.size_t(saltLen), ptrOrNil(info), C.size_t(len(info))) // Postponing secret key zeroing till garbage collection can be too // late to be effective, but every little bit helps... runtime.SetFinalizer(&sk, func(sk *SecretKey) { sk.Zeroize() }) return &sk } func DeriveMasterEip2333(ikm []byte) *SecretKey { if len(ikm) < 32 { return nil } var sk SecretKey C.blst_derive_master_eip2333(&sk.cgo, (*C.byte)(&ikm[0]), C.size_t(len(ikm))) // Postponing secret key zeroing till garbage collection can be too // late to be effective, but every little bit helps... runtime.SetFinalizer(&sk, func(sk *SecretKey) { sk.Zeroize() }) return &sk } func (master *SecretKey) DeriveChildEip2333(child_index uint32) *SecretKey { var sk SecretKey C.blst_derive_child_eip2333(&sk.cgo, &master.cgo, C.uint(child_index)) // Postponing secret key zeroing till garbage collection can be too // late to be effective, but every little bit helps... 
runtime.SetFinalizer(&sk, func(sk *SecretKey) { sk.Zeroize() }) return &sk } // Pairing func pairingSizeOf(DST_len C.size_t) int { return int((cgo_pairingSizeOf + DST_len + 7) / 8) } func PairingCtx(hash_or_encode bool, DST []byte) Pairing { DST_len := C.size_t(len(DST)) ctx := make([]C.blst_pairing, pairingSizeOf(DST_len)) C.go_pairing_init(&ctx[0], C.bool(hash_or_encode), ptrOrNil(DST), DST_len) return ctx } func PairingCommit(ctx Pairing) { C.blst_pairing_commit(&ctx[0]) } func PairingMerge(ctx Pairing, ctx1 Pairing) int { r := C.blst_pairing_merge(&ctx[0], &ctx1[0]) return int(r) } func PairingFinalVerify(ctx Pairing, optional ...*Fp12) bool { var gtsig *Fp12 if len(optional) > 0 { gtsig = optional[0] } return bool(C.blst_pairing_finalverify(&ctx[0], gtsig.asPtr())) } func PairingRawAggregate(ctx Pairing, q *P2Affine, p *P1Affine) { C.blst_pairing_raw_aggregate(&ctx[0], &q.cgo, &p.cgo) } func PairingAsFp12(ctx Pairing) *Fp12 { var pt Fp12 C.go_pairing_as_fp12(&pt.cgo, &ctx[0]) return &pt } func Fp12One() Fp12 { return cgo_fp12One } func Fp12FinalVerify(pt1 *Fp12, pt2 *Fp12) bool { return bool(C.blst_fp12_finalverify(&pt1.cgo, &pt2.cgo)) } func Fp12MillerLoop(q *P2Affine, p *P1Affine) *Fp12 { var pt Fp12 C.blst_miller_loop(&pt.cgo, &q.cgo, &p.cgo) return &pt } func Fp12MillerLoopN(qs []P2Affine, ps []P1Affine) *Fp12 { if len(qs) != len(ps) || len(qs) == 0 { panic("inputs' lengths mismatch") } nElems := uint32(len(qs)) nThreads := uint32(maxProcs) if nThreads == 1 || nElems == 1 { var pt Fp12 C.go_miller_loop_n(&pt.cgo, &qs[0].cgo, &ps[0].cgo, C.size_t(nElems), false) return &pt } stride := (nElems + nThreads - 1) / nThreads if stride > 16 { stride = 16 } strides := (nElems + stride - 1) / stride if nThreads > strides { nThreads = strides } msgsCh := make(chan Fp12, nThreads) curElem := uint32(0) for tid := uint32(0); tid < nThreads; tid++ { go func() { acc := Fp12One() first := true for { work := atomic.AddUint32(&curElem, stride) - stride if work >= nElems { break } n := nElems - work if n > stride { n = stride } C.go_miller_loop_n(&acc.cgo, &qs[work].cgo, &ps[work].cgo, C.size_t(n), C.bool(!first)) first = false } msgsCh <- acc }() } var ret = make([]Fp12, nThreads) for i := range ret { ret[i] = <-msgsCh } var pt Fp12 C.go_fp12slice_mul(&pt.cgo, &ret[0].cgo, C.size_t(nThreads)) return &pt } func (pt *Fp12) MulAssign(p *Fp12) { C.blst_fp12_mul(&pt.cgo, &pt.cgo, &p.cgo) } func (pt *Fp12) FinalExp() { C.blst_final_exp(&pt.cgo, &pt.cgo) } func (pt *Fp12) InGroup() bool { return bool(C.blst_fp12_in_group(&pt.cgo)) } func (pt *Fp12) ToBendian() []byte { var out [BLST_FP_BYTES * 12]byte C.blst_bendian_from_fp12((*C.byte)(&out[0]), &pt.cgo) return out[:] } func (pt1 *Fp12) Equals(pt2 *Fp12) bool { return *pt1 == *pt2 } func (pt *Fp12) asPtr() *C.blst_fp12 { if pt != nil { return &pt.cgo } return nil } func ptrOrNil(bytes []byte) *C.byte { var ptr *C.byte if len(bytes) > 0 { ptr = (*C.byte)(&bytes[0]) } return ptr } // // MIN-PK // // // PublicKey // func (pk *P1Affine) From(s *Scalar) *P1Affine { C.blst_sk_to_pk2_in_g1(nil, &pk.cgo, &s.cgo) return pk } func (pk *P1Affine) KeyValidate() bool { return bool(C.go_p1_affine_validate(&pk.cgo, true)) } // sigInfcheck, check for infinity, is a way to avoid going // into resource-consuming verification. Passing 'false' is // always cryptographically safe, but application might want // to guard against obviously bogus individual[!] signatures. 
func (sig *P2Affine) SigValidate(sigInfcheck bool) bool { return bool(C.go_p2_affine_validate(&sig.cgo, C.bool(sigInfcheck))) } // // Sign // func (sig *P2Affine) Sign(sk *SecretKey, msg []byte, dst []byte, optional ...interface{}) *P2Affine { augSingle, aug, useHash, ok := parseOpts(optional...) if !ok || len(aug) != 0 { return nil } var q *P2 if useHash { q = HashToG2(msg, dst, augSingle) } else { q = EncodeToG2(msg, dst, augSingle) } C.blst_sign_pk2_in_g1(nil, &sig.cgo, &q.cgo, &sk.cgo) return sig } // // Signature // // Functions to return a signature and public key+augmentation tuple. // This enables point decompression (if needed) to happen in parallel. type sigGetterP2 func() *P2Affine type pkGetterP1 func(i uint32, temp *P1Affine) (*P1Affine, []byte) // Single verify with decompressed pk func (sig *P2Affine) Verify(sigGroupcheck bool, pk *P1Affine, pkValidate bool, msg Message, dst []byte, optional ...interface{}) bool { // useHash bool, aug []byte aug, _, useHash, ok := parseOpts(optional...) if !ok { return false } return sig.AggregateVerify(sigGroupcheck, []*P1Affine{pk}, pkValidate, []Message{msg}, dst, useHash, [][]byte{aug}) } // Single verify with compressed pk // Uses a dummy signature to get the correct type func (dummy *P2Affine) VerifyCompressed(sig []byte, sigGroupcheck bool, pk []byte, pkValidate bool, msg Message, dst []byte, optional ...bool) bool { // useHash bool, usePksAsAugs bool return dummy.AggregateVerifyCompressed(sig, sigGroupcheck, [][]byte{pk}, pkValidate, []Message{msg}, dst, optional...) } // Aggregate verify with uncompressed signature and public keys // Note that checking message uniqueness, if required, is left to the user. // Not all signature schemes require it and this keeps the binding minimal // and fast. Refer to the Uniq function for one method of performing // this check. func (sig *P2Affine) AggregateVerify(sigGroupcheck bool, pks []*P1Affine, pksVerify bool, msgs []Message, dst []byte, optional ...interface{}) bool { // useHash bool, augs [][]byte // sanity checks and argument parsing n := len(pks) if n == 0 || len(msgs) != n { return false } _, augs, useHash, ok := parseOpts(optional...) 
useAugs := len(augs) != 0 if !ok || (useAugs && len(augs) != n) { return false } sigFn := func() *P2Affine { return sig } pkFn := func(i uint32, _ *P1Affine) (*P1Affine, []byte) { if useAugs { return pks[i], augs[i] } return pks[i], nil } return coreAggregateVerifyPkInG1(sigFn, sigGroupcheck, pkFn, pksVerify, msgs, dst, useHash) } // Aggregate verify with compressed signature and public keys // Uses a dummy signature to get the correct type func (*P2Affine) AggregateVerifyCompressed(sig []byte, sigGroupcheck bool, pks [][]byte, pksVerify bool, msgs []Message, dst []byte, optional ...bool) bool { // useHash bool, usePksAsAugs bool // sanity checks and argument parsing if len(pks) != len(msgs) { return false } useHash := true if len(optional) > 0 { useHash = optional[0] } usePksAsAugs := false if len(optional) > 1 { usePksAsAugs = optional[1] } sigFn := func() *P2Affine { sigP := new(P2Affine) if sigP.Uncompress(sig) == nil { return nil } return sigP } pkFn := func(i uint32, pk *P1Affine) (*P1Affine, []byte) { bytes := pks[i] if len(bytes) == BLST_P1_SERIALIZE_BYTES && (bytes[0]&0x80) == 0 { // Not compressed if pk.Deserialize(bytes) == nil { return nil, nil } } else if len(bytes) == BLST_P1_COMPRESS_BYTES && (bytes[0]&0x80) != 0 { if pk.Uncompress(bytes) == nil { return nil, nil } } else { return nil, nil } if usePksAsAugs { return pk, bytes } return pk, nil } return coreAggregateVerifyPkInG1(sigFn, sigGroupcheck, pkFn, pksVerify, msgs, dst, useHash) } func coreAggregateVerifyPkInG1(sigFn sigGetterP2, sigGroupcheck bool, pkFn pkGetterP1, pkValidate bool, msgs []Message, dst []byte, optional ...bool) bool { // useHash n := len(msgs) if n == 0 { return false } useHash := true if len(optional) > 0 { useHash = optional[0] } numCores := runtime.GOMAXPROCS(0) numThreads := numThreads(n) // Each thread will determine next message to process by atomically // incrementing curItem, process corresponding pk,msg[,aug] tuple and // repeat until n is exceeded. The resulting accumulations will be // fed into the msgsCh channel. msgsCh := make(chan Pairing, numThreads) valid := int32(1) curItem := uint32(0) mutex := sync.Mutex{} mutex.Lock() for tid := 0; tid < numThreads; tid++ { go func() { pairing := PairingCtx(useHash, dst) var temp P1Affine for atomic.LoadInt32(&valid) > 0 { // Get a work item work := atomic.AddUint32(&curItem, 1) - 1 if work >= uint32(n) { break } else if work == 0 && maxProcs == numCores-1 && numThreads == maxProcs { // Avoid consuming all cores by waiting until the // main thread has completed its miller loop before // proceeding. 
mutex.Lock() mutex.Unlock() //nolint:staticcheck } // Pull Public Key and augmentation blob curPk, aug := pkFn(work, &temp) if curPk == nil { atomic.StoreInt32(&valid, 0) break } // Pairing and accumulate ret := PairingAggregatePkInG1(pairing, curPk, pkValidate, nil, false, msgs[work], aug) if ret != C.BLST_SUCCESS { atomic.StoreInt32(&valid, 0) break } // application might have some async work to do runtime.Gosched() } if atomic.LoadInt32(&valid) > 0 { PairingCommit(pairing) msgsCh <- pairing } else { msgsCh <- nil } }() } // Uncompress and check signature var gtsig Fp12 sig := sigFn() if sig == nil { atomic.StoreInt32(&valid, 0) } if atomic.LoadInt32(&valid) > 0 && sigGroupcheck && !sig.SigValidate(false) { atomic.StoreInt32(&valid, 0) } if atomic.LoadInt32(&valid) > 0 { C.blst_aggregated_in_g2(&gtsig.cgo, &sig.cgo) } mutex.Unlock() // Accumulate the thread results var pairings Pairing for i := 0; i < numThreads; i++ { msg := <-msgsCh if msg != nil { if pairings == nil { pairings = msg } else { ret := PairingMerge(pairings, msg) if ret != C.BLST_SUCCESS { atomic.StoreInt32(&valid, 0) } } } } if atomic.LoadInt32(&valid) == 0 || pairings == nil { return false } return PairingFinalVerify(pairings, &gtsig) } func CoreVerifyPkInG1(pk *P1Affine, sig *P2Affine, hash_or_encode bool, msg Message, dst []byte, optional ...[]byte) int { var aug []byte if len(optional) > 0 { aug = optional[0] } if runtime.NumGoroutine() < maxProcs { sigFn := func() *P2Affine { return sig } pkFn := func(_ uint32, _ *P1Affine) (*P1Affine, []byte) { return pk, aug } if !coreAggregateVerifyPkInG1(sigFn, true, pkFn, true, []Message{msg}, dst, hash_or_encode) { return C.BLST_VERIFY_FAIL } return C.BLST_SUCCESS } return int(C.blst_core_verify_pk_in_g1(&pk.cgo, &sig.cgo, C.bool(hash_or_encode), ptrOrNil(msg), C.size_t(len(msg)), ptrOrNil(dst), C.size_t(len(dst)), ptrOrNil(aug), C.size_t(len(aug)))) } // pks are assumed to be verified for proof of possession, // which implies that they are already group-checked func (sig *P2Affine) FastAggregateVerify(sigGroupcheck bool, pks []*P1Affine, msg Message, dst []byte, optional ...interface{}) bool { // pass-through to Verify n := len(pks) // TODO: return value for length zero? if n == 0 { return false } aggregator := new(P1Aggregate) if !aggregator.Aggregate(pks, false) { return false } pkAff := aggregator.ToAffine() // Verify return sig.Verify(sigGroupcheck, pkAff, false, msg, dst, optional...) } func (*P2Affine) MultipleAggregateVerify(sigs []*P2Affine, sigsGroupcheck bool, pks []*P1Affine, pksVerify bool, msgs []Message, dst []byte, randFn func(*Scalar), randBits int, optional ...interface{}) bool { // useHash // Sanity checks and argument parsing n := len(pks) if n == 0 || len(msgs) != n || len(sigs) != n { return false } _, augs, useHash, ok := parseOpts(optional...) 
useAugs := len(augs) != 0 if !ok || (useAugs && len(augs) != n) { return false } paramsFn := func(work uint32, _ *P2Affine, _ *P1Affine, rand *Scalar) ( *P2Affine, *P1Affine, *Scalar, []byte) { randFn(rand) var aug []byte if useAugs { aug = augs[work] } return sigs[work], pks[work], rand, aug } return multipleAggregateVerifyPkInG1(paramsFn, sigsGroupcheck, pksVerify, msgs, dst, randBits, useHash) } type mulAggGetterPkInG1 func(work uint32, sig *P2Affine, pk *P1Affine, rand *Scalar) (*P2Affine, *P1Affine, *Scalar, []byte) func multipleAggregateVerifyPkInG1(paramsFn mulAggGetterPkInG1, sigsGroupcheck bool, pksVerify bool, msgs []Message, dst []byte, randBits int, optional ...bool) bool { // useHash n := len(msgs) if n == 0 { return false } useHash := true if len(optional) > 0 { useHash = optional[0] } numThreads := numThreads(n) // Each thread will determine next message to process by atomically // incrementing curItem, process corresponding pk,msg[,aug] tuple and // repeat until n is exceeded. The resulting accumulations will be // fed into the msgsCh channel. msgsCh := make(chan Pairing, numThreads) valid := int32(1) curItem := uint32(0) for tid := 0; tid < numThreads; tid++ { go func() { pairing := PairingCtx(useHash, dst) var tempRand Scalar var tempPk P1Affine var tempSig P2Affine for atomic.LoadInt32(&valid) > 0 { // Get a work item work := atomic.AddUint32(&curItem, 1) - 1 if work >= uint32(n) { break } curSig, curPk, curRand, aug := paramsFn(work, &tempSig, &tempPk, &tempRand) if PairingMulNAggregatePkInG1(pairing, curPk, pksVerify, curSig, sigsGroupcheck, curRand, randBits, msgs[work], aug) != C.BLST_SUCCESS { atomic.StoreInt32(&valid, 0) break } // application might have some async work to do runtime.Gosched() } if atomic.LoadInt32(&valid) > 0 { PairingCommit(pairing) msgsCh <- pairing } else { msgsCh <- nil } }() } // Accumulate the thread results var pairings Pairing for i := 0; i < numThreads; i++ { msg := <-msgsCh if msg != nil { if pairings == nil { pairings = msg } else { ret := PairingMerge(pairings, msg) if ret != C.BLST_SUCCESS { atomic.StoreInt32(&valid, 0) } } } } if atomic.LoadInt32(&valid) == 0 || pairings == nil { return false } return PairingFinalVerify(pairings, nil) } // // Aggregate P2 // type aggGetterP2 func(i uint32, temp *P2Affine) *P2Affine type P2Aggregate struct { v *P2 } // Aggregate uncompressed elements func (agg *P2Aggregate) Aggregate(elmts []*P2Affine, groupcheck bool) bool { if len(elmts) == 0 { return true } getter := func(i uint32, _ *P2Affine) *P2Affine { return elmts[i] } return agg.coreAggregate(getter, groupcheck, len(elmts)) } func (agg *P2Aggregate) AggregateWithRandomness(pointsIf interface{}, scalarsIf interface{}, nbits int, groupcheck bool) bool { if groupcheck && !P2AffinesValidate(pointsIf) { return false } agg.v = P2AffinesMult(pointsIf, scalarsIf, nbits) return true } // Aggregate compressed elements func (agg *P2Aggregate) AggregateCompressed(elmts [][]byte, groupcheck bool) bool { if len(elmts) == 0 { return true } getter := func(i uint32, p *P2Affine) *P2Affine { bytes := elmts[i] if p.Uncompress(bytes) == nil { return nil } return p } return agg.coreAggregate(getter, groupcheck, len(elmts)) } func (agg *P2Aggregate) AddAggregate(other *P2Aggregate) { if other.v == nil { // do nothing } else if agg.v == nil { agg.v = other.v } else { C.blst_p2_add_or_double(&agg.v.cgo, &agg.v.cgo, &other.v.cgo) } } func (agg *P2Aggregate) Add(elmt *P2Affine, groupcheck bool) bool { if groupcheck && !bool(C.blst_p2_affine_in_g2(&elmt.cgo)) { return 
false } if agg.v == nil { agg.v = new(P2) C.blst_p2_from_affine(&agg.v.cgo, &elmt.cgo) } else { C.blst_p2_add_or_double_affine(&agg.v.cgo, &agg.v.cgo, &elmt.cgo) } return true } func (agg *P2Aggregate) ToAffine() *P2Affine { if agg.v == nil { return new(P2Affine) } return agg.v.ToAffine() } func (agg *P2Aggregate) coreAggregate(getter aggGetterP2, groupcheck bool, n int) bool { if n == 0 { return true } // operations are considered short enough for not to care about // keeping one core free... numThreads := runtime.GOMAXPROCS(0) if numThreads > n { numThreads = n } valid := int32(1) type result struct { agg *P2 empty bool } msgs := make(chan result, numThreads) curItem := uint32(0) for tid := 0; tid < numThreads; tid++ { go func() { first := true var agg P2 var temp P2Affine for atomic.LoadInt32(&valid) > 0 { // Get a work item work := atomic.AddUint32(&curItem, 1) - 1 if work >= uint32(n) { break } // Signature validate curElmt := getter(work, &temp) if curElmt == nil { atomic.StoreInt32(&valid, 0) break } if groupcheck && !bool(C.blst_p2_affine_in_g2(&curElmt.cgo)) { atomic.StoreInt32(&valid, 0) break } if first { C.blst_p2_from_affine(&agg.cgo, &curElmt.cgo) first = false } else { C.blst_p2_add_or_double_affine(&agg.cgo, &agg.cgo, &curElmt.cgo) } // application might have some async work to do runtime.Gosched() } if first { msgs <- result{nil, true} } else if atomic.LoadInt32(&valid) > 0 { msgs <- result{&agg, false} } else { msgs <- result{nil, false} } }() } // Accumulate the thread results first := agg.v == nil validLocal := true for i := 0; i < numThreads; i++ { msg := <-msgs if !validLocal || msg.empty { // do nothing } else if msg.agg == nil { validLocal = false // This should be unnecessary but seems safer atomic.StoreInt32(&valid, 0) } else { if first { agg.v = msg.agg first = false } else { C.blst_p2_add_or_double(&agg.v.cgo, &agg.v.cgo, &msg.agg.cgo) } } } if atomic.LoadInt32(&valid) == 0 { agg.v = nil return false } return true } // // MIN-SIG // // // PublicKey // func (pk *P2Affine) From(s *Scalar) *P2Affine { C.blst_sk_to_pk2_in_g2(nil, &pk.cgo, &s.cgo) return pk } func (pk *P2Affine) KeyValidate() bool { return bool(C.go_p2_affine_validate(&pk.cgo, true)) } // sigInfcheck, check for infinity, is a way to avoid going // into resource-consuming verification. Passing 'false' is // always cryptographically safe, but application might want // to guard against obviously bogus individual[!] signatures. func (sig *P1Affine) SigValidate(sigInfcheck bool) bool { return bool(C.go_p1_affine_validate(&sig.cgo, C.bool(sigInfcheck))) } // // Sign // func (sig *P1Affine) Sign(sk *SecretKey, msg []byte, dst []byte, optional ...interface{}) *P1Affine { augSingle, aug, useHash, ok := parseOpts(optional...) if !ok || len(aug) != 0 { return nil } var q *P1 if useHash { q = HashToG1(msg, dst, augSingle) } else { q = EncodeToG1(msg, dst, augSingle) } C.blst_sign_pk2_in_g2(nil, &sig.cgo, &q.cgo, &sk.cgo) return sig } // // Signature // // Functions to return a signature and public key+augmentation tuple. // This enables point decompression (if needed) to happen in parallel. type sigGetterP1 func() *P1Affine type pkGetterP2 func(i uint32, temp *P2Affine) (*P2Affine, []byte) // Single verify with decompressed pk func (sig *P1Affine) Verify(sigGroupcheck bool, pk *P2Affine, pkValidate bool, msg Message, dst []byte, optional ...interface{}) bool { // useHash bool, aug []byte aug, _, useHash, ok := parseOpts(optional...) 
if !ok { return false } return sig.AggregateVerify(sigGroupcheck, []*P2Affine{pk}, pkValidate, []Message{msg}, dst, useHash, [][]byte{aug}) } // Single verify with compressed pk // Uses a dummy signature to get the correct type func (dummy *P1Affine) VerifyCompressed(sig []byte, sigGroupcheck bool, pk []byte, pkValidate bool, msg Message, dst []byte, optional ...bool) bool { // useHash bool, usePksAsAugs bool return dummy.AggregateVerifyCompressed(sig, sigGroupcheck, [][]byte{pk}, pkValidate, []Message{msg}, dst, optional...) } // Aggregate verify with uncompressed signature and public keys // Note that checking message uniqueness, if required, is left to the user. // Not all signature schemes require it and this keeps the binding minimal // and fast. Refer to the Uniq function for one method of performing // this check. func (sig *P1Affine) AggregateVerify(sigGroupcheck bool, pks []*P2Affine, pksVerify bool, msgs []Message, dst []byte, optional ...interface{}) bool { // useHash bool, augs [][]byte // sanity checks and argument parsing n := len(pks) if n == 0 || len(msgs) != n { return false } _, augs, useHash, ok := parseOpts(optional...) useAugs := len(augs) != 0 if !ok || (useAugs && len(augs) != n) { return false } sigFn := func() *P1Affine { return sig } pkFn := func(i uint32, _ *P2Affine) (*P2Affine, []byte) { if useAugs { return pks[i], augs[i] } return pks[i], nil } return coreAggregateVerifyPkInG2(sigFn, sigGroupcheck, pkFn, pksVerify, msgs, dst, useHash) } // Aggregate verify with compressed signature and public keys // Uses a dummy signature to get the correct type func (*P1Affine) AggregateVerifyCompressed(sig []byte, sigGroupcheck bool, pks [][]byte, pksVerify bool, msgs []Message, dst []byte, optional ...bool) bool { // useHash bool, usePksAsAugs bool // sanity checks and argument parsing if len(pks) != len(msgs) { return false } useHash := true if len(optional) > 0 { useHash = optional[0] } usePksAsAugs := false if len(optional) > 1 { usePksAsAugs = optional[1] } sigFn := func() *P1Affine { sigP := new(P1Affine) if sigP.Uncompress(sig) == nil { return nil } return sigP } pkFn := func(i uint32, pk *P2Affine) (*P2Affine, []byte) { bytes := pks[i] if len(bytes) == BLST_P2_SERIALIZE_BYTES && (bytes[0]&0x80) == 0 { // Not compressed if pk.Deserialize(bytes) == nil { return nil, nil } } else if len(bytes) == BLST_P2_COMPRESS_BYTES && (bytes[0]&0x80) != 0 { if pk.Uncompress(bytes) == nil { return nil, nil } } else { return nil, nil } if usePksAsAugs { return pk, bytes } return pk, nil } return coreAggregateVerifyPkInG2(sigFn, sigGroupcheck, pkFn, pksVerify, msgs, dst, useHash) } func coreAggregateVerifyPkInG2(sigFn sigGetterP1, sigGroupcheck bool, pkFn pkGetterP2, pkValidate bool, msgs []Message, dst []byte, optional ...bool) bool { // useHash n := len(msgs) if n == 0 { return false } useHash := true if len(optional) > 0 { useHash = optional[0] } numCores := runtime.GOMAXPROCS(0) numThreads := numThreads(n) // Each thread will determine next message to process by atomically // incrementing curItem, process corresponding pk,msg[,aug] tuple and // repeat until n is exceeded. The resulting accumulations will be // fed into the msgsCh channel. 
msgsCh := make(chan Pairing, numThreads) valid := int32(1) curItem := uint32(0) mutex := sync.Mutex{} mutex.Lock() for tid := 0; tid < numThreads; tid++ { go func() { pairing := PairingCtx(useHash, dst) var temp P2Affine for atomic.LoadInt32(&valid) > 0 { // Get a work item work := atomic.AddUint32(&curItem, 1) - 1 if work >= uint32(n) { break } else if work == 0 && maxProcs == numCores-1 && numThreads == maxProcs { // Avoid consuming all cores by waiting until the // main thread has completed its miller loop before // proceeding. mutex.Lock() mutex.Unlock() //nolint:staticcheck } // Pull Public Key and augmentation blob curPk, aug := pkFn(work, &temp) if curPk == nil { atomic.StoreInt32(&valid, 0) break } // Pairing and accumulate ret := PairingAggregatePkInG2(pairing, curPk, pkValidate, nil, false, msgs[work], aug) if ret != C.BLST_SUCCESS { atomic.StoreInt32(&valid, 0) break } // application might have some async work to do runtime.Gosched() } if atomic.LoadInt32(&valid) > 0 { PairingCommit(pairing) msgsCh <- pairing } else { msgsCh <- nil } }() } // Uncompress and check signature var gtsig Fp12 sig := sigFn() if sig == nil { atomic.StoreInt32(&valid, 0) } if atomic.LoadInt32(&valid) > 0 && sigGroupcheck && !sig.SigValidate(false) { atomic.StoreInt32(&valid, 0) } if atomic.LoadInt32(&valid) > 0 { C.blst_aggregated_in_g1(&gtsig.cgo, &sig.cgo) } mutex.Unlock() // Accumulate the thread results var pairings Pairing for i := 0; i < numThreads; i++ { msg := <-msgsCh if msg != nil { if pairings == nil { pairings = msg } else { ret := PairingMerge(pairings, msg) if ret != C.BLST_SUCCESS { atomic.StoreInt32(&valid, 0) } } } } if atomic.LoadInt32(&valid) == 0 || pairings == nil { return false } return PairingFinalVerify(pairings, &gtsig) } func CoreVerifyPkInG2(pk *P2Affine, sig *P1Affine, hash_or_encode bool, msg Message, dst []byte, optional ...[]byte) int { var aug []byte if len(optional) > 0 { aug = optional[0] } if runtime.NumGoroutine() < maxProcs { sigFn := func() *P1Affine { return sig } pkFn := func(_ uint32, _ *P2Affine) (*P2Affine, []byte) { return pk, aug } if !coreAggregateVerifyPkInG2(sigFn, true, pkFn, true, []Message{msg}, dst, hash_or_encode) { return C.BLST_VERIFY_FAIL } return C.BLST_SUCCESS } return int(C.blst_core_verify_pk_in_g2(&pk.cgo, &sig.cgo, C.bool(hash_or_encode), ptrOrNil(msg), C.size_t(len(msg)), ptrOrNil(dst), C.size_t(len(dst)), ptrOrNil(aug), C.size_t(len(aug)))) } // pks are assumed to be verified for proof of possession, // which implies that they are already group-checked func (sig *P1Affine) FastAggregateVerify(sigGroupcheck bool, pks []*P2Affine, msg Message, dst []byte, optional ...interface{}) bool { // pass-through to Verify n := len(pks) // TODO: return value for length zero? if n == 0 { return false } aggregator := new(P2Aggregate) if !aggregator.Aggregate(pks, false) { return false } pkAff := aggregator.ToAffine() // Verify return sig.Verify(sigGroupcheck, pkAff, false, msg, dst, optional...) } func (*P1Affine) MultipleAggregateVerify(sigs []*P1Affine, sigsGroupcheck bool, pks []*P2Affine, pksVerify bool, msgs []Message, dst []byte, randFn func(*Scalar), randBits int, optional ...interface{}) bool { // useHash // Sanity checks and argument parsing n := len(pks) if n == 0 || len(msgs) != n || len(sigs) != n { return false } _, augs, useHash, ok := parseOpts(optional...) 
useAugs := len(augs) != 0 if !ok || (useAugs && len(augs) != n) { return false } paramsFn := func(work uint32, _ *P1Affine, _ *P2Affine, rand *Scalar) ( *P1Affine, *P2Affine, *Scalar, []byte) { randFn(rand) var aug []byte if useAugs { aug = augs[work] } return sigs[work], pks[work], rand, aug } return multipleAggregateVerifyPkInG2(paramsFn, sigsGroupcheck, pksVerify, msgs, dst, randBits, useHash) } type mulAggGetterPkInG2 func(work uint32, sig *P1Affine, pk *P2Affine, rand *Scalar) (*P1Affine, *P2Affine, *Scalar, []byte) func multipleAggregateVerifyPkInG2(paramsFn mulAggGetterPkInG2, sigsGroupcheck bool, pksVerify bool, msgs []Message, dst []byte, randBits int, optional ...bool) bool { // useHash n := len(msgs) if n == 0 { return false } useHash := true if len(optional) > 0 { useHash = optional[0] } numThreads := numThreads(n) // Each thread will determine next message to process by atomically // incrementing curItem, process corresponding pk,msg[,aug] tuple and // repeat until n is exceeded. The resulting accumulations will be // fed into the msgsCh channel. msgsCh := make(chan Pairing, numThreads) valid := int32(1) curItem := uint32(0) for tid := 0; tid < numThreads; tid++ { go func() { pairing := PairingCtx(useHash, dst) var tempRand Scalar var tempPk P2Affine var tempSig P1Affine for atomic.LoadInt32(&valid) > 0 { // Get a work item work := atomic.AddUint32(&curItem, 1) - 1 if work >= uint32(n) { break } curSig, curPk, curRand, aug := paramsFn(work, &tempSig, &tempPk, &tempRand) if PairingMulNAggregatePkInG2(pairing, curPk, pksVerify, curSig, sigsGroupcheck, curRand, randBits, msgs[work], aug) != C.BLST_SUCCESS { atomic.StoreInt32(&valid, 0) break } // application might have some async work to do runtime.Gosched() } if atomic.LoadInt32(&valid) > 0 { PairingCommit(pairing) msgsCh <- pairing } else { msgsCh <- nil } }() } // Accumulate the thread results var pairings Pairing for i := 0; i < numThreads; i++ { msg := <-msgsCh if msg != nil { if pairings == nil { pairings = msg } else { ret := PairingMerge(pairings, msg) if ret != C.BLST_SUCCESS { atomic.StoreInt32(&valid, 0) } } } } if atomic.LoadInt32(&valid) == 0 || pairings == nil { return false } return PairingFinalVerify(pairings, nil) } // // Aggregate P1 // type aggGetterP1 func(i uint32, temp *P1Affine) *P1Affine type P1Aggregate struct { v *P1 } // Aggregate uncompressed elements func (agg *P1Aggregate) Aggregate(elmts []*P1Affine, groupcheck bool) bool { if len(elmts) == 0 { return true } getter := func(i uint32, _ *P1Affine) *P1Affine { return elmts[i] } return agg.coreAggregate(getter, groupcheck, len(elmts)) } func (agg *P1Aggregate) AggregateWithRandomness(pointsIf interface{}, scalarsIf interface{}, nbits int, groupcheck bool) bool { if groupcheck && !P1AffinesValidate(pointsIf) { return false } agg.v = P1AffinesMult(pointsIf, scalarsIf, nbits) return true } // Aggregate compressed elements func (agg *P1Aggregate) AggregateCompressed(elmts [][]byte, groupcheck bool) bool { if len(elmts) == 0 { return true } getter := func(i uint32, p *P1Affine) *P1Affine { bytes := elmts[i] if p.Uncompress(bytes) == nil { return nil } return p } return agg.coreAggregate(getter, groupcheck, len(elmts)) } func (agg *P1Aggregate) AddAggregate(other *P1Aggregate) { if other.v == nil { // do nothing } else if agg.v == nil { agg.v = other.v } else { C.blst_p1_add_or_double(&agg.v.cgo, &agg.v.cgo, &other.v.cgo) } } func (agg *P1Aggregate) Add(elmt *P1Affine, groupcheck bool) bool { if groupcheck && !bool(C.blst_p1_affine_in_g1(&elmt.cgo)) { return 
false } if agg.v == nil { agg.v = new(P1) C.blst_p1_from_affine(&agg.v.cgo, &elmt.cgo) } else { C.blst_p1_add_or_double_affine(&agg.v.cgo, &agg.v.cgo, &elmt.cgo) } return true } func (agg *P1Aggregate) ToAffine() *P1Affine { if agg.v == nil { return new(P1Affine) } return agg.v.ToAffine() } func (agg *P1Aggregate) coreAggregate(getter aggGetterP1, groupcheck bool, n int) bool { if n == 0 { return true } // operations are considered short enough not to care about // keeping one core free... numThreads := runtime.GOMAXPROCS(0) if numThreads > n { numThreads = n } valid := int32(1) type result struct { agg *P1 empty bool } msgs := make(chan result, numThreads) curItem := uint32(0) for tid := 0; tid < numThreads; tid++ { go func() { first := true var agg P1 var temp P1Affine for atomic.LoadInt32(&valid) > 0 { // Get a work item work := atomic.AddUint32(&curItem, 1) - 1 if work >= uint32(n) { break } // Signature validate curElmt := getter(work, &temp) if curElmt == nil { atomic.StoreInt32(&valid, 0) break } if groupcheck && !bool(C.blst_p1_affine_in_g1(&curElmt.cgo)) { atomic.StoreInt32(&valid, 0) break } if first { C.blst_p1_from_affine(&agg.cgo, &curElmt.cgo) first = false } else { C.blst_p1_add_or_double_affine(&agg.cgo, &agg.cgo, &curElmt.cgo) } // application might have some async work to do runtime.Gosched() } if first { msgs <- result{nil, true} } else if atomic.LoadInt32(&valid) > 0 { msgs <- result{&agg, false} } else { msgs <- result{nil, false} } }() } // Accumulate the thread results first := agg.v == nil validLocal := true for i := 0; i < numThreads; i++ { msg := <-msgs if !validLocal || msg.empty { // do nothing } else if msg.agg == nil { validLocal = false // This should be unnecessary but seems safer atomic.StoreInt32(&valid, 0) } else { if first { agg.v = msg.agg first = false } else { C.blst_p1_add_or_double(&agg.v.cgo, &agg.v.cgo, &msg.agg.cgo) } } } if atomic.LoadInt32(&valid) == 0 { agg.v = nil return false } return true } func PairingAggregatePkInG1(ctx Pairing, PK *P1Affine, pkValidate bool, sig *P2Affine, sigGroupcheck bool, msg []byte, optional ...[]byte) int { // aug var aug []byte if len(optional) > 0 { aug = optional[0] } r := C.blst_pairing_chk_n_aggr_pk_in_g1(&ctx[0], PK.asPtr(), C.bool(pkValidate), sig.asPtr(), C.bool(sigGroupcheck), ptrOrNil(msg), C.size_t(len(msg)), ptrOrNil(aug), C.size_t(len(aug))) return int(r) } func PairingMulNAggregatePkInG1(ctx Pairing, PK *P1Affine, pkValidate bool, sig *P2Affine, sigGroupcheck bool, rand *Scalar, randBits int, msg []byte, optional ...[]byte) int { // aug var aug []byte if len(optional) > 0 { aug = optional[0] } r := C.blst_pairing_chk_n_mul_n_aggr_pk_in_g1(&ctx[0], PK.asPtr(), C.bool(pkValidate), sig.asPtr(), C.bool(sigGroupcheck), &rand.cgo.b[0], C.size_t(randBits), ptrOrNil(msg), C.size_t(len(msg)), ptrOrNil(aug), C.size_t(len(aug))) return int(r) } // // Serialization/Deserialization.
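// Serialize/Deserialize use the 96-byte uncompressed encoding (x||y); Compress/Uncompress use the 48-byte compressed one, with the top bit of the leading byte set to mark compression.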
// // P1 Serdes func (p1 *P1Affine) Serialize() []byte { var out [BLST_P1_SERIALIZE_BYTES]byte C.blst_p1_affine_serialize((*C.byte)(&out[0]), &p1.cgo) return out[:] } func (p1 *P1Affine) Deserialize(in []byte) *P1Affine { if len(in) != BLST_P1_SERIALIZE_BYTES { return nil } if C.blst_p1_deserialize(&p1.cgo, (*C.byte)(&in[0])) != C.BLST_SUCCESS { return nil } return p1 } func (p1 *P1Affine) Compress() []byte { var out [BLST_P1_COMPRESS_BYTES]byte C.blst_p1_affine_compress((*C.byte)(&out[0]), &p1.cgo) return out[:] } func (p1 *P1Affine) Uncompress(in []byte) *P1Affine { if len(in) != BLST_P1_COMPRESS_BYTES { return nil } if C.blst_p1_uncompress(&p1.cgo, (*C.byte)(&in[0])) != C.BLST_SUCCESS { return nil } return p1 } func (p1 *P1Affine) InG1() bool { return bool(C.blst_p1_affine_in_g1(&p1.cgo)) } func (*P1Affine) BatchUncompress(in [][]byte) []*P1Affine { // Allocate space for all of the resulting points. Later we'll save pointers // and return those so that the result could be used in other functions, // such as MultipleAggregateVerify. n := len(in) points := make([]P1Affine, n) pointsPtrs := make([]*P1Affine, n) numThreads := numThreads(n) // Each thread will determine next message to process by atomically // incrementing curItem, process corresponding point, and // repeat until n is exceeded. Each thread will send a result (true for // success, false for failure) into the channel when complete. resCh := make(chan bool, numThreads) valid := int32(1) curItem := uint32(0) for tid := 0; tid < numThreads; tid++ { go func() { for atomic.LoadInt32(&valid) > 0 { // Get a work item work := atomic.AddUint32(&curItem, 1) - 1 if work >= uint32(n) { break } if points[work].Uncompress(in[work]) == nil { atomic.StoreInt32(&valid, 0) break } pointsPtrs[work] = &points[work] } if atomic.LoadInt32(&valid) > 0 { resCh <- true } else { resCh <- false } }() } // Collect the threads result := true for i := 0; i < numThreads; i++ { if !<-resCh { result = false } } if atomic.LoadInt32(&valid) == 0 || !result { return nil } return pointsPtrs } func (p1 *P1) Serialize() []byte { var out [BLST_P1_SERIALIZE_BYTES]byte C.blst_p1_serialize((*C.byte)(&out[0]), &p1.cgo) return out[:] } func (p1 *P1) Compress() []byte { var out [BLST_P1_COMPRESS_BYTES]byte C.blst_p1_compress((*C.byte)(&out[0]), &p1.cgo) return out[:] } func (p1 *P1) MultAssign(scalarIf interface{}, optional ...int) *P1 { var nbits int var scalar *C.byte switch val := scalarIf.(type) { case []byte: scalar = (*C.byte)(&val[0]) nbits = len(val) * 8 case *Scalar: scalar = &val.cgo.b[0] nbits = 255 default: panic(fmt.Sprintf("unsupported type %T", val)) } if len(optional) > 0 { nbits = optional[0] } C.blst_p1_mult(&p1.cgo, &p1.cgo, scalar, C.size_t(nbits)) return p1 } func (p1 *P1) Mult(scalarIf interface{}, optional ...int) *P1 { ret := *p1 return ret.MultAssign(scalarIf, optional...) 
} func (p1 *P1) AddAssign(pointIf interface{}) *P1 { switch val := pointIf.(type) { case *P1: C.blst_p1_add_or_double(&p1.cgo, &p1.cgo, &val.cgo) case *P1Affine: C.blst_p1_add_or_double_affine(&p1.cgo, &p1.cgo, &val.cgo) default: panic(fmt.Sprintf("unsupported type %T", val)) } return p1 } func (p1 *P1) Add(pointIf interface{}) *P1 { ret := *p1 return ret.AddAssign(pointIf) } func (p1 *P1) SubAssign(pointIf interface{}) *P1 { var x *C.blst_fp var affine C.bool switch val := pointIf.(type) { case *P1: x = &val.cgo.x affine = false case *P1Affine: x = &val.cgo.x affine = true default: panic(fmt.Sprintf("unsupported type %T", val)) } C.go_p1_sub_assign(&p1.cgo, x, affine) return p1 } func (p1 *P1) Sub(pointIf interface{}) *P1 { ret := *p1 return ret.SubAssign(pointIf) } func P1Generator() *P1 { return &cgo_p1Generator } // 'acc += point * scalar', passing 'nil' for 'point' means "use the // // group generator point" func (acc *P1) MultNAccumulate(pointIf interface{}, scalarIf interface{}, optional ...int) *P1 { var x *C.blst_fp var affine C.bool if pointIf != nil { switch val := pointIf.(type) { case *P1: x = &val.cgo.x affine = false case *P1Affine: x = &val.cgo.x affine = true default: panic(fmt.Sprintf("unsupported type %T", val)) } } var nbits int var scalar *C.byte switch val := scalarIf.(type) { case []byte: scalar = (*C.byte)(&val[0]) nbits = len(val) * 8 case *Scalar: scalar = &val.cgo.b[0] nbits = 255 default: panic(fmt.Sprintf("unsupported type %T", val)) } if len(optional) > 0 { nbits = optional[0] } C.go_p1_mult_n_acc(&acc.cgo, x, affine, scalar, C.size_t(nbits)) return acc } // // Affine // func (p *P1) ToAffine() *P1Affine { var pa P1Affine C.blst_p1_to_affine(&pa.cgo, &p.cgo) return &pa } func (p *P1) FromAffine(pa *P1Affine) { C.blst_p1_from_affine(&p.cgo, &pa.cgo) } // Hash func HashToG1(msg []byte, dst []byte, optional ...[]byte) *P1 { // aug var q P1 var aug []byte if len(optional) > 0 { aug = optional[0] } C.blst_hash_to_g1(&q.cgo, ptrOrNil(msg), C.size_t(len(msg)), ptrOrNil(dst), C.size_t(len(dst)), ptrOrNil(aug), C.size_t(len(aug))) return &q } func EncodeToG1(msg []byte, dst []byte, optional ...[]byte) *P1 { // aug var q P1 var aug []byte if len(optional) > 0 { aug = optional[0] } C.blst_encode_to_g1(&q.cgo, ptrOrNil(msg), C.size_t(len(msg)), ptrOrNil(dst), C.size_t(len(dst)), ptrOrNil(aug), C.size_t(len(aug))) return &q } // // Multi-point/scalar operations // func P1sToAffine(points []*P1, optional ...int) P1Affines { var npoints int if len(optional) > 0 { npoints = optional[0] } else { npoints = len(points) } ret := make([]P1Affine, npoints) _cgoCheckPointer := func(...interface{}) {} C.blst_p1s_to_affine(&ret[0].cgo, (**C.blst_p1)(unsafe.Pointer(&points[0])), C.size_t(npoints)) return ret } func (points P1s) ToAffine(optional ...P1Affines) P1Affines { npoints := len(points) var ret P1Affines if len(optional) > 0 { // used in benchmark ret = optional[0] if len(ret) < npoints { panic("npoints mismatch") } } else { ret = make([]P1Affine, npoints) } if maxProcs < 2 || npoints < 768 { C.go_p1slice_to_affine(&ret[0].cgo, &points[0].cgo, C.size_t(npoints)) return ret } nslices := (npoints + 511) / 512 if nslices > maxProcs { nslices = maxProcs } delta, rem := npoints/nslices+1, npoints%nslices var wg sync.WaitGroup wg.Add(nslices) for x := 0; x < npoints; x += delta { if rem == 0 { delta -= 1 } rem -= 1 go func(out *P1Affine, inp *P1, delta int) { C.go_p1slice_to_affine(&out.cgo, &inp.cgo, C.size_t(delta)) wg.Done() }(&ret[x], &points[x], delta) } wg.Wait() return ret } // 
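// Batch addition below follows the same slicing pattern as ToAffine above: fewer than 768 points go through a single C call, while larger inputs are cut into roughly 512-point slices so each goroutine hands the C layer a batch large enough to amortize per-call overhead.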
// Batch addition // func P1AffinesAdd(points []*P1Affine, optional ...int) *P1 { var npoints int if len(optional) > 0 { npoints = optional[0] } else { npoints = len(points) } var ret P1 _cgoCheckPointer := func(...interface{}) {} C.blst_p1s_add(&ret.cgo, (**C.blst_p1_affine)(unsafe.Pointer(&points[0])), C.size_t(npoints)) return &ret } func (points P1Affines) Add() *P1 { npoints := len(points) if maxProcs < 2 || npoints < 768 { var ret P1 C.go_p1slice_add(&ret.cgo, &points[0].cgo, C.size_t(npoints)) return &ret } nslices := (npoints + 511) / 512 if nslices > maxProcs { nslices = maxProcs } delta, rem := npoints/nslices+1, npoints%nslices msgs := make(chan P1, nslices) for x := 0; x < npoints; x += delta { if rem == 0 { delta -= 1 } rem -= 1 go func(points *P1Affine, delta int) { var ret P1 C.go_p1slice_add(&ret.cgo, &points.cgo, C.size_t(delta)) msgs <- ret }(&points[x], delta) } ret := <-msgs for i := 1; i < nslices; i++ { msg := <-msgs C.blst_p1_add_or_double(&ret.cgo, &ret.cgo, &msg.cgo) } return &ret } func (points P1s) Add() *P1 { return points.ToAffine().Add() } // // Multi-scalar multiplication // func P1AffinesMult(pointsIf interface{}, scalarsIf interface{}, nbits int) *P1 { var npoints int switch val := pointsIf.(type) { case []*P1Affine: npoints = len(val) case []P1Affine: npoints = len(val) case P1Affines: npoints = len(val) default: panic(fmt.Sprintf("unsupported type %T", val)) } nbytes := (nbits + 7) / 8 var scalars []*C.byte switch val := scalarsIf.(type) { case []byte: if len(val) < npoints*nbytes { return nil } case [][]byte: if len(val) < npoints { return nil } scalars = make([]*C.byte, npoints) for i := range scalars { scalars[i] = (*C.byte)(&val[i][0]) } case []Scalar: if len(val) < npoints { return nil } if nbits <= 248 { scalars = make([]*C.byte, npoints) for i := range scalars { scalars[i] = &val[i].cgo.b[0] } } case []*Scalar: if len(val) < npoints { return nil } scalars = make([]*C.byte, npoints) for i := range scalars { scalars[i] = &val[i].cgo.b[0] } default: panic(fmt.Sprintf("unsupported type %T", val)) } numThreads := numThreads(0) if numThreads < 2 { sz := int(C.blst_p1s_mult_pippenger_scratch_sizeof(C.size_t(npoints))) / 8 scratch := make([]uint64, sz) pointsBySlice := [2]*C.blst_p1_affine{nil, nil} var p_points **C.blst_p1_affine switch val := pointsIf.(type) { case []*P1Affine: p_points = (**C.blst_p1_affine)(unsafe.Pointer(&val[0])) case []P1Affine: pointsBySlice[0] = &val[0].cgo p_points = &pointsBySlice[0] case P1Affines: pointsBySlice[0] = &val[0].cgo p_points = &pointsBySlice[0] default: // type is already vetted } scalarsBySlice := [2]*C.byte{nil, nil} var p_scalars **C.byte switch val := scalarsIf.(type) { case []byte: scalarsBySlice[0] = (*C.byte)(&val[0]) p_scalars = &scalarsBySlice[0] case [][]byte: p_scalars = &scalars[0] case []Scalar: if nbits > 248 { scalarsBySlice[0] = &val[0].cgo.b[0] p_scalars = &scalarsBySlice[0] } else { p_scalars = &scalars[0] } case []*Scalar: p_scalars = &scalars[0] default: // type is already vetted } var ret P1 _cgoCheckPointer := func(...interface{}) {} C.blst_p1s_mult_pippenger(&ret.cgo, p_points, C.size_t(npoints), p_scalars, C.size_t(nbits), (*C.limb_t)(&scratch[0])) for i := range scalars { scalars[i] = nil } return &ret } if npoints < 32 { if numThreads > npoints { numThreads = npoints } curItem := uint32(0) msgs := make(chan P1, numThreads) for tid := 0; tid < numThreads; tid++ { go func() { var acc P1 for { workItem := int(atomic.AddUint32(&curItem, 1) - 1) if workItem >= npoints { break } var point 
*P1Affine switch val := pointsIf.(type) { case []*P1Affine: point = val[workItem] case []P1Affine: point = &val[workItem] case P1Affines: point = &val[workItem] default: // type is already vetted } var scalar *C.byte switch val := scalarsIf.(type) { case []byte: scalar = (*C.byte)(&val[workItem*nbytes]) case [][]byte: scalar = scalars[workItem] case []Scalar: if nbits > 248 { scalar = &val[workItem].cgo.b[0] } else { scalar = scalars[workItem] } case []*Scalar: scalar = scalars[workItem] default: // type is already vetted } C.go_p1_mult_n_acc(&acc.cgo, &point.cgo.x, true, scalar, C.size_t(nbits)) } msgs <- acc }() } ret := <-msgs for tid := 1; tid < numThreads; tid++ { point := <-msgs C.blst_p1_add_or_double(&ret.cgo, &ret.cgo, &point.cgo) } for i := range scalars { scalars[i] = nil } return &ret } // this is sizeof(scratch[0]) sz := int(C.blst_p1s_mult_pippenger_scratch_sizeof(0)) / 8 nx, ny, window := breakdown(nbits, pippenger_window_size(npoints), numThreads) // |grid[]| holds "coordinates" and place for result grid := make([]struct { x, dx, y, dy int point P1 }, nx*ny) dx := npoints / nx y := window * (ny - 1) total := 0 for ; total < nx; total++ { grid[total].x = total * dx grid[total].dx = dx grid[total].y = y grid[total].dy = nbits - y } grid[total-1].dx = npoints - grid[total-1].x for y > 0 { y -= window for i := 0; i < nx; i++ { grid[total].x = grid[i].x grid[total].dx = grid[i].dx grid[total].y = y grid[total].dy = window total++ } } if numThreads > total { numThreads = total } msgsCh := make(chan int, ny) rowSync := make([]int32, ny) // count up to |nx| curItem := int32(0) for tid := 0; tid < numThreads; tid++ { go func() { scratch := make([]uint64, sz<<uint(window-1)) pointsBySlice := [2]*C.blst_p1_affine{nil, nil} scalarsBySlice := [2]*C.byte{nil, nil} for { workItem := atomic.AddInt32(&curItem, 1) - 1 if int(workItem) >= total { break } x := grid[workItem].x y := grid[workItem].y var p_points **C.blst_p1_affine switch val := pointsIf.(type) { case []*P1Affine: p_points = (**C.blst_p1_affine)(unsafe.Pointer(&val[x])) case []P1Affine: pointsBySlice[0] = &val[x].cgo p_points = &pointsBySlice[0] case P1Affines: pointsBySlice[0] = &val[x].cgo p_points = &pointsBySlice[0] default: // type is already vetted } var p_scalars **C.byte switch val := scalarsIf.(type) { case []byte: scalarsBySlice[0] = (*C.byte)(&val[x*nbytes]) p_scalars = &scalarsBySlice[0] case [][]byte: p_scalars = &scalars[x] case []Scalar: if nbits > 248 { scalarsBySlice[0] = &val[x].cgo.b[0] p_scalars = &scalarsBySlice[0] } else { p_scalars = &scalars[x] } case []*Scalar: p_scalars = &scalars[x] default: // type is already vetted } C.blst_p1s_tile_pippenger(&grid[workItem].point.cgo, p_points, C.size_t(grid[workItem].dx), p_scalars, C.size_t(nbits), (*C.limb_t)(&scratch[0]), C.size_t(y), C.size_t(window)) if atomic.AddInt32(&rowSync[y/window], 1) == int32(nx) { msgsCh <- y // "row" is done } else { runtime.Gosched() // be nice to the application } } pointsBySlice[0] = nil scalarsBySlice[0] = nil }() } var ret P1 rows := make([]bool, ny) row := 0 // actually index in |grid[]| for i := 0; i < ny; i++ { // we expect |ny| messages, one per "row" y := <-msgsCh rows[y/window] = true // mark the "row" for grid[row].y == y { // if it's current "row", process it for row < total && grid[row].y == y { C.blst_p1_add_or_double(&ret.cgo, &ret.cgo, &grid[row].point.cgo) row++ } if y == 0 { break // one can as well 'return &ret' here } for j := 0; j < window; j++ { C.blst_p1_double(&ret.cgo, &ret.cgo) } y -= window if !rows[y/window] { // see if next "row" was marked already break } } } for i := range scalars { scalars[i] = nil } return &ret } func (points P1Affines) Mult(scalarsIf interface{}, nbits
int) *P1 { return P1AffinesMult(points, scalarsIf, nbits) } func (points P1s) Mult(scalarsIf interface{}, nbits int) *P1 { return points.ToAffine().Mult(scalarsIf, nbits) } // // Group-check // func P1AffinesValidate(pointsIf interface{}) bool { var npoints int switch val := pointsIf.(type) { case []*P1Affine: npoints = len(val) case []P1Affine: npoints = len(val) case P1Affines: npoints = len(val) default: panic(fmt.Sprintf("unsupported type %T", val)) } numThreads := numThreads(npoints) if numThreads < 2 { for i := 0; i < npoints; i++ { var point *P1Affine switch val := pointsIf.(type) { case []*P1Affine: point = val[i] case []P1Affine: point = &val[i] case P1Affines: point = &val[i] default: panic(fmt.Sprintf("unsupported type %T", val)) } if !C.go_p1_affine_validate(&point.cgo, true) { return false } } return true } valid := int32(1) curItem := uint32(0) var wg sync.WaitGroup wg.Add(numThreads) for tid := 0; tid < numThreads; tid++ { go func() { for atomic.LoadInt32(&valid) != 0 { work := atomic.AddUint32(&curItem, 1) - 1 if work >= uint32(npoints) { break } var point *P1Affine switch val := pointsIf.(type) { case []*P1Affine: point = val[work] case []P1Affine: point = &val[work] case P1Affines: point = &val[work] default: panic(fmt.Sprintf("unsupported type %T", val)) } if !C.go_p1_affine_validate(&point.cgo, true) { atomic.StoreInt32(&valid, 0) break } } wg.Done() }() } wg.Wait() return atomic.LoadInt32(&valid) != 0 } func (points P1Affines) Validate() bool { return P1AffinesValidate(points) } func PairingAggregatePkInG2(ctx Pairing, PK *P2Affine, pkValidate bool, sig *P1Affine, sigGroupcheck bool, msg []byte, optional ...[]byte) int { // aug var aug []byte if len(optional) > 0 { aug = optional[0] } r := C.blst_pairing_chk_n_aggr_pk_in_g2(&ctx[0], PK.asPtr(), C.bool(pkValidate), sig.asPtr(), C.bool(sigGroupcheck), ptrOrNil(msg), C.size_t(len(msg)), ptrOrNil(aug), C.size_t(len(aug))) return int(r) } func PairingMulNAggregatePkInG2(ctx Pairing, PK *P2Affine, pkValidate bool, sig *P1Affine, sigGroupcheck bool, rand *Scalar, randBits int, msg []byte, optional ...[]byte) int { // aug var aug []byte if len(optional) > 0 { aug = optional[0] } r := C.blst_pairing_chk_n_mul_n_aggr_pk_in_g2(&ctx[0], PK.asPtr(), C.bool(pkValidate), sig.asPtr(), C.bool(sigGroupcheck), &rand.cgo.b[0], C.size_t(randBits), ptrOrNil(msg), C.size_t(len(msg)), ptrOrNil(aug), C.size_t(len(aug))) return int(r) } // // Serialization/Deserialization. // // P2 Serdes func (p2 *P2Affine) Serialize() []byte { var out [BLST_P2_SERIALIZE_BYTES]byte C.blst_p2_affine_serialize((*C.byte)(&out[0]), &p2.cgo) return out[:] } func (p2 *P2Affine) Deserialize(in []byte) *P2Affine { if len(in) != BLST_P2_SERIALIZE_BYTES { return nil } if C.blst_p2_deserialize(&p2.cgo, (*C.byte)(&in[0])) != C.BLST_SUCCESS { return nil } return p2 } func (p2 *P2Affine) Compress() []byte { var out [BLST_P2_COMPRESS_BYTES]byte C.blst_p2_affine_compress((*C.byte)(&out[0]), &p2.cgo) return out[:] } func (p2 *P2Affine) Uncompress(in []byte) *P2Affine { if len(in) != BLST_P2_COMPRESS_BYTES { return nil } if C.blst_p2_uncompress(&p2.cgo, (*C.byte)(&in[0])) != C.BLST_SUCCESS { return nil } return p2 } func (p2 *P2Affine) InG2() bool { return bool(C.blst_p2_affine_in_g2(&p2.cgo)) } func (*P2Affine) BatchUncompress(in [][]byte) []*P2Affine { // Allocate space for all of the resulting points. Later we'll save pointers // and return those so that the result could be used in other functions, // such as MultipleAggregateVerify. 
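// As in the P1 variant, a single failed Uncompress flips |valid| and makes the whole call return nil.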
n := len(in) points := make([]P2Affine, n) pointsPtrs := make([]*P2Affine, n) numThreads := numThreads(n) // Each thread will determine next message to process by atomically // incrementing curItem, process corresponding point, and // repeat until n is exceeded. Each thread will send a result (true for // success, false for failure) into the channel when complete. resCh := make(chan bool, numThreads) valid := int32(1) curItem := uint32(0) for tid := 0; tid < numThreads; tid++ { go func() { for atomic.LoadInt32(&valid) > 0 { // Get a work item work := atomic.AddUint32(&curItem, 1) - 1 if work >= uint32(n) { break } if points[work].Uncompress(in[work]) == nil { atomic.StoreInt32(&valid, 0) break } pointsPtrs[work] = &points[work] } if atomic.LoadInt32(&valid) > 0 { resCh <- true } else { resCh <- false } }() } // Collect the threads result := true for i := 0; i < numThreads; i++ { if !<-resCh { result = false } } if atomic.LoadInt32(&valid) == 0 || !result { return nil } return pointsPtrs } func (p2 *P2) Serialize() []byte { var out [BLST_P2_SERIALIZE_BYTES]byte C.blst_p2_serialize((*C.byte)(&out[0]), &p2.cgo) return out[:] } func (p2 *P2) Compress() []byte { var out [BLST_P2_COMPRESS_BYTES]byte C.blst_p2_compress((*C.byte)(&out[0]), &p2.cgo) return out[:] } func (p2 *P2) MultAssign(scalarIf interface{}, optional ...int) *P2 { var nbits int var scalar *C.byte switch val := scalarIf.(type) { case []byte: scalar = (*C.byte)(&val[0]) nbits = len(val) * 8 case *Scalar: scalar = &val.cgo.b[0] nbits = 255 default: panic(fmt.Sprintf("unsupported type %T", val)) } if len(optional) > 0 { nbits = optional[0] } C.blst_p2_mult(&p2.cgo, &p2.cgo, scalar, C.size_t(nbits)) return p2 } func (p2 *P2) Mult(scalarIf interface{}, optional ...int) *P2 { ret := *p2 return ret.MultAssign(scalarIf, optional...) 
} func (p2 *P2) AddAssign(pointIf interface{}) *P2 { switch val := pointIf.(type) { case *P2: C.blst_p2_add_or_double(&p2.cgo, &p2.cgo, &val.cgo) case *P2Affine: C.blst_p2_add_or_double_affine(&p2.cgo, &p2.cgo, &val.cgo) default: panic(fmt.Sprintf("unsupported type %T", val)) } return p2 } func (p2 *P2) Add(pointIf interface{}) *P2 { ret := *p2 return ret.AddAssign(pointIf) } func (p2 *P2) SubAssign(pointIf interface{}) *P2 { var x *C.blst_fp2 var affine C.bool switch val := pointIf.(type) { case *P2: x = &val.cgo.x affine = false case *P2Affine: x = &val.cgo.x affine = true default: panic(fmt.Sprintf("unsupported type %T", val)) } C.go_p2_sub_assign(&p2.cgo, x, affine) return p2 } func (p2 *P2) Sub(pointIf interface{}) *P2 { ret := *p2 return ret.SubAssign(pointIf) } func P2Generator() *P2 { return &cgo_p2Generator } // 'acc += point * scalar', passing 'nil' for 'point' means "use the // // group generator point" func (acc *P2) MultNAccumulate(pointIf interface{}, scalarIf interface{}, optional ...int) *P2 { var x *C.blst_fp2 var affine C.bool if pointIf != nil { switch val := pointIf.(type) { case *P2: x = &val.cgo.x affine = false case *P2Affine: x = &val.cgo.x affine = true default: panic(fmt.Sprintf("unsupported type %T", val)) } } var nbits int var scalar *C.byte switch val := scalarIf.(type) { case []byte: scalar = (*C.byte)(&val[0]) nbits = len(val) * 8 case *Scalar: scalar = &val.cgo.b[0] nbits = 255 default: panic(fmt.Sprintf("unsupported type %T", val)) } if len(optional) > 0 { nbits = optional[0] } C.go_p2_mult_n_acc(&acc.cgo, x, affine, scalar, C.size_t(nbits)) return acc } // // Affine // func (p *P2) ToAffine() *P2Affine { var pa P2Affine C.blst_p2_to_affine(&pa.cgo, &p.cgo) return &pa } func (p *P2) FromAffine(pa *P2Affine) { C.blst_p2_from_affine(&p.cgo, &pa.cgo) } // Hash func HashToG2(msg []byte, dst []byte, optional ...[]byte) *P2 { // aug var q P2 var aug []byte if len(optional) > 0 { aug = optional[0] } C.blst_hash_to_g2(&q.cgo, ptrOrNil(msg), C.size_t(len(msg)), ptrOrNil(dst), C.size_t(len(dst)), ptrOrNil(aug), C.size_t(len(aug))) return &q } func EncodeToG2(msg []byte, dst []byte, optional ...[]byte) *P2 { // aug var q P2 var aug []byte if len(optional) > 0 { aug = optional[0] } C.blst_encode_to_g2(&q.cgo, ptrOrNil(msg), C.size_t(len(msg)), ptrOrNil(dst), C.size_t(len(dst)), ptrOrNil(aug), C.size_t(len(aug))) return &q } // // Multi-point/scalar operations // func P2sToAffine(points []*P2, optional ...int) P2Affines { var npoints int if len(optional) > 0 { npoints = optional[0] } else { npoints = len(points) } ret := make([]P2Affine, npoints) _cgoCheckPointer := func(...interface{}) {} C.blst_p2s_to_affine(&ret[0].cgo, (**C.blst_p2)(unsafe.Pointer(&points[0])), C.size_t(npoints)) return ret } func (points P2s) ToAffine(optional ...P2Affines) P2Affines { npoints := len(points) var ret P2Affines if len(optional) > 0 { // used in benchmark ret = optional[0] if len(ret) < npoints { panic("npoints mismatch") } } else { ret = make([]P2Affine, npoints) } if maxProcs < 2 || npoints < 768 { C.go_p2slice_to_affine(&ret[0].cgo, &points[0].cgo, C.size_t(npoints)) return ret } nslices := (npoints + 511) / 512 if nslices > maxProcs { nslices = maxProcs } delta, rem := npoints/nslices+1, npoints%nslices var wg sync.WaitGroup wg.Add(nslices) for x := 0; x < npoints; x += delta { if rem == 0 { delta -= 1 } rem -= 1 go func(out *P2Affine, inp *P2, delta int) { C.go_p2slice_to_affine(&out.cgo, &inp.cgo, C.size_t(delta)) wg.Done() }(&ret[x], &points[x], delta) } wg.Wait() return ret } 
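// The P2 multi-point helpers below mirror their P1 counterparts above; only the underlying C entry points and the wider Fp2 coordinates differ.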
// // Batch addition // func P2AffinesAdd(points []*P2Affine, optional ...int) *P2 { var npoints int if len(optional) > 0 { npoints = optional[0] } else { npoints = len(points) } var ret P2 _cgoCheckPointer := func(...interface{}) {} C.blst_p2s_add(&ret.cgo, (**C.blst_p2_affine)(unsafe.Pointer(&points[0])), C.size_t(npoints)) return &ret } func (points P2Affines) Add() *P2 { npoints := len(points) if maxProcs < 2 || npoints < 768 { var ret P2 C.go_p2slice_add(&ret.cgo, &points[0].cgo, C.size_t(npoints)) return &ret } nslices := (npoints + 511) / 512 if nslices > maxProcs { nslices = maxProcs } delta, rem := npoints/nslices+1, npoints%nslices msgs := make(chan P2, nslices) for x := 0; x < npoints; x += delta { if rem == 0 { delta -= 1 } rem -= 1 go func(points *P2Affine, delta int) { var ret P2 C.go_p2slice_add(&ret.cgo, &points.cgo, C.size_t(delta)) msgs <- ret }(&points[x], delta) } ret := <-msgs for i := 1; i < nslices; i++ { msg := <-msgs C.blst_p2_add_or_double(&ret.cgo, &ret.cgo, &msg.cgo) } return &ret } func (points P2s) Add() *P2 { return points.ToAffine().Add() } // // Multi-scalar multiplication // func P2AffinesMult(pointsIf interface{}, scalarsIf interface{}, nbits int) *P2 { var npoints int switch val := pointsIf.(type) { case []*P2Affine: npoints = len(val) case []P2Affine: npoints = len(val) case P2Affines: npoints = len(val) default: panic(fmt.Sprintf("unsupported type %T", val)) } nbytes := (nbits + 7) / 8 var scalars []*C.byte switch val := scalarsIf.(type) { case []byte: if len(val) < npoints*nbytes { return nil } case [][]byte: if len(val) < npoints { return nil } scalars = make([]*C.byte, npoints) for i := range scalars { scalars[i] = (*C.byte)(&val[i][0]) } case []Scalar: if len(val) < npoints { return nil } if nbits <= 248 { scalars = make([]*C.byte, npoints) for i := range scalars { scalars[i] = &val[i].cgo.b[0] } } case []*Scalar: if len(val) < npoints { return nil } scalars = make([]*C.byte, npoints) for i := range scalars { scalars[i] = &val[i].cgo.b[0] } default: panic(fmt.Sprintf("unsupported type %T", val)) } numThreads := numThreads(0) if numThreads < 2 { sz := int(C.blst_p2s_mult_pippenger_scratch_sizeof(C.size_t(npoints))) / 8 scratch := make([]uint64, sz) pointsBySlice := [2]*C.blst_p2_affine{nil, nil} var p_points **C.blst_p2_affine switch val := pointsIf.(type) { case []*P2Affine: p_points = (**C.blst_p2_affine)(unsafe.Pointer(&val[0])) case []P2Affine: pointsBySlice[0] = &val[0].cgo p_points = &pointsBySlice[0] case P2Affines: pointsBySlice[0] = &val[0].cgo p_points = &pointsBySlice[0] default: // type is already vetted } scalarsBySlice := [2]*C.byte{nil, nil} var p_scalars **C.byte switch val := scalarsIf.(type) { case []byte: scalarsBySlice[0] = (*C.byte)(&val[0]) p_scalars = &scalarsBySlice[0] case [][]byte: p_scalars = &scalars[0] case []Scalar: if nbits > 248 { scalarsBySlice[0] = &val[0].cgo.b[0] p_scalars = &scalarsBySlice[0] } else { p_scalars = &scalars[0] } case []*Scalar: p_scalars = &scalars[0] default: // type is already vetted } var ret P2 _cgoCheckPointer := func(...interface{}) {} C.blst_p2s_mult_pippenger(&ret.cgo, p_points, C.size_t(npoints), p_scalars, C.size_t(nbits), (*C.limb_t)(&scratch[0])) for i := range scalars { scalars[i] = nil } return &ret } if npoints < 32 { if numThreads > npoints { numThreads = npoints } curItem := uint32(0) msgs := make(chan P2, numThreads) for tid := 0; tid < numThreads; tid++ { go func() { var acc P2 for { workItem := int(atomic.AddUint32(&curItem, 1) - 1) if workItem >= npoints { break } var point 
*P2Affine switch val := pointsIf.(type) { case []*P2Affine: point = val[workItem] case []P2Affine: point = &val[workItem] case P2Affines: point = &val[workItem] default: // type is already vetted } var scalar *C.byte switch val := scalarsIf.(type) { case []byte: scalar = (*C.byte)(&val[workItem*nbytes]) case [][]byte: scalar = scalars[workItem] case []Scalar: if nbits > 248 { scalar = &val[workItem].cgo.b[0] } else { scalar = scalars[workItem] } case []*Scalar: scalar = scalars[workItem] default: // type is already vetted } C.go_p2_mult_n_acc(&acc.cgo, &point.cgo.x, true, scalar, C.size_t(nbits)) } msgs <- acc }() } ret := <-msgs for tid := 1; tid < numThreads; tid++ { point := <-msgs C.blst_p2_add_or_double(&ret.cgo, &ret.cgo, &point.cgo) } for i := range scalars { scalars[i] = nil } return &ret } // this is sizeof(scratch[0]) sz := int(C.blst_p2s_mult_pippenger_scratch_sizeof(0)) / 8 nx, ny, window := breakdown(nbits, pippenger_window_size(npoints), numThreads) // |grid[]| holds "coordinates" and place for result grid := make([]struct { x, dx, y, dy int point P2 }, nx*ny) dx := npoints / nx y := window * (ny - 1) total := 0 for ; total < nx; total++ { grid[total].x = total * dx grid[total].dx = dx grid[total].y = y grid[total].dy = nbits - y } grid[total-1].dx = npoints - grid[total-1].x for y > 0 { y -= window for i := 0; i < nx; i++ { grid[total].x = grid[i].x grid[total].dx = grid[i].dx grid[total].y = y grid[total].dy = window total++ } } if numThreads > total { numThreads = total } msgsCh := make(chan int, ny) rowSync := make([]int32, ny) // count up to |nx| curItem := int32(0) for tid := 0; tid < numThreads; tid++ { go func() { scratch := make([]uint64, sz<<uint(window-1)) pointsBySlice := [2]*C.blst_p2_affine{nil, nil} scalarsBySlice := [2]*C.byte{nil, nil} for { workItem := atomic.AddInt32(&curItem, 1) - 1 if int(workItem) >= total { break } x := grid[workItem].x y := grid[workItem].y var p_points **C.blst_p2_affine switch val := pointsIf.(type) { case []*P2Affine: p_points = (**C.blst_p2_affine)(unsafe.Pointer(&val[x])) case []P2Affine: pointsBySlice[0] = &val[x].cgo p_points = &pointsBySlice[0] case P2Affines: pointsBySlice[0] = &val[x].cgo p_points = &pointsBySlice[0] default: // type is already vetted } var p_scalars **C.byte switch val := scalarsIf.(type) { case []byte: scalarsBySlice[0] = (*C.byte)(&val[x*nbytes]) p_scalars = &scalarsBySlice[0] case [][]byte: p_scalars = &scalars[x] case []Scalar: if nbits > 248 { scalarsBySlice[0] = &val[x].cgo.b[0] p_scalars = &scalarsBySlice[0] } else { p_scalars = &scalars[x] } case []*Scalar: p_scalars = &scalars[x] default: // type is already vetted } C.blst_p2s_tile_pippenger(&grid[workItem].point.cgo, p_points, C.size_t(grid[workItem].dx), p_scalars, C.size_t(nbits), (*C.limb_t)(&scratch[0]), C.size_t(y), C.size_t(window)) if atomic.AddInt32(&rowSync[y/window], 1) == int32(nx) { msgsCh <- y // "row" is done } else { runtime.Gosched() // be nice to the application } } pointsBySlice[0] = nil scalarsBySlice[0] = nil }() } var ret P2 rows := make([]bool, ny) row := 0 // actually index in |grid[]| for i := 0; i < ny; i++ { // we expect |ny| messages, one per "row" y := <-msgsCh rows[y/window] = true // mark the "row" for grid[row].y == y { // if it's current "row", process it for row < total && grid[row].y == y { C.blst_p2_add_or_double(&ret.cgo, &ret.cgo, &grid[row].point.cgo) row++ } if y == 0 { break // one can as well 'return &ret' here } for j := 0; j < window; j++ { C.blst_p2_double(&ret.cgo, &ret.cgo) } y -= window if !rows[y/window] { // see if next "row" was marked already break } } } for i := range scalars { scalars[i] = nil } return &ret } func (points P2Affines) Mult(scalarsIf interface{}, nbits
int) *P2 { return P2AffinesMult(points, scalarsIf, nbits) } func (points P2s) Mult(scalarsIf interface{}, nbits int) *P2 { return points.ToAffine().Mult(scalarsIf, nbits) } // // Group-check // func P2AffinesValidate(pointsIf interface{}) bool { var npoints int switch val := pointsIf.(type) { case []*P2Affine: npoints = len(val) case []P2Affine: npoints = len(val) case P2Affines: npoints = len(val) default: panic(fmt.Sprintf("unsupported type %T", val)) } numThreads := numThreads(npoints) if numThreads < 2 { for i := 0; i < npoints; i++ { var point *P2Affine switch val := pointsIf.(type) { case []*P2Affine: point = val[i] case []P2Affine: point = &val[i] case P2Affines: point = &val[i] default: panic(fmt.Sprintf("unsupported type %T", val)) } if !C.go_p2_affine_validate(&point.cgo, true) { return false } } return true } valid := int32(1) curItem := uint32(0) var wg sync.WaitGroup wg.Add(numThreads) for tid := 0; tid < numThreads; tid++ { go func() { for atomic.LoadInt32(&valid) != 0 { work := atomic.AddUint32(&curItem, 1) - 1 if work >= uint32(npoints) { break } var point *P2Affine switch val := pointsIf.(type) { case []*P2Affine: point = val[work] case []P2Affine: point = &val[work] case P2Affines: point = &val[work] default: panic(fmt.Sprintf("unsupported type %T", val)) } if !C.go_p2_affine_validate(&point.cgo, true) { atomic.StoreInt32(&valid, 0) break } } wg.Done() }() } wg.Wait() return atomic.LoadInt32(&valid) != 0 } func (points P2Affines) Validate() bool { return P2AffinesValidate(points) } // aug [][]byte - augmentation bytes for signing (default: nil) func parseOpts(optional ...interface{}) (augSingle []byte, aug [][]byte, useHash bool, ok bool) { useHash = true // hash (true), encode (false) for _, arg := range optional { switch v := arg.(type) { case []byte: augSingle = v case [][]byte: aug = v case bool: useHash = v default: return nil, nil, useHash, false } } return augSingle, aug, useHash, true } // These methods are inefficient because of cgo call overhead. For this // reason they should be used primarily for prototyping with a goal to // formulate interfaces that would process multiple scalars per cgo call. func (a *Scalar) MulAssign(b *Scalar) (*Scalar, bool) { return a, bool(C.blst_sk_mul_n_check(&a.cgo, &a.cgo, &b.cgo)) } func (a *Scalar) Mul(b *Scalar) (*Scalar, bool) { var ret Scalar return &ret, bool(C.blst_sk_mul_n_check(&ret.cgo, &a.cgo, &b.cgo)) } func (a *Scalar) AddAssign(b *Scalar) (*Scalar, bool) { return a, bool(C.blst_sk_add_n_check(&a.cgo, &a.cgo, &b.cgo)) } func (a *Scalar) Add(b *Scalar) (*Scalar, bool) { var ret Scalar return &ret, bool(C.blst_sk_add_n_check(&ret.cgo, &a.cgo, &b.cgo)) } func (a *Scalar) SubAssign(b *Scalar) (*Scalar, bool) { return a, bool(C.blst_sk_sub_n_check(&a.cgo, &a.cgo, &b.cgo)) } func (a *Scalar) Sub(b *Scalar) (*Scalar, bool) { var ret Scalar return &ret, bool(C.blst_sk_sub_n_check(&ret.cgo, &a.cgo, &b.cgo)) } func (a *Scalar) Inverse() *Scalar { var ret Scalar C.blst_sk_inverse(&ret.cgo, &a.cgo) return &ret } // // Serialization/Deserialization. 
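// Scalar deserialization below is strict: Deserialize accepts exactly BLST_SCALAR_BYTES big-endian bytes and rejects encodings that fail blst_sk_check.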
// // Scalar serdes func (s *Scalar) Serialize() []byte { var out [BLST_SCALAR_BYTES]byte C.blst_bendian_from_scalar((*C.byte)(&out[0]), &s.cgo) return out[:] } func (s *Scalar) Deserialize(in []byte) *Scalar { if len(in) != BLST_SCALAR_BYTES || !C.go_scalar_from_bendian(&s.cgo, (*C.byte)(&in[0])) { return nil } return s } func (s *Scalar) Valid() bool { return bool(C.blst_sk_check(&s.cgo)) } func (s *Scalar) HashTo(msg []byte, dst []byte) bool { ret := HashToScalar(msg, dst) if ret != nil { *s = *ret return true } return false } func HashToScalar(msg []byte, dst []byte) *Scalar { var ret Scalar if C.go_hash_to_scalar(&ret.cgo, ptrOrNil(msg), C.size_t(len(msg)), ptrOrNil(dst), C.size_t(len(dst))) { return &ret } return nil } // // LEndian // func (fr *Scalar) ToLEndian() []byte { var arr [BLST_SCALAR_BYTES]byte C.blst_lendian_from_scalar((*C.byte)(&arr[0]), &fr.cgo) return arr[:] } func (fp *Fp) ToLEndian() []byte { var arr [BLST_FP_BYTES]byte C.blst_lendian_from_fp((*C.byte)(&arr[0]), &fp.cgo) return arr[:] } func (fr *Scalar) FromLEndian(arr []byte) *Scalar { nbytes := len(arr) if nbytes < BLST_SCALAR_BYTES || !C.blst_scalar_from_le_bytes(&fr.cgo, (*C.byte)(&arr[0]), C.size_t(nbytes)) { return nil } return fr } func (fp *Fp) FromLEndian(arr []byte) *Fp { if len(arr) != BLST_FP_BYTES { return nil } C.blst_fp_from_lendian(&fp.cgo, (*C.byte)(&arr[0])) return fp } // // BEndian // func (fr *Scalar) ToBEndian() []byte { var arr [BLST_SCALAR_BYTES]byte C.blst_bendian_from_scalar((*C.byte)(&arr[0]), &fr.cgo) return arr[:] } func (fp *Fp) ToBEndian() []byte { var arr [BLST_FP_BYTES]byte C.blst_bendian_from_fp((*C.byte)(&arr[0]), &fp.cgo) return arr[:] } func (fr *Scalar) FromBEndian(arr []byte) *Scalar { nbytes := len(arr) if nbytes < BLST_SCALAR_BYTES || !C.blst_scalar_from_be_bytes(&fr.cgo, (*C.byte)(&arr[0]), C.size_t(nbytes)) { return nil } return fr } func (fp *Fp) FromBEndian(arr []byte) *Fp { if len(arr) != BLST_FP_BYTES { return nil } C.blst_fp_from_bendian(&fp.cgo, (*C.byte)(&arr[0])) return fp } // // Printing // func PrintBytes(val []byte, name string) { fmt.Printf("%s = %02x\n", name, val) } func (s *Scalar) Print(name string) { arr := s.ToBEndian() PrintBytes(arr, name) } func (p *P1Affine) Print(name string) { fmt.Printf("%s:\n", name) x := Fp{p.cgo.x} arr := x.ToBEndian() PrintBytes(arr, " x") y := Fp{p.cgo.y} arr = y.ToBEndian() PrintBytes(arr, " y") } func (p *P1) Print(name string) { fmt.Printf("%s:\n", name) aff := p.ToAffine() aff.Print(name) } func (f *Fp2) Print(name string) { fmt.Printf("%s:\n", name) var arr [BLST_FP_BYTES]byte C.blst_bendian_from_fp((*C.byte)(&arr[0]), &f.cgo.fp[0]) PrintBytes(arr[:], " 0") C.blst_bendian_from_fp((*C.byte)(&arr[0]), &f.cgo.fp[1]) PrintBytes(arr[:], " 1") } func (p *P2Affine) Print(name string) { fmt.Printf("%s:\n", name) x := Fp2{p.cgo.x} x.Print(" x") y := Fp2{p.cgo.y} y.Print(" y") } func (p *P2) Print(name string) { fmt.Printf("%s:\n", name) aff := p.ToAffine() aff.Print(name) } // // Equality // func (s1 *Scalar) Equals(s2 *Scalar) bool { return *s1 == *s2 } func (e1 *Fp) Equals(e2 *Fp) bool { return *e1 == *e2 } func (e1 *Fp2) Equals(e2 *Fp2) bool { return *e1 == *e2 } func (e1 *P1Affine) Equals(e2 *P1Affine) bool { return bool(C.blst_p1_affine_is_equal(&e1.cgo, &e2.cgo)) } func (pt *P1Affine) asPtr() *C.blst_p1_affine { if pt != nil { return &pt.cgo } return nil } func (e1 *P1) Equals(e2 *P1) bool { return bool(C.blst_p1_is_equal(&e1.cgo, &e2.cgo)) } func (e1 *P2Affine) Equals(e2 *P2Affine) bool { return 
bool(C.blst_p2_affine_is_equal(&e1.cgo, &e2.cgo)) } func (pt *P2Affine) asPtr() *C.blst_p2_affine { if pt != nil { return &pt.cgo } return nil } func (e1 *P2) Equals(e2 *P2) bool { return bool(C.blst_p2_is_equal(&e1.cgo, &e2.cgo)) } // private thunk for testing func expandMessageXmd(msg []byte, dst []byte, len_in_bytes int) []byte { ret := make([]byte, len_in_bytes) C.blst_expand_message_xmd((*C.byte)(&ret[0]), C.size_t(len(ret)), ptrOrNil(msg), C.size_t(len(msg)), ptrOrNil(dst), C.size_t(len(dst))) return ret } func breakdown(nbits, window, ncpus int) (nx int, ny int, wnd int) { if nbits > window*ncpus { //nolint:nestif nx = 1 wnd = bits.Len(uint(ncpus) / 4) if (window + wnd) > 18 { wnd = window - wnd } else { wnd = (nbits/window + ncpus - 1) / ncpus if (nbits/(window+1)+ncpus-1)/ncpus < wnd { wnd = window + 1 } else { wnd = window } } } else { nx = 2 wnd = window - 2 for (nbits/wnd+1)*nx < ncpus { nx += 1 wnd = window - bits.Len(3*uint(nx)/2) } nx -= 1 wnd = window - bits.Len(3*uint(nx)/2) } ny = nbits/wnd + 1 wnd = nbits/ny + 1 return nx, ny, wnd } func pippenger_window_size(npoints int) int { wbits := bits.Len(uint(npoints)) if wbits > 13 { return wbits - 4 } if wbits > 5 { return wbits - 3 } return 2 } ================================================ FILE: bindings/go/blst.tgo ================================================ /* * Copyright Supranational LLC * Licensed under the Apache License, Version 2.0, see LICENSE for details. * SPDX-License-Identifier: Apache-2.0 */ package blst // #cgo CFLAGS: -I${SRCDIR}/.. -I${SRCDIR}/../../build -I${SRCDIR}/../../src -D__BLST_CGO__ -fno-builtin-memcpy -fno-builtin-memset // #cgo amd64 CFLAGS: -D__ADX__ -mno-avx // // no-asm 64-bit platforms from https://go.dev/doc/install/source // #cgo loong64 mips64 mips64le ppc64 ppc64le riscv64 s390x CFLAGS: -D__BLST_NO_ASM__ // // #include "blst.h" // // #if defined(__x86_64__) && (defined(__unix__) || defined(__APPLE__)) // # include <signal.h> // # include <unistd.h> // static void handler(int signum) // { ssize_t n = write(2, "Caught SIGILL in blst_cgo_init, " // "consult <blst>/bindings/go/README.md.\n", 70); // _exit(128+SIGILL); // (void)n; // } // __attribute__((constructor)) static void blst_cgo_init() // { blst_fp temp = { 0 }; // struct sigaction act = { handler }, oact; // sigaction(SIGILL, &act, &oact); // blst_fp_sqr(&temp, &temp); // sigaction(SIGILL, &oact, NULL); // } // #endif // // static void go_pairing_init(blst_pairing *new_ctx, bool hash_or_encode, // const byte *DST, size_t DST_len) // { if (DST != NULL) { // byte *dst = (byte*)new_ctx + blst_pairing_sizeof(); // for(size_t i = 0; i < DST_len; i++) dst[i] = DST[i]; // DST = dst; // } // blst_pairing_init(new_ctx, hash_or_encode, DST, DST_len); // } // static void go_pairing_as_fp12(blst_fp12 *pt, blst_pairing *ctx) // { *pt = *blst_pairing_as_fp12(ctx); } // // static void go_p1slice_to_affine(blst_p1_affine dst[], // const blst_p1 points[], size_t npoints) // { const blst_p1 *ppoints[2] = { points, NULL }; // blst_p1s_to_affine(dst, ppoints, npoints); // } // static void go_p1slice_add(blst_p1 *dst, const blst_p1_affine points[], // size_t npoints) // { const blst_p1_affine *ppoints[2] = { points, NULL }; // blst_p1s_add(dst, ppoints, npoints); // } // static void go_p2slice_to_affine(blst_p2_affine dst[], // const blst_p2 points[], size_t npoints) // { const blst_p2 *ppoints[2] = { points, NULL }; // blst_p2s_to_affine(dst, ppoints, npoints); // } // static void go_p2slice_add(blst_p2 *dst, const blst_p2_affine points[], // size_t npoints) // { const
blst_p2_affine *ppoints[2] = { points, NULL }; // blst_p2s_add(dst, ppoints, npoints); // } // // static void go_p1_mult_n_acc(blst_p1 *acc, const blst_fp *x, bool affine, // const byte *scalar, size_t nbits) // { blst_p1 m[1]; // const void *p = x; // if (p == NULL) // p = blst_p1_generator(); // else if (affine) // blst_p1_from_affine(m, p), p = m; // blst_p1_mult(m, p, scalar, nbits); // blst_p1_add_or_double(acc, acc, m); // } // static void go_p2_mult_n_acc(blst_p2 *acc, const blst_fp2 *x, bool affine, // const byte *scalar, size_t nbits) // { blst_p2 m[1]; // const void *p = x; // if (p == NULL) // p = blst_p2_generator(); // else if (affine) // blst_p2_from_affine(m, p), p = m; // blst_p2_mult(m, p, scalar, nbits); // blst_p2_add_or_double(acc, acc, m); // } // // static void go_p1_sub_assign(blst_p1 *a, const blst_fp *x, bool affine) // { blst_p1 minus_b; // if (affine) // blst_p1_from_affine(&minus_b, (const blst_p1_affine*)x); // else // minus_b = *(const blst_p1*)x; // blst_p1_cneg(&minus_b, 1); // blst_p1_add_or_double(a, a, &minus_b); // } // // static void go_p2_sub_assign(blst_p2 *a, const blst_fp2 *x, bool affine) // { blst_p2 minus_b; // if (affine) // blst_p2_from_affine(&minus_b, (const blst_p2_affine*)x); // else // minus_b = *(const blst_p2*)x; // blst_p2_cneg(&minus_b, 1); // blst_p2_add_or_double(a, a, &minus_b); // } // // static bool go_scalar_from_bendian(blst_scalar *ret, const byte *in) // { blst_scalar_from_bendian(ret, in); // return blst_sk_check(ret); // } // static bool go_hash_to_scalar(blst_scalar *ret, // const byte *msg, size_t msg_len, // const byte *DST, size_t DST_len) // { byte elem[48]; // blst_expand_message_xmd(elem, sizeof(elem), msg, msg_len, DST, DST_len); // return blst_scalar_from_be_bytes(ret, elem, sizeof(elem)); // } // static void go_miller_loop_n(blst_fp12 *dst, const blst_p2_affine Q[], // const blst_p1_affine P[], // size_t npoints, bool acc) // { const blst_p2_affine *Qs[2] = { Q, NULL }; // const blst_p1_affine *Ps[2] = { P, NULL }; // if (acc) { // blst_fp12 tmp; // blst_miller_loop_n(&tmp, Qs, Ps, npoints); // blst_fp12_mul(dst, dst, &tmp); // } else { // blst_miller_loop_n(dst, Qs, Ps, npoints); // } // } // static void go_fp12slice_mul(blst_fp12 *dst, const blst_fp12 in[], size_t n) // { size_t i; // blst_fp12_mul(dst, &in[0], &in[1]); // for (i = 2; i < n; i++) // blst_fp12_mul(dst, dst, &in[i]); // } // static bool go_p1_affine_validate(const blst_p1_affine *p, bool infcheck) // { if (infcheck && blst_p1_affine_is_inf(p)) // return 0; // return blst_p1_affine_in_g1(p); // } // static bool go_p2_affine_validate(const blst_p2_affine *p, bool infcheck) // { if (infcheck && blst_p2_affine_is_inf(p)) // return 0; // return blst_p2_affine_in_g2(p); // } import "C" import "runtime" const BLST_SCALAR_BYTES = 256 / 8 const BLST_FP_BYTES = 384 / 8 const BLST_P1_COMPRESS_BYTES = BLST_FP_BYTES const BLST_P1_SERIALIZE_BYTES = BLST_FP_BYTES * 2 const BLST_P2_COMPRESS_BYTES = BLST_FP_BYTES * 2 const BLST_P2_SERIALIZE_BYTES = BLST_FP_BYTES * 4 type Scalar struct{ cgo C.blst_scalar } type Fp struct{ cgo C.blst_fp } type Fp2 struct{ cgo C.blst_fp2 } type Fp6 = C.blst_fp6 type Fp12 struct{ cgo C.blst_fp12 } type P1 struct{ cgo C.blst_p1 } type P2 struct{ cgo C.blst_p2 } type P1Affine struct{ cgo C.blst_p1_affine } type P2Affine struct{ cgo C.blst_p2_affine } type Message = []byte type Pairing = []C.blst_pairing type SecretKey = Scalar type P1s []P1 type P2s []P2 type P1Affines []P1Affine type P2Affines []P2Affine // // Configuration // var 
maxProcs = initMaxProcs() func initMaxProcs() int { maxProcs := runtime.GOMAXPROCS(0) var version float32 _, err := fmt.Sscanf(runtime.Version(), "go%f", &version) if err != nil || version < 1.14 { // be cooperative and leave one processor for the application maxProcs -= 1 } if maxProcs <= 0 { maxProcs = 1 } return maxProcs } func SetMaxProcs(procs int) { if procs <= 0 { procs = 1 } maxProcs = procs } func numThreads(maxThreads int) int { numThreads := maxProcs // take into consideration the possibility that application reduced // GOMAXPROCS after |maxProcs| was initialized numProcs := runtime.GOMAXPROCS(0) if maxProcs > numProcs { numThreads = numProcs } if maxThreads > 0 && numThreads > maxThreads { return maxThreads } return numThreads } var cgo_pairingSizeOf = C.blst_pairing_sizeof() var cgo_p1Generator = P1{*C.blst_p1_generator()} var cgo_p2Generator = P2{*C.blst_p2_generator()} var cgo_fp12One = Fp12{*C.blst_fp12_one()} // // Secret key // func (sk *SecretKey) Zeroize() { var zero SecretKey *sk = zero } func KeyGen(ikm []byte, optional ...[]byte) *SecretKey { var sk SecretKey var info []byte if len(optional) > 0 { info = optional[0] } if len(ikm) < 32 { return nil } C.blst_keygen(&sk.cgo, (*C.byte)(&ikm[0]), C.size_t(len(ikm)), ptrOrNil(info), C.size_t(len(info))) // Postponing secret key zeroing till garbage collection can be too // late to be effective, but every little bit helps... runtime.SetFinalizer(&sk, func(sk *SecretKey) { sk.Zeroize() }) return &sk } func KeyGenV3(ikm []byte, optional ...[]byte) *SecretKey { if len(ikm) < 32 { return nil } var sk SecretKey var info []byte if len(optional) > 0 { info = optional[0] } C.blst_keygen_v3(&sk.cgo, (*C.byte)(&ikm[0]), C.size_t(len(ikm)), ptrOrNil(info), C.size_t(len(info))) // Postponing secret key zeroing till garbage collection can be too // late to be effective, but every little bit helps... runtime.SetFinalizer(&sk, func(sk *SecretKey) { sk.Zeroize() }) return &sk } func KeyGenV45(ikm []byte, salt []byte, optional ...[]byte) *SecretKey { if len(ikm) < 32 { return nil } var sk SecretKey var info []byte if len(optional) > 0 { info = optional[0] } C.blst_keygen_v4_5(&sk.cgo, (*C.byte)(&ikm[0]), C.size_t(len(ikm)), (*C.byte)(&salt[0]), C.size_t(len(salt)), ptrOrNil(info), C.size_t(len(info))) // Postponing secret key zeroing till garbage collection can be too // late to be effective, but every little bit helps... runtime.SetFinalizer(&sk, func(sk *SecretKey) { sk.Zeroize() }) return &sk } func KeyGenV5(ikm []byte, salt []byte, optional ...[]byte) *SecretKey { if len(ikm) < 32 { return nil } var sk SecretKey var info []byte if len(optional) > 0 { info = optional[0] } saltLen := len(salt) if saltLen == 0 { salt = []byte{0} } C.blst_keygen_v5(&sk.cgo, (*C.byte)(&ikm[0]), C.size_t(len(ikm)), (*C.byte)(&salt[0]), C.size_t(saltLen), ptrOrNil(info), C.size_t(len(info))) // Postponing secret key zeroing till garbage collection can be too // late to be effective, but every little bit helps... runtime.SetFinalizer(&sk, func(sk *SecretKey) { sk.Zeroize() }) return &sk } func DeriveMasterEip2333(ikm []byte) *SecretKey { if len(ikm) < 32 { return nil } var sk SecretKey C.blst_derive_master_eip2333(&sk.cgo, (*C.byte)(&ikm[0]), C.size_t(len(ikm))) // Postponing secret key zeroing till garbage collection can be too // late to be effective, but every little bit helps...
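// (The finalizer only wipes this object's copy of the key; any serialized copies the caller makes must be zeroized by the caller.)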
runtime.SetFinalizer(&sk, func(sk *SecretKey) { sk.Zeroize() }) return &sk } func (master *SecretKey) DeriveChildEip2333(child_index uint32) *SecretKey { var sk SecretKey C.blst_derive_child_eip2333(&sk.cgo, &master.cgo, C.uint(child_index)) // Postponing secret key zeroing till garbage collection can be too // late to be effective, but every little bit helps... runtime.SetFinalizer(&sk, func(sk *SecretKey) { sk.Zeroize() }) return &sk } // // Pairing // func pairingSizeOf(DST_len C.size_t) int { return int((cgo_pairingSizeOf + DST_len + 7) / 8) } func PairingCtx(hash_or_encode bool, DST []byte) Pairing { DST_len := C.size_t(len(DST)) ctx := make([]C.blst_pairing, pairingSizeOf(DST_len)) C.go_pairing_init(&ctx[0], C.bool(hash_or_encode), ptrOrNil(DST), DST_len) return ctx } func PairingCommit(ctx Pairing) { C.blst_pairing_commit(&ctx[0]) } func PairingMerge(ctx Pairing, ctx1 Pairing) int { r := C.blst_pairing_merge(&ctx[0], &ctx1[0]) return int(r) } func PairingFinalVerify(ctx Pairing, optional ...*Fp12) bool { var gtsig *Fp12 if len(optional) > 0 { gtsig = optional[0] } return bool(C.blst_pairing_finalverify(&ctx[0], gtsig.asPtr())) } func PairingRawAggregate(ctx Pairing, q *P2Affine, p *P1Affine) { C.blst_pairing_raw_aggregate(&ctx[0], &q.cgo, &p.cgo) } func PairingAsFp12(ctx Pairing) *Fp12 { var pt Fp12 C.go_pairing_as_fp12(&pt.cgo, &ctx[0]) return &pt } func Fp12One() Fp12 { return cgo_fp12One } func Fp12FinalVerify(pt1 *Fp12, pt2 *Fp12) bool { return bool(C.blst_fp12_finalverify(&pt1.cgo, &pt2.cgo)) } func Fp12MillerLoop(q *P2Affine, p *P1Affine) *Fp12 { var pt Fp12 C.blst_miller_loop(&pt.cgo, &q.cgo, &p.cgo) return &pt } func Fp12MillerLoopN(qs []P2Affine, ps []P1Affine) *Fp12 { if len(qs) != len(ps) || len(qs) == 0 { panic("inputs' lengths mismatch") } nElems := uint32(len(qs)) nThreads := uint32(maxProcs) if nThreads == 1 || nElems == 1 { var pt Fp12 C.go_miller_loop_n(&pt.cgo, &qs[0].cgo, &ps[0].cgo, C.size_t(nElems), false) return &pt } stride := (nElems + nThreads - 1) / nThreads if stride > 16 { stride = 16 } strides := (nElems + stride - 1) / stride if nThreads > strides { nThreads = strides } msgsCh := make(chan Fp12, nThreads) curElem := uint32(0) for tid := uint32(0); tid < nThreads; tid++ { go func() { acc := Fp12One() first := true for { work := atomic.AddUint32(&curElem, stride) - stride if work >= nElems { break } n := nElems - work if n > stride { n = stride } C.go_miller_loop_n(&acc.cgo, &qs[work].cgo, &ps[work].cgo, C.size_t(n), C.bool(!first)) first = false } msgsCh <- acc }() } var ret = make([]Fp12, nThreads); for i := range(ret) { ret[i] = <- msgsCh } var pt Fp12 C.go_fp12slice_mul(&pt.cgo, &ret[0].cgo, C.size_t(nThreads)) return &pt } func (pt *Fp12) MulAssign(p *Fp12) { C.blst_fp12_mul(&pt.cgo, &pt.cgo, &p.cgo) } func (pt *Fp12) FinalExp() { C.blst_final_exp(&pt.cgo, &pt.cgo) } func (pt *Fp12) InGroup() bool { return bool(C.blst_fp12_in_group(&pt.cgo)) } func (pt *Fp12) ToBendian() []byte { var out [BLST_FP_BYTES*12]byte C.blst_bendian_from_fp12((*C.byte)(&out[0]), &pt.cgo) return out[:] } func (pt1 *Fp12) Equals(pt2 *Fp12) bool { return *pt1 == *pt2 } func (pt *Fp12) asPtr() *C.blst_fp12 { if (pt != nil) { return &pt.cgo } return nil } func ptrOrNil(bytes []byte) *C.byte { var ptr *C.byte if len(bytes) > 0 { ptr = (*C.byte)(&bytes[0]) } return ptr } ================================================ FILE: bindings/go/blst_htoc_test.go ================================================ /* * Copyright Supranational LLC * Licensed under the Apache License, Version 
2.0, see LICENSE for details. * SPDX-License-Identifier: Apache-2.0 */ package blst import ( "bytes" "encoding/hex" "encoding/json" "fmt" "os" "strconv" "strings" "testing" ) func decodeP1(m map[string]interface{}) *P1Affine { x, err := hex.DecodeString(m["x"].(string)[2:]) if err != nil { fmt.Println(err) return nil } y, err := hex.DecodeString(m["y"].(string)[2:]) if err != nil { fmt.Println(err) return nil } var p1 P1Affine p1.Deserialize(append(x, y...)) return &p1 } func readAll(file *os.File) ([]byte, error) { defer file.Close() stat, err := file.Stat() if err != nil { return nil, err //nolint:wrapcheck } buf := make([]byte, stat.Size()) total := 0 for total < len(buf) { read, err := file.Read(buf[total:]) if err != nil { return nil, err //nolint:wrapcheck } total += read } return buf, nil } func jsonG1HashToCurve(t *testing.T, fname string) { t.Helper() vfile, err := os.Open(fname) if err != nil { t.Skipf("%.16s... not found", fname) } buf, err := readAll(vfile) if err != nil { t.Error(err.Error()) } var vectors map[string]interface{} err = json.Unmarshal(buf, &vectors) if err != nil { t.Error(err.Error()) } dst := []byte(vectors["dst"].(string)) hash_or_encode := vectors["randomOracle"].(bool) vectorsArr, ok := vectors["vectors"].([]interface{}) if !ok { t.Error("Could not cast vectors to an array") } for _, v := range vectorsArr { testMap, ok := v.(map[string]interface{}) if !ok { t.Error("Could not cast vector to map") } msg := []byte(testMap["msg"].(string)) p1Expected := decodeP1(testMap["P"].(map[string]interface{})) var p1Hashed *P1Affine if hash_or_encode { p1Hashed = HashToG1(msg, dst).ToAffine() } else { p1Hashed = EncodeToG1(msg, dst).ToAffine() } if !p1Hashed.Equals(p1Expected) { t.Error("hashed != expected") } } } func TestG1HashToCurve(t *testing.T) { t.Parallel() jsonG1HashToCurve(t, "../vectors/hash_to_curve/BLS12381G1_XMD_SHA-256_SSWU_RO_.json") jsonG1HashToCurve(t, "../vectors/hash_to_curve/BLS12381G1_XMD_SHA-256_SSWU_NU_.json") } func decodeP2(m map[string]interface{}) *P2Affine { xArr := strings.Split(m["x"].(string), ",") x0, err := hex.DecodeString(xArr[0][2:]) if err != nil { fmt.Println(err) return nil } x1, err := hex.DecodeString(xArr[1][2:]) if err != nil { fmt.Println(err) return nil } yArr := strings.Split(m["y"].(string), ",") y0, err := hex.DecodeString(yArr[0][2:]) if err != nil { fmt.Println(err) return nil } y1, err := hex.DecodeString(yArr[1][2:]) if err != nil { fmt.Println(err) return nil } var p2 P2Affine p2.Deserialize(append(x1, append(x0, append(y1, y0...)...)...)) return &p2 } func jsonG2HashToCurve(t *testing.T, fname string) { t.Helper() vfile, err := os.Open(fname) if err != nil { t.Skipf("%.16s... 
not found", fname) } buf, err := readAll(vfile) if err != nil { t.Error(err.Error()) } var vectors map[string]interface{} err = json.Unmarshal(buf, &vectors) if err != nil { t.Error(err.Error()) } dst := []byte(vectors["dst"].(string)) hash_or_encode := vectors["randomOracle"].(bool) vectorsArr, ok := vectors["vectors"].([]interface{}) if !ok { t.Error("Could not cast vectors to an array") } for _, v := range vectorsArr { testMap, ok := v.(map[string]interface{}) if !ok { t.Error("Could not cast vector to map") } msg := []byte(testMap["msg"].(string)) p2Expected := decodeP2(testMap["P"].(map[string]interface{})) var p2Hashed *P2Affine if hash_or_encode { p2Hashed = HashToG2(msg, dst).ToAffine() } else { p2Hashed = EncodeToG2(msg, dst).ToAffine() } if !p2Hashed.Equals(p2Expected) { t.Error("hashed != expected") } } } func TestG2HashToCurve(t *testing.T) { t.Parallel() jsonG2HashToCurve(t, "../vectors/hash_to_curve/BLS12381G2_XMD_SHA-256_SSWU_RO_.json") jsonG2HashToCurve(t, "../vectors/hash_to_curve/BLS12381G2_XMD_SHA-256_SSWU_NU_.json") } func jsonExpandMessageXmd(t *testing.T, fname string) { t.Helper() vfile, err := os.Open(fname) if err != nil { t.Skipf("%.16s... not found", fname) } buf, err := readAll(vfile) if err != nil { t.Error(err.Error()) } var vectors map[string]interface{} err = json.Unmarshal(buf, &vectors) if err != nil { t.Error(err.Error()) } DST := []byte(vectors["DST"].(string)) tests, ok := vectors["tests"].([]interface{}) if !ok { t.Error("Could not cast 'tests' to an array") } for _, v := range tests { test, ok := v.(map[string]interface{}) if !ok { t.Error("Could not map 'tests[]' element") } len_in_bytes, err := strconv.ParseInt(test["len_in_bytes"].(string), 0, 0) if err != nil { t.Error(err.Error()) } msg := []byte(test["msg"].(string)) expected, err := hex.DecodeString(test["uniform_bytes"].(string)) if err != nil { t.Error(err.Error()) } hashed := expandMessageXmd(msg, DST, int(len_in_bytes)) if !bytes.Equal(hashed, expected) { t.Error("hashed != expected") } } } func TestExpandMessageXmd(t *testing.T) { t.Parallel() jsonExpandMessageXmd(t, "../vectors/hash_to_curve/expand_message_xmd_SHA256_256.json") jsonExpandMessageXmd(t, "../vectors/hash_to_curve/expand_message_xmd_SHA256_38.json") } ================================================ FILE: bindings/go/blst_miller_loop_test.go ================================================ package blst import ( "crypto/rand" "testing" ) func TestMillerLoopN(t *testing.T) { t.Parallel() const npoints = 97 scalars := make([]byte, npoints*8) _, err := rand.Read(scalars) if err != nil { t.Error(err.Error()) return } p1s := make([]P1, npoints) p2s := make([]P2, npoints) g1 := P1Generator() g2 := P2Generator() for i := range p1s { p1s[i] = *g1.Mult(scalars[i*8:i*8+4], 32) p2s[i] = *g2.Mult(scalars[i*8+4:i*8+8], 32) } ps := P1s(p1s).ToAffine() qs := P2s(p2s).ToAffine() naive := Fp12One() for i := range p1s { naive.MulAssign(Fp12MillerLoop(&qs[i], &ps[i])) } if !naive.Equals(Fp12MillerLoopN(qs, ps)) { t.Error("failed self-consistency Fp12MillerLoopN test") } } ================================================ FILE: bindings/go/blst_minpk.tgo ================================================ import ( "runtime" "sync" "sync/atomic" ) // // PublicKey // func (pk *P1Affine) From(s *Scalar) *P1Affine { C.blst_sk_to_pk2_in_g1(nil, &pk.cgo, &s.cgo) return pk } func (pk *P1Affine) KeyValidate() bool { return bool(C.go_p1_affine_validate(&pk.cgo, true)) } // sigInfcheck, check for infinity, is a way to avoid going // into resource-consuming 
verification. Passing 'false' is // always cryptographically safe, but an application might want // to guard against obviously bogus individual[!] signatures. func (sig *P2Affine) SigValidate(sigInfcheck bool) bool { return bool(C.go_p2_affine_validate(&sig.cgo, C.bool(sigInfcheck))) } // // Sign // func (sig *P2Affine) Sign(sk *SecretKey, msg []byte, dst []byte, optional ...interface{}) *P2Affine { augSingle, aug, useHash, ok := parseOpts(optional...) if !ok || len(aug) != 0 { return nil } var q *P2 if useHash { q = HashToG2(msg, dst, augSingle) } else { q = EncodeToG2(msg, dst, augSingle) } C.blst_sign_pk2_in_g1(nil, &sig.cgo, &q.cgo, &sk.cgo) return sig } // // Signature // // Functions to return a signature and public key+augmentation tuple. // This enables point decompression (if needed) to happen in parallel. type sigGetterP2 func() *P2Affine type pkGetterP1 func(i uint32, temp *P1Affine) (*P1Affine, []byte) // Single verify with decompressed pk func (sig *P2Affine) Verify(sigGroupcheck bool, pk *P1Affine, pkValidate bool, msg Message, dst []byte, optional ...interface{}) bool { // useHash bool, aug []byte aug, _, useHash, ok := parseOpts(optional...) if !ok { return false } return sig.AggregateVerify(sigGroupcheck, []*P1Affine{pk}, pkValidate, []Message{msg}, dst, useHash, [][]byte{aug}) } // Single verify with compressed pk // Uses a dummy signature to get the correct type func (dummy *P2Affine) VerifyCompressed(sig []byte, sigGroupcheck bool, pk []byte, pkValidate bool, msg Message, dst []byte, optional ...bool) bool { // useHash bool, usePksAsAugs bool return dummy.AggregateVerifyCompressed(sig, sigGroupcheck, [][]byte{pk}, pkValidate, []Message{msg}, dst, optional...) } // Aggregate verify with uncompressed signature and public keys // Note that checking message uniqueness, if required, is left to the user. // Not all signature schemes require it and this keeps the binding minimal // and fast. Refer to the Uniq function for one method of performing // this check. func (sig *P2Affine) AggregateVerify(sigGroupcheck bool, pks []*P1Affine, pksVerify bool, msgs []Message, dst []byte, optional ...interface{}) bool { // useHash bool, augs [][]byte // sanity checks and argument parsing n := len(pks) if n == 0 || len(msgs) != n { return false } _, augs, useHash, ok := parseOpts(optional...) 
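// The variadic tail here follows the pattern noted above: an optional
// useHash bool (hash-to-curve vs. encode-to-curve), optionally followed
// by per-message augmentation blobs. A hypothetical call supplying both
// could look like:
//
//	sig.AggregateVerify(true, pks, true, msgs, dst, true, augs)
//
// where augs is a [][]byte with one entry per public key.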
useAugs := len(augs) != 0 if !ok || (useAugs && len(augs) != n) { return false } sigFn := func() *P2Affine { return sig } pkFn := func(i uint32, _ *P1Affine) (*P1Affine, []byte) { if useAugs { return pks[i], augs[i] } return pks[i], nil } return coreAggregateVerifyPkInG1(sigFn, sigGroupcheck, pkFn, pksVerify, msgs, dst, useHash) } // Aggregate verify with compressed signature and public keys // Uses a dummy signature to get the correct type func (*P2Affine) AggregateVerifyCompressed(sig []byte, sigGroupcheck bool, pks [][]byte, pksVerify bool, msgs []Message, dst []byte, optional ...bool) bool { // useHash bool, usePksAsAugs bool // sanity checks and argument parsing if len(pks) != len(msgs) { return false } useHash := true if len(optional) > 0 { useHash = optional[0] } usePksAsAugs := false if len(optional) > 1 { usePksAsAugs = optional[1] } sigFn := func() *P2Affine { sigP := new(P2Affine) if sigP.Uncompress(sig) == nil { return nil } return sigP } pkFn := func(i uint32, pk *P1Affine) (*P1Affine, []byte) { bytes := pks[i] if len(bytes) == BLST_P1_SERIALIZE_BYTES && (bytes[0] & 0x80) == 0 { // Not compressed if pk.Deserialize(bytes) == nil { return nil, nil } } else if len(bytes) == BLST_P1_COMPRESS_BYTES && (bytes[0] & 0x80) != 0 { if pk.Uncompress(bytes) == nil { return nil, nil } } else { return nil, nil } if usePksAsAugs { return pk, bytes } return pk, nil } return coreAggregateVerifyPkInG1(sigFn, sigGroupcheck, pkFn, pksVerify, msgs, dst, useHash) } func coreAggregateVerifyPkInG1(sigFn sigGetterP2, sigGroupcheck bool, pkFn pkGetterP1, pkValidate bool, msgs []Message, dst []byte, optional ...bool) bool { // useHash n := len(msgs) if n == 0 { return false } useHash := true if len(optional) > 0 { useHash = optional[0] } numCores := runtime.GOMAXPROCS(0) numThreads := numThreads(n) // Each thread will determine next message to process by atomically // incrementing curItem, process corresponding pk,msg[,aug] tuple and // repeat until n is exceeded. The resulting accumulations will be // fed into the msgsCh channel. msgsCh := make(chan Pairing, numThreads) valid := int32(1) curItem := uint32(0) mutex := sync.Mutex{} mutex.Lock() for tid := 0; tid < numThreads; tid++ { go func() { pairing := PairingCtx(useHash, dst) var temp P1Affine for atomic.LoadInt32(&valid) > 0 { // Get a work item work := atomic.AddUint32(&curItem, 1) - 1 if work >= uint32(n) { break } else if work == 0 && maxProcs == numCores-1 && numThreads == maxProcs { // Avoid consuming all cores by waiting until the // main thread has completed its miller loop before // proceeding. 
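// (The handshake is one-shot: the main goroutine takes the mutex before
// the workers are spawned and releases it only after blst_aggregated_in_g2
// below has produced gtsig, so on a fully subscribed machine the first
// worker parks here instead of competing for a core.)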
mutex.Lock() mutex.Unlock() //nolint:staticcheck } // Pull Public Key and augmentation blob curPk, aug := pkFn(work, &temp) if curPk == nil { atomic.StoreInt32(&valid, 0) break } // Pairing and accumulate ret := PairingAggregatePkInG1(pairing, curPk, pkValidate, nil, false, msgs[work], aug) if ret != C.BLST_SUCCESS { atomic.StoreInt32(&valid, 0) break } // application might have some async work to do runtime.Gosched() } if atomic.LoadInt32(&valid) > 0 { PairingCommit(pairing) msgsCh <- pairing } else { msgsCh <- nil } }() } // Uncompress and check signature var gtsig Fp12 sig := sigFn() if sig == nil { atomic.StoreInt32(&valid, 0) } if atomic.LoadInt32(&valid) > 0 && sigGroupcheck && !sig.SigValidate(false) { atomic.StoreInt32(&valid, 0) } if atomic.LoadInt32(&valid) > 0 { C.blst_aggregated_in_g2(&gtsig.cgo, &sig.cgo) } mutex.Unlock() // Accumulate the thread results var pairings Pairing for i := 0; i < numThreads; i++ { msg := <-msgsCh if msg != nil { if pairings == nil { pairings = msg } else { ret := PairingMerge(pairings, msg) if ret != C.BLST_SUCCESS { atomic.StoreInt32(&valid, 0) } } } } if atomic.LoadInt32(&valid) == 0 || pairings == nil { return false } return PairingFinalVerify(pairings, &gtsig) } func CoreVerifyPkInG1(pk *P1Affine, sig *P2Affine, hash_or_encode bool, msg Message, dst []byte, optional ...[]byte) int { var aug []byte if len(optional) > 0 { aug = optional[0] } if runtime.NumGoroutine() < maxProcs { sigFn := func() *P2Affine { return sig } pkFn := func(_ uint32, _ *P1Affine) (*P1Affine, []byte) { return pk, aug } if !coreAggregateVerifyPkInG1(sigFn, true, pkFn, true, []Message{msg}, dst, hash_or_encode) { return C.BLST_VERIFY_FAIL } return C.BLST_SUCCESS } return int(C.blst_core_verify_pk_in_g1(&pk.cgo, &sig.cgo, C.bool(hash_or_encode), ptrOrNil(msg), C.size_t(len(msg)), ptrOrNil(dst), C.size_t(len(dst)), ptrOrNil(aug), C.size_t(len(aug)))) } // pks are assumed to be verified for proof of possession, // which implies that they are already group-checked func (sig *P2Affine) FastAggregateVerify(sigGroupcheck bool, pks []*P1Affine, msg Message, dst []byte, optional ...interface{}) bool { // pass-through to Verify n := len(pks) // TODO: return value for length zero? if n == 0 { return false } aggregator := new(P1Aggregate) if !aggregator.Aggregate(pks, false) { return false } pkAff := aggregator.ToAffine() // Verify return sig.Verify(sigGroupcheck, pkAff, false, msg, dst, optional...) } func (*P2Affine) MultipleAggregateVerify(sigs []*P2Affine, sigsGroupcheck bool, pks []*P1Affine, pksVerify bool, msgs []Message, dst []byte, randFn func(*Scalar), randBits int, optional ...interface{}) bool { // useHash // Sanity checks and argument parsing n := len(pks) if n == 0 || len(msgs) != n || len(sigs) != n { return false } _, augs, useHash, ok := parseOpts(optional...) 
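// randFn is expected to fill the caller-provided scalar with fresh
// randomness, of which randBits are consumed per signature; the test
// suite, for instance, draws BLST_SCALAR_BYTES from crypto/rand and
// passes randBits = 64.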
useAugs := len(augs) != 0 if !ok || (useAugs && len(augs) != n) { return false } paramsFn := func(work uint32, _ *P2Affine, _ *P1Affine, rand *Scalar) ( *P2Affine, *P1Affine, *Scalar, []byte) { randFn(rand) var aug []byte if useAugs { aug = augs[work] } return sigs[work], pks[work], rand, aug } return multipleAggregateVerifyPkInG1(paramsFn, sigsGroupcheck, pksVerify, msgs, dst, randBits, useHash) } type mulAggGetterPkInG1 func(work uint32, sig *P2Affine, pk *P1Affine, rand *Scalar) (*P2Affine, *P1Affine, *Scalar, []byte) func multipleAggregateVerifyPkInG1(paramsFn mulAggGetterPkInG1, sigsGroupcheck bool, pksVerify bool, msgs []Message, dst []byte, randBits int, optional ...bool) bool { // useHash n := len(msgs) if n == 0 { return false } useHash := true if len(optional) > 0 { useHash = optional[0] } numThreads := numThreads(n) // Each thread will determine next message to process by atomically // incrementing curItem, process corresponding pk,msg[,aug] tuple and // repeat until n is exceeded. The resulting accumulations will be // fed into the msgsCh channel. msgsCh := make(chan Pairing, numThreads) valid := int32(1) curItem := uint32(0) for tid := 0; tid < numThreads; tid++ { go func() { pairing := PairingCtx(useHash, dst) var tempRand Scalar var tempPk P1Affine var tempSig P2Affine for atomic.LoadInt32(&valid) > 0 { // Get a work item work := atomic.AddUint32(&curItem, 1) - 1 if work >= uint32(n) { break } curSig, curPk, curRand, aug := paramsFn(work, &tempSig, &tempPk, &tempRand) if PairingMulNAggregatePkInG1(pairing, curPk, pksVerify, curSig, sigsGroupcheck, curRand, randBits, msgs[work], aug) != C.BLST_SUCCESS { atomic.StoreInt32(&valid, 0) break } // application might have some async work to do runtime.Gosched() } if atomic.LoadInt32(&valid) > 0 { PairingCommit(pairing) msgsCh <- pairing } else { msgsCh <- nil } }() } // Accumulate the thread results var pairings Pairing for i := 0; i < numThreads; i++ { msg := <-msgsCh if msg != nil { if pairings == nil { pairings = msg } else { ret := PairingMerge(pairings, msg) if ret != C.BLST_SUCCESS { atomic.StoreInt32(&valid, 0) } } } } if atomic.LoadInt32(&valid) == 0 || pairings == nil { return false } return PairingFinalVerify(pairings, nil) } // // Aggregate P2 // type aggGetterP2 func(i uint32, temp *P2Affine) *P2Affine type P2Aggregate struct { v *P2 } // Aggregate uncompressed elements func (agg *P2Aggregate) Aggregate(elmts []*P2Affine, groupcheck bool) bool { if len(elmts) == 0 { return true } getter := func(i uint32, _ *P2Affine) *P2Affine { return elmts[i] } return agg.coreAggregate(getter, groupcheck, len(elmts)) } func (agg *P2Aggregate) AggregateWithRandomness(pointsIf interface{}, scalarsIf interface{}, nbits int, groupcheck bool) bool { if groupcheck && !P2AffinesValidate(pointsIf) { return false } agg.v = P2AffinesMult(pointsIf, scalarsIf, nbits) return true } // Aggregate compressed elements func (agg *P2Aggregate) AggregateCompressed(elmts [][]byte, groupcheck bool) bool { if len(elmts) == 0 { return true } getter := func(i uint32, p *P2Affine) *P2Affine { bytes := elmts[i] if p.Uncompress(bytes) == nil { return nil } return p } return agg.coreAggregate(getter, groupcheck, len(elmts)) } func (agg *P2Aggregate) AddAggregate(other *P2Aggregate) { if other.v == nil { // do nothing } else if agg.v == nil { agg.v = other.v } else { C.blst_p2_add_or_double(&agg.v.cgo, &agg.v.cgo, &other.v.cgo) } } func (agg *P2Aggregate) Add(elmt *P2Affine, groupcheck bool) bool { if groupcheck && !bool(C.blst_p2_affine_in_g2(&elmt.cgo)) { return 
false } if agg.v == nil { agg.v = new(P2) C.blst_p2_from_affine(&agg.v.cgo, &elmt.cgo) } else { C.blst_p2_add_or_double_affine(&agg.v.cgo, &agg.v.cgo, &elmt.cgo) } return true } func (agg *P2Aggregate) ToAffine() *P2Affine { if agg.v == nil { return new(P2Affine) } return agg.v.ToAffine() } func (agg *P2Aggregate) coreAggregate(getter aggGetterP2, groupcheck bool, n int) bool { if n == 0 { return true } // operations are considered short enough for not to care about // keeping one core free... numThreads := runtime.GOMAXPROCS(0) if numThreads > n { numThreads = n } valid := int32(1) type result struct { agg *P2 empty bool } msgs := make(chan result, numThreads) curItem := uint32(0) for tid := 0; tid < numThreads; tid++ { go func() { first := true var agg P2 var temp P2Affine for atomic.LoadInt32(&valid) > 0 { // Get a work item work := atomic.AddUint32(&curItem, 1) - 1 if work >= uint32(n) { break } // Signature validate curElmt := getter(work, &temp) if curElmt == nil { atomic.StoreInt32(&valid, 0) break } if groupcheck && !bool(C.blst_p2_affine_in_g2(&curElmt.cgo)) { atomic.StoreInt32(&valid, 0) break } if first { C.blst_p2_from_affine(&agg.cgo, &curElmt.cgo) first = false } else { C.blst_p2_add_or_double_affine(&agg.cgo, &agg.cgo, &curElmt.cgo) } // application might have some async work to do runtime.Gosched() } if first { msgs <- result{nil, true} } else if atomic.LoadInt32(&valid) > 0 { msgs <- result{&agg, false} } else { msgs <- result{nil, false} } }() } // Accumulate the thread results first := agg.v == nil validLocal := true for i := 0; i < numThreads; i++ { msg := <-msgs if !validLocal || msg.empty { // do nothing } else if msg.agg == nil { validLocal = false // This should be unnecessary but seems safer atomic.StoreInt32(&valid, 0) } else { if first { agg.v = msg.agg first = false } else { C.blst_p2_add_or_double(&agg.v.cgo, &agg.v.cgo, &msg.agg.cgo) } } } if atomic.LoadInt32(&valid) == 0 { agg.v = nil return false } return true } ================================================ FILE: bindings/go/blst_minpk_test.go ================================================ /* * Copyright Supranational LLC * Licensed under the Apache License, Version 2.0, see LICENSE for details. * SPDX-License-Identifier: Apache-2.0 */ package blst import ( "crypto/rand" "fmt" "runtime" "testing" ) // Min PK. type PublicKeyMinPk = P1Affine type SignatureMinPk = P2Affine type AggregateSignatureMinPk = P2Aggregate type AggregatePublicKeyMinPk = P1Aggregate // Names in this file must be unique to support min-sig so we can't use 'dst' // here. 
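// (generate.py derives blst_minsig_test.go from this file by textually
// swapping MinPk/MinSig, P1/P2, G1/G2 and friends, so any identifier
// shared with the min-sig variant would collide in the package.)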
var dstMinPk = []byte("BLS_SIG_BLS12381G2_XMD:SHA-256_SSWU_RO_NUL_") func init() { // Use all cores when testing and benchmarking SetMaxProcs(runtime.GOMAXPROCS(0)) } func TestInfinityMinPk(t *testing.T) { t.Parallel() var infComp [BLST_P1_COMPRESS_BYTES]byte infComp[0] |= 0xc0 new(PublicKeyMinPk).Uncompress(infComp[:]) } func TestSerdesMinPk(t *testing.T) { t.Parallel() var ikm = [...]byte{ 0x93, 0xad, 0x7e, 0x65, 0xde, 0xad, 0x05, 0x2a, 0x08, 0x3a, 0x91, 0x0c, 0x8b, 0x72, 0x85, 0x91, 0x46, 0x4c, 0xca, 0x56, 0x60, 0x5b, 0xb0, 0x56, 0xed, 0xfe, 0x2b, 0x60, 0xa6, 0x3c, 0x48, 0x99} sk := KeyGen(ikm[:]) defer sk.Zeroize() // Serialize/deserialize sk sk2 := new(SecretKey).Deserialize(sk.Serialize()) defer sk2.Zeroize() if !sk.Equals(sk2) { t.Error("sk2 != sk") } // Negative test equals sk.cgo.b[0]++ if sk.Equals(sk2) { t.Error("sk2 == sk") } // pk pk := new(PublicKeyMinPk).From(sk) // Compress/decompress sk pk2 := new(PublicKeyMinPk).Uncompress(pk.Compress()) if !pk.Equals(pk2) { t.Error("pk2 != pk") } // Serialize/deserialize sk pk3 := new(PublicKeyMinPk).Deserialize(pk.Serialize()) if !pk.Equals(pk3) { t.Error("pk3 != pk") } // Negative test equals // pk.x.l[0] = pk.x.l[0] + 1 // if pk.Equals(pk2) { // t.Error("pk2 == pk") // } } func TestSignVerifyMinPk(t *testing.T) { t.Parallel() var ikm = [...]byte{ 0x93, 0xad, 0x7e, 0x65, 0xde, 0xad, 0x05, 0x2a, 0x08, 0x3a, 0x91, 0x0c, 0x8b, 0x72, 0x85, 0x91, 0x46, 0x4c, 0xca, 0x56, 0x60, 0x5b, 0xb0, 0x56, 0xed, 0xfe, 0x2b, 0x60, 0xa6, 0x3c, 0x48, 0x99} sk0 := KeyGen(ikm[:]) ikm[0]++ sk1 := KeyGen(ikm[:]) // pk pk0 := new(PublicKeyMinPk).From(sk0) pk1 := new(PublicKeyMinPk).From(sk1) // Sign msg0 := []byte("hello foo") msg1 := []byte("hello bar!") sig0 := new(SignatureMinPk).Sign(sk0, msg0, dstMinPk) sig1 := new(SignatureMinPk).Sign(sk1, msg1, dstMinPk) // Verify if !sig0.Verify(true, pk0, false, msg0, dstMinPk) { t.Error("verify sig0") } if !sig1.Verify(true, pk1, false, msg1, dstMinPk) { t.Error("verify sig1") } if !new(SignatureMinPk).VerifyCompressed(sig1.Compress(), true, pk1.Compress(), false, msg1, dstMinPk) { t.Error("verify sig1") } // Batch verify if !sig0.AggregateVerify(true, []*PublicKeyMinPk{pk0}, false, []Message{msg0}, dstMinPk) { t.Error("aggregate verify sig0") } // Verify compressed inputs if !new(SignatureMinPk).AggregateVerifyCompressed(sig0.Compress(), true, [][]byte{pk0.Compress()}, false, []Message{msg0}, dstMinPk) { t.Error("aggregate verify sig0 compressed") } // Verify serialized inputs if !new(SignatureMinPk).AggregateVerifyCompressed(sig0.Compress(), true, [][]byte{pk0.Serialize()}, false, []Message{msg0}, dstMinPk) { t.Error("aggregate verify sig0 serialized") } // Compressed with empty pk var emptyPk []byte if new(SignatureMinPk).VerifyCompressed(sig0.Compress(), true, emptyPk, false, msg0, dstMinPk) { t.Error("verify sig compressed inputs") } // Wrong message if sig0.Verify(true, pk0, false, msg1, dstMinPk) { t.Error("Expected Verify to return false") } // Wrong key if sig0.Verify(true, pk1, false, msg0, dstMinPk) { t.Error("Expected Verify to return false") } // Wrong sig if sig1.Verify(true, pk0, false, msg0, dstMinPk) { t.Error("Expected Verify to return false") } } func TestSignVerifyAugMinPk(t *testing.T) { t.Parallel() sk := genRandomKeyMinPk() pk := new(PublicKeyMinPk).From(sk) msg := []byte("hello foo") aug := []byte("augmentation") sig := new(SignatureMinPk).Sign(sk, msg, dstMinPk, aug) if !sig.Verify(true, pk, false, msg, dstMinPk, aug) { t.Error("verify sig") } aug2 := []byte("augmentation2") if 
sig.Verify(true, pk, false, msg, dstMinPk, aug2) { t.Error("verify sig, wrong augmentation") } if sig.Verify(true, pk, false, msg, dstMinPk) { t.Error("verify sig, no augmentation") } // TODO: augmentation with aggregate verify } func TestSignVerifyEncodeMinPk(t *testing.T) { t.Parallel() sk := genRandomKeyMinPk() pk := new(PublicKeyMinPk).From(sk) msg := []byte("hello foo") sig := new(SignatureMinPk).Sign(sk, msg, dstMinPk, false) if !sig.Verify(true, pk, false, msg, dstMinPk, false) { t.Error("verify sig") } if sig.Verify(true, pk, false, msg, dstMinPk) { t.Error("verify sig expected fail, wrong hashing engine") } if sig.Verify(true, pk, false, msg, dstMinPk, 0) { t.Error("verify sig expected fail, illegal argument") } } func TestSignVerifyAggregateMinPk(t *testing.T) { t.Parallel() for size := 1; size < 20; size++ { sks, msgs, _, pubks, _, err := generateBatchTestDataUncompressedMinPk(size) if err { t.Error("Error generating test data") return } // All signers sign the same message sigs := make([]*SignatureMinPk, 0) for i := 0; i < size; i++ { sigs = append(sigs, new(SignatureMinPk).Sign(sks[i], msgs[0], dstMinPk)) } agProj := new(AggregateSignatureMinPk) if !agProj.Aggregate(sigs, false) { t.Error("Aggregate unexpectedly returned nil") return } agSig := agProj.ToAffine() if !agSig.FastAggregateVerify(false, pubks, msgs[0], dstMinPk) { t.Errorf("failed to verify size %d", size) } // Negative test if agSig.FastAggregateVerify(false, pubks, msgs[0][1:], dstMinPk) { t.Errorf("failed to not verify size %d", size) } // Test compressed signature aggregation compSigs := make([][]byte, size) for i := 0; i < size; i++ { compSigs[i] = sigs[i].Compress() } agProj = new(AggregateSignatureMinPk) if !agProj.AggregateCompressed(compSigs, false) { t.Error("AggregateCompressed unexpectedly returned nil") return } agSig = agProj.ToAffine() if !agSig.FastAggregateVerify(false, pubks, msgs[0], dstMinPk) { t.Errorf("failed to verify size %d", size) } // Negative test if agSig.FastAggregateVerify(false, pubks, msgs[0][1:], dstMinPk) { t.Errorf("failed to not verify size %d", size) } } } func TestSignMultipleVerifyAggregateMinPk(t *testing.T) { t.Parallel() msgCount := 5 for size := 1; size < 20; size++ { msgs := make([]Message, 0) sks := make([]*SecretKey, 0) pks := make([]*PublicKeyMinPk, 0) // Generate messages for i := 0; i < msgCount; i++ { msg := Message(fmt.Sprintf("blst is a blast!! 
%d %d", i, size)) msgs = append(msgs, msg) } // Generate keypairs for i := 0; i < size; i++ { priv := genRandomKeyMinPk() sks = append(sks, priv) pks = append(pks, new(PublicKeyMinPk).From(priv)) } // All signers sign each message aggSigs := make([]*SignatureMinPk, 0) aggPks := make([]*PublicKeyMinPk, 0) for i := 0; i < msgCount; i++ { sigsToAgg := make([]*SignatureMinPk, 0) pksToAgg := make([]*PublicKeyMinPk, 0) for j := 0; j < size; j++ { sigsToAgg = append(sigsToAgg, new(SignatureMinPk).Sign(sks[j], msgs[i], dstMinPk)) pksToAgg = append(pksToAgg, pks[j]) } agSig := new(AggregateSignatureMinPk) if !agSig.Aggregate(sigsToAgg, true) { t.Error("failed to aggregate") } afSig := agSig.ToAffine() agPk := new(AggregatePublicKeyMinPk) agPk.Aggregate(pksToAgg, false) afPk := agPk.ToAffine() aggSigs = append(aggSigs, afSig) aggPks = append(aggPks, afPk) // Verify aggregated signature and pk if !afSig.Verify(false, afPk, false, msgs[i], dstMinPk) { t.Errorf("failed to verify single aggregate size %d", size) } } randFn := func(s *Scalar) { var rbytes [BLST_SCALAR_BYTES]byte _, err := rand.Read(rbytes[:]) if err != nil { t.Error(err.Error()) } s.FromBEndian(rbytes[:]) } // Verify randBits := 64 if !new(SignatureMinPk).MultipleAggregateVerify(aggSigs, true, aggPks, false, msgs, dstMinPk, randFn, randBits) { t.Errorf("failed to verify multiple aggregate size %d", size) } // Negative test if new(SignatureMinPk).MultipleAggregateVerify(aggSigs, true, aggPks, false, msgs, dstMinPk[1:], randFn, randBits) { t.Errorf("failed to not verify multiple aggregate size %d", size) } } } func TestBatchUncompressMinPk(t *testing.T) { t.Parallel() size := 128 var points []*P2Affine var compPoints [][]byte for i := 0; i < size; i++ { msg := Message(fmt.Sprintf("blst is a blast!! 
%d", i)) p2 := HashToG2(msg, dstMinPk).ToAffine() points = append(points, p2) compPoints = append(compPoints, p2.Compress()) } uncompPoints := new(SignatureMinPk).BatchUncompress(compPoints) if uncompPoints == nil { t.Errorf("BatchUncompress returned nil size %d", size) } for i := 0; i < size; i++ { if !points[i].Equals(uncompPoints[i]) { t.Errorf("Uncompressed point does not equal initial point %d", i) } } } func BenchmarkCoreSignMinPk(b *testing.B) { var ikm = [...]byte{ 0x93, 0xad, 0x7e, 0x65, 0xde, 0xad, 0x05, 0x2a, 0x08, 0x3a, 0x91, 0x0c, 0x8b, 0x72, 0x85, 0x91, 0x46, 0x4c, 0xca, 0x56, 0x60, 0x5b, 0xb0, 0x56, 0xed, 0xfe, 0x2b, 0x60, 0xa6, 0x3c, 0x48, 0x99} sk := KeyGen(ikm[:]) defer sk.Zeroize() msg := []byte("hello foo") for i := 0; i < b.N; i++ { new(SignatureMinPk).Sign(sk, msg, dstMinPk) } } func BenchmarkCoreVerifyMinPk(b *testing.B) { var ikm = [...]byte{ 0x93, 0xad, 0x7e, 0x65, 0xde, 0xad, 0x05, 0x2a, 0x08, 0x3a, 0x91, 0x0c, 0x8b, 0x72, 0x85, 0x91, 0x46, 0x4c, 0xca, 0x56, 0x60, 0x5b, 0xb0, 0x56, 0xed, 0xfe, 0x2b, 0x60, 0xa6, 0x3c, 0x48, 0x99} sk := KeyGen(ikm[:]) defer sk.Zeroize() pk := new(PublicKeyMinPk).From(sk) msg := []byte("hello foo") sig := new(SignatureMinPk).Sign(sk, msg, dstMinPk) // Verify for i := 0; i < b.N; i++ { if !sig.Verify(true, pk, false, msg, dstMinPk) { b.Fatal("verify sig") } } } func BenchmarkCoreVerifyAggregateMinPk(b *testing.B) { run := func(size int) func(b *testing.B) { return func(b *testing.B) { b.Helper() msgs, _, pubks, agsig, err := generateBatchTestDataMinPk(size) if err { b.Fatal("Error generating test data") } b.ResetTimer() for i := 0; i < b.N; i++ { if !new(SignatureMinPk).AggregateVerifyCompressed(agsig, true, pubks, false, msgs, dstMinPk) { b.Fatal("failed to verify") } } } } b.Run("1", run(1)) b.Run("10", run(10)) b.Run("50", run(50)) b.Run("100", run(100)) b.Run("300", run(300)) b.Run("1000", run(1000)) b.Run("4000", run(4000)) } func BenchmarkVerifyAggregateUncompressedMinPk(b *testing.B) { run := func(size int) func(b *testing.B) { return func(b *testing.B) { b.Helper() _, msgs, _, pubks, agsig, err := generateBatchTestDataUncompressedMinPk(size) if err { b.Fatal("Error generating test data") } b.ResetTimer() for i := 0; i < b.N; i++ { if !agsig.AggregateVerify(true, pubks, false, msgs, dstMinPk) { b.Fatal("failed to verify") } } } } b.Run("1", run(1)) b.Run("10", run(10)) b.Run("50", run(50)) b.Run("100", run(100)) b.Run("300", run(300)) b.Run("1000", run(1000)) b.Run("4000", run(4000)) } func BenchmarkCoreAggregateMinPk(b *testing.B) { run := func(size int) func(b *testing.B) { return func(b *testing.B) { b.Helper() _, sigs, _, _, err := generateBatchTestDataMinPk(size) if err { b.Fatal("Error generating test data") } b.ResetTimer() for i := 0; i < b.N; i++ { var agg AggregateSignatureMinPk agg.AggregateCompressed(sigs, true) } } } b.Run("1", run(1)) b.Run("10", run(10)) b.Run("50", run(50)) b.Run("100", run(100)) b.Run("300", run(300)) b.Run("1000", run(1000)) b.Run("4000", run(4000)) } func genRandomKeyMinPk() *SecretKey { // Generate 32 bytes of randomness var ikm [32]byte _, err := rand.Read(ikm[:]) if err != nil { return nil } return KeyGen(ikm[:]) } func generateBatchTestDataMinPk(size int) (msgs []Message, sigs [][]byte, pubks [][]byte, agsig []byte, err bool) { err = false for i := 0; i < size; i++ { msg := Message(fmt.Sprintf("blst is a blast!! %d", i)) msgs = append(msgs, msg) priv := genRandomKeyMinPk() sigs = append(sigs, new(SignatureMinPk).Sign(priv, msg, dstMinPk). 
Compress()) pubks = append(pubks, new(PublicKeyMinPk).From(priv).Compress()) } agProj := new(AggregateSignatureMinPk) if !agProj.AggregateCompressed(sigs, true) { fmt.Println("AggregateCompressed unexpectedly returned nil") err = true return //nolint:revive } agAff := agProj.ToAffine() if agAff == nil { fmt.Println("ToAffine unexpectedly returned nil") err = true return //nolint:revive } agsig = agAff.Compress() return //nolint:revive } func generateBatchTestDataUncompressedMinPk(size int) (sks []*SecretKey, msgs []Message, sigs []*SignatureMinPk, //nolint:unparam pubks []*PublicKeyMinPk, agsig *SignatureMinPk, err bool) { err = false for i := 0; i < size; i++ { msg := Message(fmt.Sprintf("blst is a blast!! %d", i)) msgs = append(msgs, msg) priv := genRandomKeyMinPk() sks = append(sks, priv) sigs = append(sigs, new(SignatureMinPk).Sign(priv, msg, dstMinPk)) pubks = append(pubks, new(PublicKeyMinPk).From(priv)) } agProj := new(AggregateSignatureMinPk) if !agProj.Aggregate(sigs, true) { fmt.Println("Aggregate unexpectedly returned nil") err = true return //nolint:revive } agsig = agProj.ToAffine() return //nolint:revive } func BenchmarkBatchUncompressMinPk(b *testing.B) { size := 128 var compPoints [][]byte for i := 0; i < size; i++ { msg := Message(fmt.Sprintf("blst is a blast!! %d", i)) p2 := HashToG2(msg, dstMinPk).ToAffine() compPoints = append(compPoints, p2.Compress()) } b.Run("Single", func(b *testing.B) { b.ResetTimer() b.ReportAllocs() var tmp SignatureMinPk for i := 0; i < b.N; i++ { for j := 0; j < size; j++ { if tmp.Uncompress(compPoints[j]) == nil { b.Fatal("could not uncompress point") } } } }) b.Run("Batch", func(b *testing.B) { b.ResetTimer() b.ReportAllocs() var tmp SignatureMinPk for i := 0; i < b.N; i++ { if tmp.BatchUncompress(compPoints) == nil { b.Fatal("could not batch uncompress points") } } }) } func TestSignVerifyAggregateValidatesInfinitePubkeyMinPk(t *testing.T) { t.Parallel() size := 20 sks, msgs, _, pubks, _, err := generateBatchTestDataUncompressedMinPk(size) if err { t.Error("Error generating test data") return } // All signers sign the same message sigs := make([]*SignatureMinPk, size) for i := range sigs { sigs[i] = new(SignatureMinPk).Sign(sks[i], msgs[i], dstMinPk) } // Single message: Infinite pubkeys and signature zeroKey := new(PublicKeyMinPk) zeroSig := new(SignatureMinPk) agProj := new(AggregateSignatureMinPk) if !agProj.Aggregate([]*SignatureMinPk{zeroSig}, false) { t.Error("Aggregate unexpectedly returned nil") return } agSig := agProj.ToAffine() if agSig.AggregateVerify(false, []*PublicKeyMinPk{zeroKey}, false, [][]byte{msgs[0]}, dstMinPk) { t.Error("failed to NOT verify signature") } // Replace firstkey with infinite pubkey. 
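// The aggregate signature is patched to match, so the infinite public
// key alone should make AggregateVerify fail, even though pksVerify is
// passed as false below.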
pubks[0] = zeroKey sigs[0] = zeroSig agProj = new(AggregateSignatureMinPk) if !agProj.Aggregate(sigs, false) { t.Error("Aggregate unexpectedly returned nil") return } agSig = agProj.ToAffine() if agSig.AggregateVerify(false, pubks, false, msgs, dstMinPk) { t.Error("failed to NOT verify signature") } } func TestEmptyMessageMinPk(t *testing.T) { t.Parallel() msg := []byte("") var sk_bytes = []byte {99, 64, 58, 175, 15, 139, 113, 184, 37, 222, 127, 204, 233, 209, 34, 8, 61, 27, 85, 251, 68, 31, 255, 214, 8, 189, 190, 71, 198, 16, 210, 91}; sk := new(SecretKey).Deserialize(sk_bytes) pk := new(PublicKeyMinPk).From(sk) sig := new(SignatureMinPk).Sign(sk, msg, dstMinPk) if !new(SignatureMinPk).VerifyCompressed(sig.Compress(), true, pk.Compress(), false, msg, dstMinPk) { t.Error("failed to verify empty message") } } func TestEmptySignatureMinPk(t *testing.T) { t.Parallel() msg := []byte("message") var sk_bytes = []byte {99, 64, 58, 175, 15, 139, 113, 184, 37, 222, 127, 204, 233, 209, 34, 8, 61, 27, 85, 251, 68, 31, 255, 214, 8, 189, 190, 71, 198, 16, 210, 91}; sk := new(SecretKey).Deserialize(sk_bytes) pk := new(PublicKeyMinPk).From(sk) var emptySig []byte if new(SignatureMinPk).VerifyCompressed(emptySig, true, pk.Compress(), false, msg, dstMinPk) { t.Error("failed to NOT verify empty signature") } } func TestMultiScalarP1(t *testing.T) { t.Parallel() const npoints = 1027 scalars := make([]byte, npoints*16) _, err := rand.Read(scalars) if err != nil { t.Error(err.Error()) return } points := make([]P1, npoints) refs := make([]P1, npoints) generator := P1Generator() for i := range points { points[i] = *generator.Mult(scalars[i*4:(i+1)*4]) refs[i] = *points[i].Mult(scalars[i*16:(i+1)*16], 128) if i < 27 { ref := P1s(refs[:i+1]).Add() ret := P1s(points[:i+1]).Mult(scalars, 128) if !ref.Equals(ret) { t.Error("failed self-consistency multi-scalar test") } } } ref := P1s(refs).Add() ret := P1s(points).Mult(scalars, 128) if !ref.Equals(ret) { t.Error("failed self-consistency multi-scalar test") } } func BenchmarkMultiScalarP1(b *testing.B) { const npoints = 200000 scalars := make([]byte, npoints*32) _, err := rand.Read(scalars) if err != nil { b.Fatal(err.Error()) } temp := make([]P1, npoints) generator := P1Generator() for i := range temp { temp[i] = *generator.Mult(scalars[i*4:(i+1)*4]) } points := P1s(temp).ToAffine() run := func(points []P1Affine) func(b *testing.B) { return func(b *testing.B) { b.Helper() for i:=0; i window*ncpus { //nolint:nestif nx = 1 wnd = bits.Len(uint(ncpus)/4) if (window + wnd) > 18 { wnd = window - wnd } else { wnd = (nbits / window + ncpus - 1) / ncpus; if (nbits / (window + 1) + ncpus - 1) / ncpus < wnd { wnd = window + 1; } else { wnd = window; } } } else { nx = 2 wnd = window-2 for (nbits/wnd+1)*nx < ncpus { nx += 1 wnd = window - bits.Len(3*uint(nx)/2) } nx -= 1 wnd = window - bits.Len(3*uint(nx)/2) } ny = nbits/wnd + 1 wnd = nbits/ny + 1 return nx, ny, wnd } func pippenger_window_size(npoints int) int { wbits := bits.Len(uint(npoints)) if wbits > 13 { return wbits - 4 } if wbits > 5 { return wbits - 3 } return 2 } ================================================ FILE: bindings/go/blst_px.tgo ================================================ func PairingAggregatePkInG1(ctx Pairing, PK *P1Affine, pkValidate bool, sig *P2Affine, sigGroupcheck bool, msg []byte, optional ...[]byte) int { // aug var aug []byte if len(optional) > 0 { aug = optional[0] } r := C.blst_pairing_chk_n_aggr_pk_in_g1(&ctx[0], PK.asPtr(), C.bool(pkValidate), sig.asPtr(), C.bool(sigGroupcheck), 
ptrOrNil(msg), C.size_t(len(msg)), ptrOrNil(aug), C.size_t(len(aug))) return int(r) } func PairingMulNAggregatePkInG1(ctx Pairing, PK *P1Affine, pkValidate bool, sig *P2Affine, sigGroupcheck bool, rand *Scalar, randBits int, msg []byte, optional ...[]byte) int { // aug var aug []byte if len(optional) > 0 { aug = optional[0] } r := C.blst_pairing_chk_n_mul_n_aggr_pk_in_g1(&ctx[0], PK.asPtr(), C.bool(pkValidate), sig.asPtr(), C.bool(sigGroupcheck), &rand.cgo.b[0], C.size_t(randBits), ptrOrNil(msg), C.size_t(len(msg)), ptrOrNil(aug), C.size_t(len(aug))) return int(r) } // // Serialization/Deserialization. // // P1 Serdes func (p1 *P1Affine) Serialize() []byte { var out [BLST_P1_SERIALIZE_BYTES]byte C.blst_p1_affine_serialize((*C.byte)(&out[0]), &p1.cgo) return out[:] } func (p1 *P1Affine) Deserialize(in []byte) *P1Affine { if len(in) != BLST_P1_SERIALIZE_BYTES { return nil } if C.blst_p1_deserialize(&p1.cgo, (*C.byte)(&in[0])) != C.BLST_SUCCESS { return nil } return p1 } func (p1 *P1Affine) Compress() []byte { var out [BLST_P1_COMPRESS_BYTES]byte C.blst_p1_affine_compress((*C.byte)(&out[0]), &p1.cgo) return out[:] } func (p1 *P1Affine) Uncompress(in []byte) *P1Affine { if len(in) != BLST_P1_COMPRESS_BYTES { return nil } if C.blst_p1_uncompress(&p1.cgo, (*C.byte)(&in[0])) != C.BLST_SUCCESS { return nil } return p1 } func (p1 *P1Affine) InG1() bool { return bool(C.blst_p1_affine_in_g1(&p1.cgo)) } func (*P1Affine) BatchUncompress(in [][]byte) []*P1Affine { // Allocate space for all of the resulting points. Later we'll save pointers // and return those so that the result could be used in other functions, // such as MultipleAggregateVerify. n := len(in) points := make([]P1Affine, n) pointsPtrs := make([]*P1Affine, n) numThreads := numThreads(n) // Each thread will determine next message to process by atomically // incrementing curItem, process corresponding point, and // repeat until n is exceeded. Each thread will send a result (true for // success, false for failure) into the channel when complete. resCh := make(chan bool, numThreads) valid := int32(1) curItem := uint32(0) for tid := 0; tid < numThreads; tid++ { go func() { for atomic.LoadInt32(&valid) > 0 { // Get a work item work := atomic.AddUint32(&curItem, 1) - 1 if work >= uint32(n) { break } if points[work].Uncompress(in[work]) == nil { atomic.StoreInt32(&valid, 0) break } pointsPtrs[work] = &points[work] } if atomic.LoadInt32(&valid) > 0 { resCh <- true } else { resCh <- false } }() } // Collect the threads result := true for i := 0; i < numThreads; i++ { if ! <-resCh { result = false } } if atomic.LoadInt32(&valid) == 0 || !result { return nil } return pointsPtrs } func (p1 *P1) Serialize() []byte { var out [BLST_P1_SERIALIZE_BYTES]byte C.blst_p1_serialize((*C.byte)(&out[0]), &p1.cgo) return out[:] } func (p1 *P1) Compress() []byte { var out [BLST_P1_COMPRESS_BYTES]byte C.blst_p1_compress((*C.byte)(&out[0]), &p1.cgo) return out[:] } func (p1 *P1) MultAssign(scalarIf interface{}, optional ...int) *P1 { var nbits int var scalar *C.byte switch val := scalarIf.(type) { case []byte: scalar = (*C.byte)(&val[0]) nbits = len(val)*8 case *Scalar: scalar = &val.cgo.b[0] nbits = 255 default: panic(fmt.Sprintf("unsupported type %T", val)) } if len(optional) > 0 { nbits = optional[0] } C.blst_p1_mult(&p1.cgo, &p1.cgo, scalar, C.size_t(nbits)) return p1 } func (p1 *P1) Mult(scalarIf interface{}, optional ...int) *P1 { ret := *p1 return ret.MultAssign(scalarIf, optional...) 
} func (p1 *P1) AddAssign(pointIf interface{}) *P1 { switch val := pointIf.(type) { case *P1: C.blst_p1_add_or_double(&p1.cgo, &p1.cgo, &val.cgo) case *P1Affine: C.blst_p1_add_or_double_affine(&p1.cgo, &p1.cgo, &val.cgo) default: panic(fmt.Sprintf("unsupported type %T", val)) } return p1 } func (p1 *P1) Add(pointIf interface{}) *P1 { ret := *p1 return ret.AddAssign(pointIf) } func (p1 *P1) SubAssign(pointIf interface{}) *P1 { var x *C.blst_fp var affine C.bool switch val := pointIf.(type) { case *P1: x = &val.cgo.x affine = false case *P1Affine: x = &val.cgo.x affine = true default: panic(fmt.Sprintf("unsupported type %T", val)) } C.go_p1_sub_assign(&p1.cgo, x, affine) return p1 } func (p1 *P1) Sub(pointIf interface{}) *P1 { ret := *p1 return ret.SubAssign(pointIf) } func P1Generator() *P1 { return &cgo_p1Generator } // 'acc += point * scalar', passing 'nil' for 'point' means "use the // group generator point" func (acc *P1) MultNAccumulate(pointIf interface{}, scalarIf interface{}, optional ...int) *P1 { var x *C.blst_fp var affine C.bool if pointIf != nil { switch val := pointIf.(type) { case *P1: x = &val.cgo.x affine = false case *P1Affine: x = &val.cgo.x affine = true default: panic(fmt.Sprintf("unsupported type %T", val)) } } var nbits int var scalar *C.byte switch val := scalarIf.(type) { case []byte: scalar = (*C.byte)(&val[0]) nbits = len(val)*8 case *Scalar: scalar = &val.cgo.b[0] nbits = 255 default: panic(fmt.Sprintf("unsupported type %T", val)) } if len(optional) > 0 { nbits = optional[0] } C.go_p1_mult_n_acc(&acc.cgo, x, affine, scalar, C.size_t(nbits)) return acc } // // Affine // func (p *P1) ToAffine() *P1Affine { var pa P1Affine C.blst_p1_to_affine(&pa.cgo, &p.cgo) return &pa } func (p *P1) FromAffine(pa *P1Affine) { C.blst_p1_from_affine(&p.cgo, &pa.cgo) } // // Hash // func HashToG1(msg []byte, dst []byte, optional ...[]byte) *P1 { // aug var q P1 var aug []byte if len(optional) > 0 { aug = optional[0] } C.blst_hash_to_g1(&q.cgo, ptrOrNil(msg), C.size_t(len(msg)), ptrOrNil(dst), C.size_t(len(dst)), ptrOrNil(aug), C.size_t(len(aug))) return &q } func EncodeToG1(msg []byte, dst []byte, optional ...[]byte) *P1 { // aug var q P1 var aug []byte if len(optional) > 0 { aug = optional[0] } C.blst_encode_to_g1(&q.cgo, ptrOrNil(msg), C.size_t(len(msg)), ptrOrNil(dst), C.size_t(len(dst)), ptrOrNil(aug), C.size_t(len(aug))) return &q } // // Multi-point/scalar operations // func P1sToAffine(points []*P1, optional ...int) P1Affines { var npoints int if len(optional) > 0 { npoints = optional[0] } else { npoints = len(points) } ret := make([]P1Affine, npoints) _cgoCheckPointer := func(...interface{}) {} C.blst_p1s_to_affine(&ret[0].cgo, (**C.blst_p1)(unsafe.Pointer(&points[0])), C.size_t(npoints)) return ret } func (points P1s) ToAffine(optional ...P1Affines) P1Affines { npoints := len(points) var ret P1Affines if len(optional) > 0 { // used in benchmark ret = optional[0] if len(ret) < npoints { panic("npoints mismatch") } } else { ret = make([]P1Affine, npoints) } if maxProcs < 2 || npoints < 768 { C.go_p1slice_to_affine(&ret[0].cgo, &points[0].cgo, C.size_t(npoints)) return ret } nslices := (npoints + 511) / 512 if nslices > maxProcs { nslices = maxProcs } delta, rem := npoints/nslices + 1, npoints%nslices var wg sync.WaitGroup wg.Add(nslices) for x := 0; x < npoints; x += delta { if rem == 0 { delta -= 1 } rem -= 1 go func(out *P1Affine, inp *P1, delta int) { C.go_p1slice_to_affine(&out.cgo, &inp.cgo, C.size_t(delta)) wg.Done() }(&ret[x], &points[x], delta) } wg.Wait() return ret } 
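// Illustrative only, not part of the bindings: a minimal sketch of driving
// the parallel conversion above; the helper itself is hypothetical, while
// P1, P1s, P1Affines and P1Generator are this package's own types.
func exampleP1sToAffine(n int) P1Affines {
	g := P1Generator()
	points := make([]P1, n)
	for i := range points {
		points[i] = *g.Mult([]byte{byte(i + 1)}, 8)
	}
	// For fewer than 768 points (or a single core) the conversion runs on
	// the calling goroutine; otherwise it is split into up to maxProcs
	// chunks whose sizes differ by at most one, courtesy of the delta/rem
	// arithmetic above.
	return P1s(points).ToAffine()
}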
// // Batch addition // func P1AffinesAdd(points []*P1Affine, optional ...int) *P1 { var npoints int if len(optional) > 0 { npoints = optional[0] } else { npoints = len(points) } var ret P1 _cgoCheckPointer := func(...interface{}) {} C.blst_p1s_add(&ret.cgo, (**C.blst_p1_affine)(unsafe.Pointer(&points[0])), C.size_t(npoints)) return &ret } func (points P1Affines) Add() *P1 { npoints := len(points) if maxProcs < 2 || npoints < 768 { var ret P1 C.go_p1slice_add(&ret.cgo, &points[0].cgo, C.size_t(npoints)) return &ret } nslices := (npoints + 511) / 512 if nslices > maxProcs { nslices = maxProcs } delta, rem := npoints/nslices + 1, npoints%nslices msgs := make(chan P1, nslices) for x := 0; x < npoints; x += delta { if rem == 0 { delta -= 1 } rem -= 1 go func(points *P1Affine, delta int) { var ret P1 C.go_p1slice_add(&ret.cgo, &points.cgo, C.size_t(delta)) msgs <- ret }(&points[x], delta) } ret := <- msgs for i := 1; i < nslices; i++ { msg := <- msgs C.blst_p1_add_or_double(&ret.cgo, &ret.cgo, &msg.cgo) } return &ret } func (points P1s) Add() *P1 { return points.ToAffine().Add() } // // Multi-scalar multiplication // func P1AffinesMult(pointsIf interface{}, scalarsIf interface{}, nbits int) *P1 { var npoints int switch val := pointsIf.(type) { case []*P1Affine: npoints = len(val) case []P1Affine: npoints = len(val) case P1Affines: npoints = len(val) default: panic(fmt.Sprintf("unsupported type %T", val)) } nbytes := (nbits+7)/8 var scalars []*C.byte switch val := scalarsIf.(type) { case []byte: if len(val) < npoints*nbytes { return nil } case [][]byte: if len(val) < npoints { return nil } scalars = make([]*C.byte, npoints) for i := range scalars { scalars[i] = (*C.byte)(&val[i][0]) } case []Scalar: if len(val) < npoints { return nil } if nbits <= 248 { scalars = make([]*C.byte, npoints) for i := range scalars { scalars[i] = &val[i].cgo.b[0] } } case []*Scalar: if len(val) < npoints { return nil } scalars = make([]*C.byte, npoints) for i := range scalars { scalars[i] = &val[i].cgo.b[0] } default: panic(fmt.Sprintf("unsupported type %T",val)) } numThreads := numThreads(0) if numThreads < 2 { sz := int(C.blst_p1s_mult_pippenger_scratch_sizeof(C.size_t(npoints)))/8 scratch := make([]uint64, sz) pointsBySlice := [2]*C.blst_p1_affine{nil, nil} var p_points **C.blst_p1_affine switch val := pointsIf.(type) { case []*P1Affine: p_points = (**C.blst_p1_affine)(unsafe.Pointer(&val[0])) case []P1Affine: pointsBySlice[0] = &val[0].cgo p_points = &pointsBySlice[0] case P1Affines: pointsBySlice[0] = &val[0].cgo p_points = &pointsBySlice[0] default: // type is already vetted } scalarsBySlice := [2]*C.byte{nil, nil} var p_scalars **C.byte switch val := scalarsIf.(type) { case []byte: scalarsBySlice[0] = (*C.byte)(&val[0]) p_scalars = &scalarsBySlice[0] case [][]byte: p_scalars = &scalars[0] case []Scalar: if nbits > 248 { scalarsBySlice[0] = &val[0].cgo.b[0] p_scalars = &scalarsBySlice[0] } else { p_scalars = &scalars[0] } case []*Scalar: p_scalars = &scalars[0] default: // type is already vetted } var ret P1 _cgoCheckPointer := func(...interface{}) {} C.blst_p1s_mult_pippenger(&ret.cgo, p_points, C.size_t(npoints), p_scalars, C.size_t(nbits), (*C.limb_t)(&scratch[0])) for i := range(scalars) { scalars[i] = nil } return &ret } if npoints < 32 { if numThreads > npoints { numThreads = npoints } curItem := uint32(0) msgs := make(chan P1, numThreads) for tid := 0; tid < numThreads; tid++ { go func() { var acc P1 for { workItem := int(atomic.AddUint32(&curItem, 1) - 1) if workItem >= npoints { break } var point 
*P1Affine switch val := pointsIf.(type) { case []*P1Affine: point = val[workItem] case []P1Affine: point = &val[workItem] case P1Affines: point = &val[workItem] default: // type is already vetted } var scalar *C.byte switch val := scalarsIf.(type) { case []byte: scalar = (*C.byte)(&val[workItem*nbytes]) case [][]byte: scalar = scalars[workItem] case []Scalar: if nbits > 248 { scalar = &val[workItem].cgo.b[0] } else { scalar = scalars[workItem] } case []*Scalar: scalar = scalars[workItem] default: // type is already vetted } C.go_p1_mult_n_acc(&acc.cgo, &point.cgo.x, true, scalar, C.size_t(nbits)) } msgs <- acc }() } ret := <-msgs for tid := 1; tid < numThreads; tid++ { point := <- msgs C.blst_p1_add_or_double(&ret.cgo, &ret.cgo, &point.cgo); } for i := range(scalars) { scalars[i] = nil } return &ret } // this is sizeof(scratch[0]) sz := int(C.blst_p1s_mult_pippenger_scratch_sizeof(0))/8 nx, ny, window := breakdown(nbits, pippenger_window_size(npoints), numThreads) // |grid[]| holds "coordinates" and place for result grid := make([]struct { x, dx, y, dy int point P1 }, nx*ny) dx := npoints/nx y := window*(ny-1) total := 0 for ; total < nx; total++ { grid[total].x = total*dx grid[total].dx = dx grid[total].y = y grid[total].dy = nbits - y } grid[total-1].dx = npoints - grid[total-1].x for y > 0 { y -= window for i := 0; i < nx; i++ { grid[total].x = grid[i].x grid[total].dx = grid[i].dx grid[total].y = y grid[total].dy = window total++ } } if numThreads > total { numThreads = total } msgsCh := make(chan int, ny) rowSync := make([]int32, ny) // count up to |nx| curItem := int32(0) for tid := 0; tid < numThreads; tid++ { go func() { scratch := make([]uint64, sz << uint(window-1)) pointsBySlice := [2]*C.blst_p1_affine{nil, nil} scalarsBySlice := [2]*C.byte{nil, nil} _cgoCheckPointer := func(...interface{}) {} for { workItem := atomic.AddInt32(&curItem, 1) - 1 if int(workItem) >= total { break } x := grid[workItem].x y := grid[workItem].y var p_points **C.blst_p1_affine switch val := pointsIf.(type) { case []*P1Affine: p_points = (**C.blst_p1_affine)(unsafe.Pointer(&val[x])) case []P1Affine: pointsBySlice[0] = &val[x].cgo p_points = &pointsBySlice[0] case P1Affines: pointsBySlice[0] = &val[x].cgo p_points = &pointsBySlice[0] default: // type is already vetted } var p_scalars **C.byte switch val := scalarsIf.(type) { case []byte: scalarsBySlice[0] = (*C.byte)(&val[x*nbytes]) p_scalars = &scalarsBySlice[0] case [][]byte: p_scalars = &scalars[x] case []Scalar: if nbits > 248 { scalarsBySlice[0] = &val[x].cgo.b[0] p_scalars = &scalarsBySlice[0] } else { p_scalars = &scalars[x] } case []*Scalar: p_scalars = &scalars[x] default: // type is already vetted } C.blst_p1s_tile_pippenger(&grid[workItem].point.cgo, p_points, C.size_t(grid[workItem].dx), p_scalars, C.size_t(nbits), (*C.limb_t)(&scratch[0]), C.size_t(y), C.size_t(window)); if atomic.AddInt32(&rowSync[y/window], 1) == int32(nx) { msgsCh <- y // "row" is done } else { runtime.Gosched() // be nice to the application } } pointsBySlice[0] = nil scalarsBySlice[0] = nil }() } var ret P1 rows := make([]bool, ny) row := 0 // actually index in |grid[]| for i := 0; i < ny; i++ { // we expect |ny| messages, one per "row" y := <- msgsCh rows[y/window] = true // mark the "row" for grid[row].y == y { // if it's current "row", process it for row < total && grid[row].y == y { C.blst_p1_add_or_double(&ret.cgo, &ret.cgo, &grid[row].point.cgo) row++ } if y == 0 { break // one can as well 'return &ret' here } for j := 0; j < window; j++ { 
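// Doubling 'window' times multiplies the accumulator by 2^window,
// aligning what has been summed so far with the next, less
// significant row of buckets.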
C.blst_p1_double(&ret.cgo, &ret.cgo) } y -= window if !rows[y/window] { // see if next "row" was marked already break } } } for i := range(scalars) { scalars[i] = nil } return &ret } func (points P1Affines) Mult(scalarsIf interface{}, nbits int) *P1 { return P1AffinesMult(points, scalarsIf, nbits) } func (points P1s) Mult(scalarsIf interface{}, nbits int) *P1 { return points.ToAffine().Mult(scalarsIf, nbits) } // // Group-check // func P1AffinesValidate(pointsIf interface{}) bool { var npoints int switch val := pointsIf.(type) { case []*P1Affine: npoints = len(val) case []P1Affine: npoints = len(val) case P1Affines: npoints = len(val) default: panic(fmt.Sprintf("unsupported type %T", val)) } numThreads := numThreads(npoints) if numThreads < 2 { for i := 0; i < npoints; i++ { var point *P1Affine switch val := pointsIf.(type) { case []*P1Affine: point = val[i] case []P1Affine: point = &val[i] case P1Affines: point = &val[i] default: panic(fmt.Sprintf("unsupported type %T", val)) } if !C.go_p1_affine_validate(&point.cgo, true) { return false } } return true } valid := int32(1) curItem := uint32(0) var wg sync.WaitGroup wg.Add(numThreads) for tid := 0; tid < numThreads; tid++ { go func() { for atomic.LoadInt32(&valid) != 0 { work := atomic.AddUint32(&curItem, 1) - 1 if work >= uint32(npoints) { break } var point *P1Affine switch val := pointsIf.(type) { case []*P1Affine: point = val[work] case []P1Affine: point = &val[work] case P1Affines: point = &val[work] default: panic(fmt.Sprintf("unsupported type %T", val)) } if !C.go_p1_affine_validate(&point.cgo, true) { atomic.StoreInt32(&valid, 0) break } } wg.Done() }() } wg.Wait() return atomic.LoadInt32(&valid) != 0 } func (points P1Affines) Validate() bool { return P1AffinesValidate(points) } ================================================ FILE: bindings/go/blst_wasm.go ================================================ //go:build wasm package not_supported ================================================ FILE: bindings/go/cgo_assembly.S ================================================ #include "assembly.S" ================================================ FILE: bindings/go/cgo_server.c ================================================ #include "server.c" ================================================ FILE: bindings/go/generate.py ================================================ #!/usr/bin/env python3 import os import sys import re import subprocess here = re.split(r'/(?=[^/]*$)', sys.argv[0]) if len(here) > 1: os.chdir(here[0]) for dir in re.split(r':', os.getenv("GOPATH")): goimports = dir + "/bin/goimports" if os.path.isfile(goimports) and os.access(goimports, os.X_OK): break goimports = None if goimports is None: version = subprocess.check_output(["go", "version"]).decode('ascii') v = re.search(r'version go([0-9]+\.[0-9]+)', version) if not v: raise OSError(2, "unparseable output from 'go version'") if float(v.group(1)) < 1.17: advice = "'go get golang.org/x/tools/cmd/goimports'" else: advice = "'go install golang.org/x/tools/cmd/goimports@latest'" print("'goimports' is not found on $GOPATH, install with", file=sys.stderr) print(advice, file=sys.stderr) sys.exit(1) outFile = 'blst.go' def concatFile(fout, fin, removeImports): for line in fin: if removeImports and 'import' in line: while ')' not in line: line = fin.readline() continue print(line, file=fout, end='') def remap(fout, fin, mapping, dont_touch, removeImports): for line in fin: if removeImports and 'import' in line: while ')' not in line: line = fin.readline() continue for (a, b) in 
dont_touch: line = line.replace(a, b) for (a, b) in mapping: line = line.replace(a, a+"_tmp") line = line.replace(b, b+"_tmp") line = line.replace(a+"_tmp", b) line = line.replace(b+"_tmp", a) for (a, b) in dont_touch: line = line.replace(b, a) print(line, file=fout, end='') fout = open(outFile, "w") print("// !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!", file=fout) print("// DO NOT MODIFY THIS FILE!!", file=fout) print("// The file is generated from *.tgo by " + here[-1], file=fout) print("// !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!", file=fout) fin = open('blst.tgo', "r") concatFile(fout, fin, False) fin.close() # min-pk print("//", file=fout) print("// MIN-PK", file=fout) print("//", file=fout) fin = open('blst_minpk.tgo', "r") concatFile(fout, fin, True) fin.close() # These are strings that overlap with the mapping names but we don't # actually want to change. The second value should be a unique string. dont_touch = (('Fp12', 'foo1234'),) # We're going to swap these names to get from min-pk to min-sig mapping = [('P1', 'P2'), ('p1', 'p2'), ('Fp', 'Fp2'), ('C.blst_fp', 'C.blst_fp2'), ('G1', 'G2'), ('g1', 'g2') ] # min-sig print("//", file=fout) print("// MIN-SIG", file=fout) print("//", file=fout) with open('blst_minpk.tgo', "r") as fin: remap(fout, fin, mapping, dont_touch, True) # serdes and other functions fin = open('blst_px.tgo', "r") concatFile(fout, fin, True) fin.close() with open('blst_px.tgo', "r") as fin: remap(fout, fin, mapping, dont_touch, True) # final code fin = open('blst_misc.tgo', "r") concatFile(fout, fin, True) fin.close() fout.close() # Use goimports to generate the import list os.system(goimports + " -w blst.go") # Generate min-sig tests fout = open('blst_minsig_test.go', "w") print("// !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!", file=fout) print("// DO NOT EDIT THIS FILE!!", file=fout) print("// The file is generated from blst_minpk_test.go by " + here[-1], file=fout) print("// !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!", file=fout) mapping.append(('MinPk', 'MinSig')) with open('blst_minpk_test.go', "r") as fin: remap(fout, fin, mapping, dont_touch, False) fout.close() ================================================ FILE: bindings/go/rb_tree.go ================================================ /* * Copyright Supranational LLC * Licensed under the Apache License, Version 2.0, see LICENSE for details. * SPDX-License-Identifier: Apache-2.0 */ /* * Reimplement rb_tree.c, because C.call overhead is too high in * comparison to tree insertion subroutine. */ package blst import "bytes" /* * Red-black tree tailored for uniqueness test. Amount of messages to be * checked is known prior context initialization, implementation is * insert-only, failure is returned if message is already in the tree. 
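 *
 * A typical use, assuming a batch destined for AggregateVerify: call
 * Uniq(msgs) up front and reject the batch before any pairing work is
 * done if it returns false.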
*/ const red, black bool = true, false type node struct { leafs [2]*node data *[]byte colour bool } type rbTree struct { root *node nnodes uint nodes []node } func (tree *rbTree) insert(data *[]byte) bool { var nodes [64]*node /* visited nodes */ var dirs [64]byte /* taken directions */ var k uint /* walked distance */ for p := tree.root; p != nil && k < 64; k++ { cmp := bytes.Compare(*data, *p.data) if cmp == 0 { return false /* already in tree, no insertion */ } /* record the step */ nodes[k] = p if cmp > 0 { dirs[k] = 1 } else { dirs[k] = 0 } p = p.leafs[dirs[k]] } if k == 64 { return false } /* allocate new node */ z := &tree.nodes[tree.nnodes]; tree.nnodes++ z.data = data z.colour = red /* graft |z| */ if k > 0 { nodes[k-1].leafs[dirs[k-1]] = z } else { tree.root = z } /* re-balance |tree| */ for k >= 2 /* && IS_RED(y = nodes[k-1]) */ { y := nodes[k-1] if y.colour == black { //nolint:staticcheck break } ydir := dirs[k-2] x := nodes[k-2] /* |z|'s grandparent */ s := x.leafs[ydir^1] /* |z|'s uncle */ if s != nil && s.colour == red { //nolint:staticcheck,revive x.colour = red y.colour = black s.colour = black k -= 2 } else { if dirs[k-1] != ydir { /* | | * x x * / \ \ * y s -> z s * \ / * z y * / \ * ? ? */ t := y y = y.leafs[ydir^1] t.leafs[ydir^1] = y.leafs[ydir] y.leafs[ydir] = t } /* | | * x y * \ / \ * y s -> z x * / \ / \ * z ? ? s */ x.leafs[ydir] = y.leafs[ydir^1] y.leafs[ydir^1] = x x.colour = red y.colour = black if k > 2 { nodes[k-3].leafs[dirs[k-3]] = y } else { tree.root = y } break } } tree.root.colour = black return true } func Uniq(msgs []Message) bool { n := len(msgs) if n == 1 { //nolint:staticcheck return true } else if n == 2 { return !bytes.Equal(msgs[0], msgs[1]) } var tree rbTree tree.nodes = make([]node, n) for i := 0; i < n; i++ { if !tree.insert(&msgs[i]) { return false } } return true } ================================================ FILE: bindings/rust/Cargo.toml ================================================ [package] name = "blst" version = "0.3.16" authors = ["sean-sn "] edition = "2018" license = "Apache-2.0" description = "Bindings for blst BLS12-381 library" repository = "https://github.com/supranational/blst" readme = "README.md" categories = ["cryptography"] keywords = ["crypto", "bls", "signature", "asm", "wasm"] include = [ "**/*.rs", "/Cargo.toml", "/README.md", "/rustfmt.toml", "/blst/src/*.c", "/blst/src/*.h*", "/blst/build/**", "/blst/bindings/blst.h", "/blst/bindings/blst_aux.h", "/blst/bindings/blst.hpp", ] links = "blst" [features] # By default, compile with ADX extension if the host supports it. # Binary can be executed on systems similar to the host. default = [] # Compile in portable mode, without ISA extensions. # Binary can be executed on all systems. portable = [] # Enable ADX even if the host CPU doesn't support it. # Binary can be executed on Broadwell+ and Ryzen+ systems. force-adx = [] # Suppress multi-threading. # Engaged on wasm32 target architecture automatically. no-threads = [] # Add support for serializing SecretKey, not suitable for production. 
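# (This feature merely enables the optional `serde` dependency declared below.)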
serde-secret = ["serde"] [build-dependencies] cc = "1.0" [target.'cfg(target_env = "msvc")'.build-dependencies] glob = "0.3" [dependencies] zeroize = { version = "^1.1", features = ["zeroize_derive"] } serde = { version = "1.0.152", optional = true } [target.'cfg(not(any(target_arch="wasm32", target_os="none", target_os="unknown", target_os="uefi")))'.dependencies] threadpool = "^1.8.1" [dev-dependencies] rand = "0.8" rand_chacha = "0.3" rmp-serde = "1.1.1" # Uncomment if you want to execute the test suite with Rust 1.56 through 1.64. #byteorder = "=1.4.3" #rmp = "=0.8.12" #ppv-lite86 = "=0.2.17" [target.'cfg(any(unix, windows))'.dev-dependencies] criterion = "0.3" [[bench]] name = "blst_benches" harness = false [profile.release] #opt-level = 3 [badges] maintenance = { status = "actively-developed" } ================================================ FILE: bindings/rust/README.md ================================================ # blst [![Crates.io](https://img.shields.io/crates/v/blst.svg)](https://crates.io/crates/blst) The `blst` crate provides a Rust interface to the blst BLS12-381 signature library. ## Build [bindgen](https://github.com/rust-lang/rust-bindgen) is used to generate FFI bindings to blst.h. Then [build.rs](https://github.com/supranational/blst/blob/master/bindings/rust/build.rs) invokes the C compiler to compile everything into libblst.a within the Rust target build area. On Linux it's possible to choose the compiler by setting the `CC` environment variable. Everything can be built and run with the typical cargo commands: ``` cargo test cargo bench ``` If the target application crashes with an "illegal instruction" exception [after copying to an older system], activate the `portable` feature when building blst. Conversely, if you compile on an older Intel system, but will execute the binary on a newer one, consider instead activating the `force-adx` feature. Keep in mind, though, that [cc](https://crates.io/crates/cc) passes the value of the `CFLAGS` environment variable to the C compiler, and if it is set to contain specific flags, it can interfere with feature selection. `-D__BLST_PORTABLE__` and `-D__ADX__` are these features' respective equivalents. To compile for WebAssembly, your clang has to recognize `--target=wasm32`. Alternatively you can build your project with the `CC` environment variable set to `emcc`, the [Emscripten compiler](https://emscripten.org), and `AR` set to `emar`, naturally, with both commands available on your `PATH`. While `cargo test`'s dependencies happen to require at least Rust 1.65, the library by itself can be compiled with earlier compiler versions. However, in order to use a Rust version prior to 1.56 you would need to pin `zeroize` to "=1.3.0" and `zeroize_derive` to "=1.3.3" in **your** project's Cargo.toml. Even `cc` might require pinning to "=1.0.79". And if you find yourself with Rust 1.56 through 1.64 as the only option and want to execute `cargo test`, you'd need to pin some of the `[dev-dependencies]` versions in **this** project's Cargo.toml by uncommenting the following lines and commenting out `criterion`: ``` byteorder = "=1.4.3" ppv-lite86 = "=0.2.17" rmp = "=0.8.12" [target.'cfg(any(unix, windows))'.dev-dependencies] #criterion = "0.3" ``` ## Usage There are two primary modes of operation that can be chosen based on the declaration path: For minimal-pubkey-size operations: ```rust use blst::min_pk::*; ``` For minimal-signature-size operations: ```rust use blst::min_sig::*; ```
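Aggregation follows the same pattern in either mode. Below is a minimal sketch for `min_pk`, modeled on the calls exercised in benches/blst_benches.rs; the three-signer setup, the message and the proof-of-possession DST are illustrative only:

```rust
use blst::min_pk::{AggregateSignature, SecretKey};
use rand::RngCore;

let dst = b"BLS_SIG_BLS12381G2_XMD:SHA-256_SSWU_RO_POP_";
let msg = b"message to be co-signed";
let mut rng = rand::thread_rng();

// Three independent signers over the same message.
let sks: Vec<_> = (0..3)
    .map(|_| {
        let mut ikm = [0u8; 32];
        rng.fill_bytes(&mut ikm);
        SecretKey::key_gen(&ikm, &[]).unwrap()
    })
    .collect();
let pks = sks.iter().map(|sk| sk.sk_to_pk()).collect::<Vec<_>>();
let pks_refs = pks.iter().collect::<Vec<_>>();
let sigs = sks.iter().map(|sk| sk.sign(msg, dst, &[])).collect::<Vec<_>>();
let sig_refs = sigs.iter().collect::<Vec<_>>();

// One compact signature verifies against all three public keys.
let agg = AggregateSignature::aggregate(&sig_refs, false).unwrap();
let err = agg.to_signature().fast_aggregate_verify(true, msg, dst, &pks_refs);
assert_eq!(err, blst::BLST_ERROR::BLST_SUCCESS);
```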
There are five structs with inherent implementations that provide the BLS12-381 signature functionality. ``` SecretKey PublicKey AggregatePublicKey Signature AggregateSignature ``` A simple example for generating a key, signing a message, and verifying the message: ```rust use blst::min_pk::SecretKey; use rand::RngCore; let mut rng = rand::thread_rng(); let mut ikm = [0u8; 32]; rng.fill_bytes(&mut ikm); let sk = SecretKey::key_gen(&ikm, &[]).unwrap(); let pk = sk.sk_to_pk(); let dst = b"BLS_SIG_BLS12381G2_XMD:SHA-256_SSWU_RO_NUL_"; let msg = b"blst is such a blast"; let sig = sk.sign(msg, dst, &[]); let err = sig.verify(true, msg, dst, &[], &pk, true); assert_eq!(err, blst::BLST_ERROR::BLST_SUCCESS); ``` See the tests in src/lib.rs and benchmarks in benches/blst_benches.rs for further examples of usage. ================================================ FILE: bindings/rust/benches/blst_benches.rs ================================================ // Copyright Supranational LLC // Licensed under the Apache License, Version 2.0, see LICENSE for details. // SPDX-License-Identifier: Apache-2.0 use blst::*; // Benchmark min_pk use blst::min_pk::*; use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion}; use rand::{RngCore, SeedableRng}; use rand_chacha::ChaCha20Rng; struct BenchData { sk: SecretKey, pk: PublicKey, msg: Vec<u8>, dst: Vec<u8>, sig: Signature, } fn gen_bench_data(rng: &mut rand_chacha::ChaCha20Rng) -> BenchData { let msg_len = (rng.next_u64() & 0x3F) + 1; let mut msg = vec![0u8; msg_len as usize]; rng.fill_bytes(&mut msg); gen_bench_data_for_msg(rng, &msg) } fn gen_bench_data_for_msg( rng: &mut rand_chacha::ChaCha20Rng, msg: &Vec<u8>, ) -> BenchData { let mut ikm = [0u8; 32]; rng.fill_bytes(&mut ikm); let sk = SecretKey::key_gen(&ikm, &[]).unwrap(); let pk = sk.sk_to_pk(); let dst = "BLS_SIG_BLS12381G2_XMD:SHA-256_SSWU_RO_NUL_" .as_bytes() .to_owned(); let sig = sk.sign(&msg, &dst, &[]); let bd = BenchData { sk, pk, dst, msg: msg.clone(), sig, }; bd } fn bench_verify_multi_aggregate(c: &mut Criterion) { let mut group = c.benchmark_group("verify_multi_aggregate"); let dst = b"BLS_SIG_BLS12381G2_XMD:SHA-256_SSWU_RO_POP_"; let mut ikm = [0u8; 32]; let seed = [0u8; 32]; let mut rng = ChaCha20Rng::from_seed(seed); let num_sigs = vec![8, 16, 32, 64, 128]; let pks_per_sig = 3; for n in num_sigs.iter() { let mut msgs: Vec<Vec<u8>> = vec![vec![]; *n]; let mut sigs: Vec<Signature> = Vec::with_capacity(*n); let mut pks: Vec<PublicKey> = Vec::with_capacity(*n); let mut rands: Vec<blst_scalar> = Vec::with_capacity(*n); for i in 0..*n { // Create public keys rng.fill_bytes(&mut ikm); let sks_i: Vec<_> = (0..pks_per_sig) .map(|_| { ikm[0] += 1; SecretKey::key_gen(&ikm, &[]).unwrap() }) .collect(); let pks_i = sks_i.iter().map(|sk| sk.sk_to_pk()).collect::<Vec<_>>(); let pks_refs_i: Vec<&PublicKey> = pks_i.iter().map(|pk| pk).collect(); // Create random message for pks to all sign let msg_len = (rng.next_u64() & 0x3F) + 1; msgs[i] = vec![0u8; msg_len as usize]; rng.fill_bytes(&mut msgs[i]); // Generate signature for each key pair let sigs_i = sks_i .iter() .map(|sk| sk.sign(&msgs[i], dst, &[])) .collect::<Vec<_>>(); // Aggregate signature let sig_refs_i = sigs_i.iter().map(|s| s).collect::<Vec<_>>(); let agg_i = match AggregateSignature::aggregate(&sig_refs_i, false) { Ok(agg_i) => agg_i, Err(err) => panic!("aggregate failure: {:?}", err), }; sigs.push(agg_i.to_signature()); // aggregate public keys and push into vec let agg_pk_i = match AggregatePublicKey::aggregate(&pks_refs_i, false) { Ok(agg_pk_i) => agg_pk_i, Err(err) => panic!("aggregate failure: {:?}", err), }; pks.push(agg_pk_i.to_public_key()); // create random values let mut vals = [0u64; 4]; vals[0] = rng.next_u64();
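// The random multipliers generated below are what make the batched check
// sound: verify_multiple_aggregate_signatures() weights each (pk, msg, sig)
// triple by a random 64-bit scalar before folding everything into a single
// pairing computation, so an invalid batch passes with probability of
// roughly 2^-64.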
let mut rand_i = std::mem::MaybeUninit::<blst_scalar>::uninit(); unsafe { blst_scalar_from_uint64(rand_i.as_mut_ptr(), vals.as_ptr()); rands.push(rand_i.assume_init()); } } let msgs_refs: Vec<&[u8]> = msgs.iter().map(|m| m.as_slice()).collect(); let sig_refs = sigs.iter().map(|s| s).collect::<Vec<_>>(); let pks_refs: Vec<&PublicKey> = pks.iter().map(|pk| pk).collect(); let agg_ver = (sig_refs, pks_refs, msgs_refs, dst, rands); group.bench_with_input( BenchmarkId::new("verify_multi_aggregate", n), &agg_ver, |b, (s, p, m, d, r)| { b.iter(|| { let result = Signature::verify_multiple_aggregate_signatures( &m, *d, &p, false, &s, false, &r, 64, ); assert_eq!(result, BLST_ERROR::BLST_SUCCESS); }); }, ); } group.finish(); } fn bench_fast_aggregate_verify(c: &mut Criterion) { let mut group = c.benchmark_group("fast_aggregate_verify"); let seed = [0u8; 32]; let mut rng = ChaCha20Rng::from_seed(seed); let msg_len = (rng.next_u64() & 0x3F) + 1; let mut msg = vec![0u8; msg_len as usize]; rng.fill_bytes(&mut msg); let sizes = vec![8, 16, 32, 64, 128]; let bds: Vec<_> = (0..sizes[sizes.len() - 1]) .map(|_| gen_bench_data_for_msg(&mut rng, &msg)) .collect(); for size in sizes.iter() { let pks_refs = bds .iter() .take(*size) .map(|s| &s.pk) .collect::<Vec<_>>(); let sig_refs = bds .iter() .take(*size) .map(|s| &s.sig) .collect::<Vec<_>>(); let agg = match AggregateSignature::aggregate(&sig_refs, false) { Ok(agg) => agg, Err(err) => panic!("aggregate failure: {:?}", err), }; let agg_sig = agg.to_signature(); let agg_pks = match AggregatePublicKey::aggregate(&pks_refs, false) { Ok(agg_pks) => agg_pks, Err(err) => panic!("aggregate failure: {:?}", err), }; let agg_pk = agg_pks.to_public_key(); let agg_ver = (agg_sig, pks_refs, &bds[0].msg, &bds[0].dst); let agg_pre_ver = (agg_sig, agg_pk, &bds[0].msg, &bds[0].dst); group.bench_with_input( BenchmarkId::new("fast_aggregate_verify", size), &agg_ver, |b, (a, p, m, d)| { b.iter(|| { let result = a.fast_aggregate_verify(true, &m, &d, &p); assert_eq!(result, BLST_ERROR::BLST_SUCCESS); }); }, ); group.bench_with_input( BenchmarkId::new("fast_aggregate_verify_preagg", size), &agg_pre_ver, |b, (a, p, m, d)| { b.iter(|| { let result = a .fast_aggregate_verify_pre_aggregated(true, &m, &d, &p); assert_eq!(result, BLST_ERROR::BLST_SUCCESS); }); }, ); } group.finish(); } fn bench_aggregate_verify(c: &mut Criterion) { let mut group = c.benchmark_group("aggregate_verify"); let seed = [0u8; 32]; let mut rng = ChaCha20Rng::from_seed(seed); let sizes = vec![8, 16, 32, 64, 128]; // [10, 50, 100, 300, 1000, 4000]; let bds: Vec<_> = (0..sizes[sizes.len() - 1]) .map(|_| gen_bench_data(&mut rng)) .collect(); for size in sizes.iter() { let msgs_refs = bds .iter() .take(*size) .map(|s| s.msg.as_slice()) .collect::<Vec<_>>(); let pks_refs = bds .iter() .take(*size) .map(|s| &s.pk) .collect::<Vec<_>>(); let sig_refs = bds .iter() .take(*size) .map(|s| &s.sig) .collect::<Vec<_>>(); let agg = match AggregateSignature::aggregate(&sig_refs, false) { Ok(agg) => agg, Err(err) => panic!("aggregate failure: {:?}", err), }; let agg_sig = agg.to_signature(); let agg_ver = (agg_sig, pks_refs, msgs_refs, &bds[0].dst); group.bench_with_input( BenchmarkId::new("aggregate_verify", size), &agg_ver, |b, (a, p, m, d)| { b.iter(|| { let result = a.aggregate_verify(true, &m, &d, &p, false); assert_eq!(result, BLST_ERROR::BLST_SUCCESS); }); }, ); } group.finish(); } fn bench_aggregate(c: &mut Criterion) { let mut group = c.benchmark_group("aggregate"); let seed = [0u8; 32]; let mut rng = ChaCha20Rng::from_seed(seed); let sizes: [usize; 6] = [10, 50, 100, 300,
1000, 4000]; let bds: Vec<_> = (0..4000).map(|_| gen_bench_data(&mut rng)).collect(); for size in sizes.iter() { let sig_refs = bds .iter() .take(*size) .map(|s| &s.sig) .collect::<Vec<_>>(); group.bench_with_input( BenchmarkId::new("aggregate_signature", size), &sig_refs, |b, s| { b.iter(|| AggregateSignature::aggregate(&s, false)); }, ); let pks_refs = bds .iter() .take(*size) .map(|s| &s.pk) .collect::<Vec<_>>(); group.bench_with_input( BenchmarkId::new("aggregate_public_key", size), &pks_refs, |b, p| { b.iter(|| AggregatePublicKey::aggregate(&p, false)); }, ); } group.finish(); } fn bench_single_message(c: &mut Criterion) { let mut group = c.benchmark_group("single_message"); let seed = [0u8; 32]; let mut rng = ChaCha20Rng::from_seed(seed); let bd = gen_bench_data(&mut rng); group.bench_function("sign", |b| { b.iter(|| bd.sk.sign(&bd.msg, &bd.dst, &[])) }); group.bench_function("verify", |b| { b.iter(|| bd.sig.verify(true, &bd.msg, &bd.dst, &[], &bd.pk, false)) }); group.finish(); } fn bench_serdes(c: &mut Criterion) { let mut group = c.benchmark_group("serdes"); let seed = [0u8; 32]; let mut rng = ChaCha20Rng::from_seed(seed); let bd = gen_bench_data(&mut rng); let sk = bd.sk; let sk_ser = sk.serialize(); let pk = bd.pk; let pk_comp = pk.compress(); let pk_ser = pk.serialize(); let sig = bd.sig; let sig_comp = sig.compress(); let sig_ser = sig.serialize(); let mut pk_jac = std::mem::MaybeUninit::<blst_p1>::uninit(); let mut sig_jac = std::mem::MaybeUninit::<blst_p2>::uninit(); let mut p1_comp = [0; 48]; let mut p2_comp = [0; 96]; let mut p1_ser = [0; 96]; let mut p2_ser = [0; 192]; unsafe { let mut junk = [0u8; 32]; rng.fill_bytes(&mut junk); blst_encode_to_g1( pk_jac.as_mut_ptr(), junk.as_ptr(), junk.len(), "junk".as_ptr(), 4, std::ptr::null(), 0, ); blst_encode_to_g2( sig_jac.as_mut_ptr(), junk.as_ptr(), junk.len(), "junk".as_ptr(), 4, std::ptr::null(), 0, ); } group.bench_function("secret_key_serialize", |b| b.iter(|| sk.serialize())); group.bench_function("secret_key_deserialize", |b| { b.iter(|| SecretKey::deserialize(&sk_ser)); }); group.bench_function("public_key_serialize", |b| b.iter(|| pk.serialize())); group.bench_function("public_key_compress", |b| b.iter(|| pk.compress())); group.bench_function("public_key_uncompress", |b| { b.iter(|| PublicKey::uncompress(&pk_comp)) }); group.bench_function("public_key_deserialize", |b| { b.iter(|| PublicKey::deserialize(&pk_ser)); }); group.bench_function("signature_serialize", |b| b.iter(|| sig.serialize())); group.bench_function("signature_compress", |b| b.iter(|| sig.compress())); group.bench_function("signature_uncompress", |b| { b.iter(|| Signature::uncompress(&sig_comp)) }); group.bench_function("signature_deserialize", |b| { b.iter(|| Signature::deserialize(&sig_ser)) }); group.bench_function("p1_serialize", |b| { b.iter(|| unsafe { blst_p1_serialize(p1_ser.as_mut_ptr(), pk_jac.as_ptr()) }) }); group.bench_function("p1_compress", |b| { b.iter(|| unsafe { blst_p1_compress(p1_comp.as_mut_ptr(), pk_jac.as_ptr()) }) }); group.bench_function("p2_serialize", |b| { b.iter(|| unsafe { blst_p2_serialize(p2_ser.as_mut_ptr(), sig_jac.as_ptr()) }) }); group.bench_function("p2_compress", |b| { b.iter(|| unsafe { blst_p2_compress(p2_comp.as_mut_ptr(), sig_jac.as_ptr()) }) }); group.finish(); } fn bench_keys(c: &mut Criterion) { let mut group = c.benchmark_group("keys"); let ikm: [u8; 32] = [ 0x93, 0xad, 0x7e, 0x65, 0xde, 0xad, 0x05, 0x2a, 0x08, 0x3a, 0x91, 0x0c, 0x8b, 0x72, 0x85, 0x91, 0x46, 0x4c, 0xca, 0x56, 0x60, 0x5b, 0xb0, 0x56, 0xed, 0xfe, 0x2b, 0x60, 0xa6, 0x3c, 0x48,
0x99, ]; let sk = SecretKey::key_gen(&ikm, &[]).unwrap(); let pk = sk.sk_to_pk(); let pk_comp = pk.compress(); group.bench_function("key_gen", |b| { b.iter(|| SecretKey::key_gen(&ikm, &[])) }); group.bench_function("sk_to_pk", |b| { b.iter(|| sk.sk_to_pk()); }); group.bench_function("key_validate", |b| { b.iter(|| PublicKey::key_validate(&pk_comp)); }); group.finish(); } criterion_group!( benches, bench_verify_multi_aggregate, bench_fast_aggregate_verify, bench_aggregate_verify, bench_aggregate, bench_single_message, bench_serdes, bench_keys ); criterion_main!(benches); ================================================ FILE: bindings/rust/build.rs ================================================ #![allow(unused_imports)] extern crate cc; use std::env; use std::path::{Path, PathBuf}; fn assembly( file_vec: &mut Vec, base_dir: &Path, _arch: &str, _is_msvc: bool, ) { #[cfg(target_env = "msvc")] if _is_msvc { let sfx = match _arch { "x86_64" => "x86_64", "aarch64" => "armv8", _ => "unknown", }; let files = glob::glob(&format!("{}/win64/*-{}.asm", base_dir.display(), sfx)) .expect("unable to collect assembly files"); for file in files { file_vec.push(file.unwrap()); } return; } file_vec.push(base_dir.join("assembly.S")); } fn main() { if env::var("CARGO_FEATURE_SERDE_SECRET").is_ok() { println!( "cargo:warning=blst: non-production feature serde-secret enabled" ); } // account for cross-compilation [by examining environment variables] let target_os = env::var("CARGO_CFG_TARGET_OS").unwrap(); let target_env = env::var("CARGO_CFG_TARGET_ENV").unwrap(); let target_arch = env::var("CARGO_CFG_TARGET_ARCH").unwrap(); let target_family = env::var("CARGO_CFG_TARGET_FAMILY").unwrap_or_default(); let target_no_std = target_os.eq("none") || (target_os.eq("unknown") && target_arch.eq("wasm32")) || target_os.eq("uefi") || env::var("BLST_TEST_NO_STD").is_ok(); if !target_no_std { println!("cargo:rustc-cfg=feature=\"std\""); if target_arch.eq("wasm32") || target_os.eq("unknown") { println!("cargo:rustc-cfg=feature=\"no-threads\""); } } println!("cargo:rerun-if-env-changed=BLST_TEST_NO_STD"); /* * Use pre-built libblst.a if there is one. This is primarily * for trouble-shooting purposes. Idea is that libblst.a can be * compiled with flags independent from cargo defaults, e.g. * '../../build.sh -O1 ...'. */ if Path::new("libblst.a").exists() { println!("cargo:rustc-link-search=."); println!("cargo:rustc-link-lib=blst"); println!("cargo:rerun-if-changed=libblst.a"); return; } let manifest_dir = PathBuf::from(env::var("CARGO_MANIFEST_DIR").unwrap()); let mut blst_base_dir = manifest_dir.join("blst"); if !blst_base_dir.exists() { // Reach out to ../.., which is the root of the blst repo. // Use an absolute path to avoid issues with relative paths // being treated as strings by `cc` and getting concatenated // in ways that reach out of the OUT_DIR. blst_base_dir = manifest_dir .parent() .and_then(|dir| dir.parent()) .expect("can't access parent of parent of current directory") .into(); } println!("Using blst source directory {}", blst_base_dir.display()); // Set CC environment variable to choose alternative C compiler. // Optimization level depends on whether or not --release is passed // or implied. 
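// For example: CC=clang cargo build --release. Note that the cc crate
// also forwards CFLAGS when it is set, which can override the
// __BLST_PORTABLE__/__ADX__ defines selected further down.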
if target_os.eq("uefi") && env::var("CC").is_err() { match std::process::Command::new("clang") .arg("--version") .output() { Ok(_) => env::set_var("CC", "clang"), Err(_) => { /* no clang in sight, just ignore the error */ } } } if target_env.eq("sgx") && env::var("CC").is_err() { match std::process::Command::new("clang") .arg("--version") .output() { Ok(out) => { let version = String::from_utf8(out.stdout) .unwrap_or("unintelligible".to_string()); if let Some(x) = version.find("clang version ") { let x = x + 14; let y = version[x..].find('.').unwrap_or(0); if version[x..x + y].parse::().unwrap_or(0) >= 11 { env::set_var("CC", "clang"); } } } Err(_) => { /* no clang in sight, just ignore the error */ } } } if target_env.eq("msvc") && env::var("CARGO_CFG_TARGET_POINTER_WIDTH").unwrap().eq("32") && env::var("CC").is_err() { match std::process::Command::new("clang-cl") .args(["-m32", "--version"]) .output() { Ok(out) => { if String::from_utf8(out.stdout) .unwrap_or("unintelligible".to_string()) .contains("Target: i386-pc-windows-msvc") { env::set_var("CC", "clang-cl"); } } Err(_) => { /* no clang-cl in sight, just ignore the error */ } } } let mut cc = cc::Build::new(); let c_src_dir = blst_base_dir.join("src"); println!("cargo:rerun-if-changed={}", c_src_dir.display()); let mut file_vec = vec![c_src_dir.join("server.c")]; if target_arch.eq("x86_64") || target_arch.eq("aarch64") { let asm_dir = blst_base_dir.join("build"); println!("cargo:rerun-if-changed={}", asm_dir.display()); assembly( &mut file_vec, &asm_dir, &target_arch, cc.get_compiler().is_like_msvc(), ); } else { cc.define("__BLST_NO_ASM__", None); } match (cfg!(feature = "portable"), cfg!(feature = "force-adx")) { (true, false) => { if target_arch.eq("x86_64") && target_env.eq("sgx") { panic!("'portable' is not supported on SGX target"); } println!("Compiling in portable mode without ISA extensions"); cc.define("__BLST_PORTABLE__", None); } (false, true) => { if target_arch.eq("x86_64") { println!("Enabling ADX support via `force-adx` feature"); cc.define("__ADX__", None); } else { println!("`force-adx` is ignored for non-x86_64 targets"); } } (false, false) => { if target_arch.eq("x86_64") { if target_env.eq("sgx") { println!("Enabling ADX for Intel SGX target"); cc.define("__ADX__", None); } else if env::var("CARGO_ENCODED_RUSTFLAGS") .unwrap_or_default() .contains("target-cpu=") { // If target-cpu is specified on the rustc command line, // then obey the resulting target-features. 
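// e.g. RUSTFLAGS="-C target-cpu=native" cargo build --release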
let feat_list = env::var("CARGO_CFG_TARGET_FEATURE") .unwrap_or_default(); let features: Vec<_> = feat_list.split(',').collect(); if !features.contains(&"ssse3") { println!( "Compiling in portable mode without ISA extensions" ); cc.define("__BLST_PORTABLE__", None); } else if features.contains(&"adx") { println!( "Enabling ADX because it was set as target-feature" ); cc.define("__ADX__", None); } } else { #[cfg(target_arch = "x86_64")] if std::is_x86_feature_detected!("adx") { println!( "Enabling ADX because it was detected on the host" ); cc.define("__ADX__", None); } } } } (true, true) => panic!( "Cannot compile with both `portable` and `force-adx` features" ), } if target_env.eq("msvc") && cc.get_compiler().is_like_msvc() { cc.flag("-Zl"); } cc.flag_if_supported("-mno-avx") // avoid costly transitions .flag_if_supported("-fno-builtin") .flag_if_supported("-Wno-unused-function") .flag_if_supported("-Wno-unused-command-line-argument"); if target_arch.eq("wasm32") || target_family.is_empty() { cc.flag("-ffreestanding"); } if target_arch.eq("wasm32") || target_no_std { cc.define("SCRATCH_LIMIT", "(45 * 1024)"); } if target_env.eq("sgx") { cc.flag_if_supported("-mlvi-hardening"); cc.define("__SGX_LVI_HARDENING__", None); cc.define("__BLST_NO_CPUID__", None); cc.define("__ELF__", None); cc.define("SCRATCH_LIMIT", "(45 * 1024)"); } if !cfg!(debug_assertions) { cc.opt_level(2); } cc.files(&file_vec).compile("blst"); // pass some DEP_BLST_* variables to dependents println!( "cargo:BINDINGS={}", blst_base_dir.join("bindings").to_string_lossy() ); println!("cargo:C_SRC={}", c_src_dir.to_string_lossy()); } ================================================ FILE: bindings/rust/publish.sh ================================================ #!/bin/sh HERE=`dirname $0` cd "${HERE}" if [ ! -d blst ]; then trap '[ -h blst ] && rm -f blst' 0 2 ln -s ../.. 
blst fi # --allow-dirty because the temporary blst symbolic link is not committed cargo +stable publish --allow-dirty "$@" ================================================ FILE: bindings/rust/rustfmt.toml ================================================ max_width = 80 ================================================ FILE: bindings/rust/src/bindings.rs ================================================ /* automatically generated by rust-bindgen 0.65.1 */ #[repr(u32)] #[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] pub enum BLST_ERROR { BLST_SUCCESS = 0, BLST_BAD_ENCODING = 1, BLST_POINT_NOT_ON_CURVE = 2, BLST_POINT_NOT_IN_GROUP = 3, BLST_AGGR_TYPE_MISMATCH = 4, BLST_VERIFY_FAIL = 5, BLST_PK_IS_INFINITY = 6, BLST_BAD_SCALAR = 7, } pub type byte = u8; pub type limb_t = u64; #[repr(C)] #[derive(Debug, Default, Clone, PartialEq, Eq, Zeroize)] #[zeroize(drop)] pub struct blst_scalar { pub b: [byte; 32usize], } #[test] fn bindgen_test_layout_blst_scalar() { const UNINIT: ::core::mem::MaybeUninit = ::core::mem::MaybeUninit::uninit(); let ptr = UNINIT.as_ptr(); assert_eq!( ::core::mem::size_of::(), 32usize, concat!("Size of: ", stringify!(blst_scalar)) ); assert_eq!( ::core::mem::align_of::(), 1usize, concat!("Alignment of ", stringify!(blst_scalar)) ); assert_eq!( unsafe { ::core::ptr::addr_of!((*ptr).b) as usize - ptr as usize }, 0usize, concat!( "Offset of field: ", stringify!(blst_scalar), "::", stringify!(b) ) ); } #[repr(C)] #[derive(Debug, Default, Copy, Clone, PartialEq, Eq)] pub struct blst_fr { pub l: [limb_t; 4usize], } #[test] fn bindgen_test_layout_blst_fr() { const UNINIT: ::core::mem::MaybeUninit = ::core::mem::MaybeUninit::uninit(); let ptr = UNINIT.as_ptr(); assert_eq!( ::core::mem::size_of::(), 32usize, concat!("Size of: ", stringify!(blst_fr)) ); assert_eq!( ::core::mem::align_of::(), 8usize, concat!("Alignment of ", stringify!(blst_fr)) ); assert_eq!( unsafe { ::core::ptr::addr_of!((*ptr).l) as usize - ptr as usize }, 0usize, concat!( "Offset of field: ", stringify!(blst_fr), "::", stringify!(l) ) ); } #[repr(C)] #[derive(Debug, Default, Copy, Clone, PartialEq, Eq)] pub struct blst_fp { pub l: [limb_t; 6usize], } #[test] fn bindgen_test_layout_blst_fp() { const UNINIT: ::core::mem::MaybeUninit = ::core::mem::MaybeUninit::uninit(); let ptr = UNINIT.as_ptr(); assert_eq!( ::core::mem::size_of::(), 48usize, concat!("Size of: ", stringify!(blst_fp)) ); assert_eq!( ::core::mem::align_of::(), 8usize, concat!("Alignment of ", stringify!(blst_fp)) ); assert_eq!( unsafe { ::core::ptr::addr_of!((*ptr).l) as usize - ptr as usize }, 0usize, concat!( "Offset of field: ", stringify!(blst_fp), "::", stringify!(l) ) ); } #[repr(C)] #[derive(Debug, Default, Copy, Clone, PartialEq, Eq)] pub struct blst_fp2 { pub fp: [blst_fp; 2usize], } #[test] fn bindgen_test_layout_blst_fp2() { const UNINIT: ::core::mem::MaybeUninit = ::core::mem::MaybeUninit::uninit(); let ptr = UNINIT.as_ptr(); assert_eq!( ::core::mem::size_of::(), 96usize, concat!("Size of: ", stringify!(blst_fp2)) ); assert_eq!( ::core::mem::align_of::(), 8usize, concat!("Alignment of ", stringify!(blst_fp2)) ); assert_eq!( unsafe { ::core::ptr::addr_of!((*ptr).fp) as usize - ptr as usize }, 0usize, concat!( "Offset of field: ", stringify!(blst_fp2), "::", stringify!(fp) ) ); } #[repr(C)] #[derive(Debug, Default, Copy, Clone, PartialEq, Eq)] pub struct blst_fp6 { pub fp2: [blst_fp2; 3usize], } #[test] fn bindgen_test_layout_blst_fp6() { const UNINIT: ::core::mem::MaybeUninit = ::core::mem::MaybeUninit::uninit(); let ptr = UNINIT.as_ptr(); 
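// Like the other layout tests, this pins the size, alignment and field
// offsets of the #[repr(C)] struct, so a bindgen regeneration that
// drifts from blst.h is caught by cargo test.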
assert_eq!( ::core::mem::size_of::(), 288usize, concat!("Size of: ", stringify!(blst_fp6)) ); assert_eq!( ::core::mem::align_of::(), 8usize, concat!("Alignment of ", stringify!(blst_fp6)) ); assert_eq!( unsafe { ::core::ptr::addr_of!((*ptr).fp2) as usize - ptr as usize }, 0usize, concat!( "Offset of field: ", stringify!(blst_fp6), "::", stringify!(fp2) ) ); } #[repr(C)] #[derive(Debug, Copy, Clone, Eq)] pub struct blst_fp12 { pub fp6: [blst_fp6; 2usize], } #[test] fn bindgen_test_layout_blst_fp12() { const UNINIT: ::core::mem::MaybeUninit = ::core::mem::MaybeUninit::uninit(); let ptr = UNINIT.as_ptr(); assert_eq!( ::core::mem::size_of::(), 576usize, concat!("Size of: ", stringify!(blst_fp12)) ); assert_eq!( ::core::mem::align_of::(), 8usize, concat!("Alignment of ", stringify!(blst_fp12)) ); assert_eq!( unsafe { ::core::ptr::addr_of!((*ptr).fp6) as usize - ptr as usize }, 0usize, concat!( "Offset of field: ", stringify!(blst_fp12), "::", stringify!(fp6) ) ); } extern "C" { pub fn blst_scalar_from_uint32(out: *mut blst_scalar, a: *const u32); } extern "C" { pub fn blst_uint32_from_scalar(out: *mut u32, a: *const blst_scalar); } extern "C" { pub fn blst_scalar_from_uint64(out: *mut blst_scalar, a: *const u64); } extern "C" { pub fn blst_uint64_from_scalar(out: *mut u64, a: *const blst_scalar); } extern "C" { pub fn blst_scalar_from_bendian(out: *mut blst_scalar, a: *const byte); } extern "C" { pub fn blst_bendian_from_scalar(out: *mut byte, a: *const blst_scalar); } extern "C" { pub fn blst_scalar_from_lendian(out: *mut blst_scalar, a: *const byte); } extern "C" { pub fn blst_lendian_from_scalar(out: *mut byte, a: *const blst_scalar); } extern "C" { pub fn blst_scalar_fr_check(a: *const blst_scalar) -> bool; } extern "C" { pub fn blst_sk_check(a: *const blst_scalar) -> bool; } extern "C" { pub fn blst_sk_add_n_check( out: *mut blst_scalar, a: *const blst_scalar, b: *const blst_scalar, ) -> bool; } extern "C" { pub fn blst_sk_sub_n_check( out: *mut blst_scalar, a: *const blst_scalar, b: *const blst_scalar, ) -> bool; } extern "C" { pub fn blst_sk_mul_n_check( out: *mut blst_scalar, a: *const blst_scalar, b: *const blst_scalar, ) -> bool; } extern "C" { pub fn blst_sk_inverse(out: *mut blst_scalar, a: *const blst_scalar); } extern "C" { pub fn blst_scalar_from_le_bytes(out: *mut blst_scalar, in_: *const byte, len: usize) -> bool; } extern "C" { pub fn blst_scalar_from_be_bytes(out: *mut blst_scalar, in_: *const byte, len: usize) -> bool; } extern "C" { pub fn blst_fr_add(ret: *mut blst_fr, a: *const blst_fr, b: *const blst_fr); } extern "C" { pub fn blst_fr_sub(ret: *mut blst_fr, a: *const blst_fr, b: *const blst_fr); } extern "C" { pub fn blst_fr_mul_by_3(ret: *mut blst_fr, a: *const blst_fr); } extern "C" { pub fn blst_fr_lshift(ret: *mut blst_fr, a: *const blst_fr, count: usize); } extern "C" { pub fn blst_fr_rshift(ret: *mut blst_fr, a: *const blst_fr, count: usize); } extern "C" { pub fn blst_fr_mul(ret: *mut blst_fr, a: *const blst_fr, b: *const blst_fr); } extern "C" { pub fn blst_fr_sqr(ret: *mut blst_fr, a: *const blst_fr); } extern "C" { pub fn blst_fr_cneg(ret: *mut blst_fr, a: *const blst_fr, flag: bool); } extern "C" { pub fn blst_fr_eucl_inverse(ret: *mut blst_fr, a: *const blst_fr); } extern "C" { pub fn blst_fr_inverse(ret: *mut blst_fr, a: *const blst_fr); } extern "C" { pub fn blst_fr_from_uint64(ret: *mut blst_fr, a: *const u64); } extern "C" { pub fn blst_uint64_from_fr(ret: *mut u64, a: *const blst_fr); } extern "C" { pub fn blst_fr_from_scalar(ret: *mut blst_fr, a: *const 
blst_scalar); } extern "C" { pub fn blst_scalar_from_fr(ret: *mut blst_scalar, a: *const blst_fr); } extern "C" { pub fn blst_fp_add(ret: *mut blst_fp, a: *const blst_fp, b: *const blst_fp); } extern "C" { pub fn blst_fp_sub(ret: *mut blst_fp, a: *const blst_fp, b: *const blst_fp); } extern "C" { pub fn blst_fp_mul_by_3(ret: *mut blst_fp, a: *const blst_fp); } extern "C" { pub fn blst_fp_mul_by_8(ret: *mut blst_fp, a: *const blst_fp); } extern "C" { pub fn blst_fp_lshift(ret: *mut blst_fp, a: *const blst_fp, count: usize); } extern "C" { pub fn blst_fp_mul(ret: *mut blst_fp, a: *const blst_fp, b: *const blst_fp); } extern "C" { pub fn blst_fp_sqr(ret: *mut blst_fp, a: *const blst_fp); } extern "C" { pub fn blst_fp_cneg(ret: *mut blst_fp, a: *const blst_fp, flag: bool); } extern "C" { pub fn blst_fp_eucl_inverse(ret: *mut blst_fp, a: *const blst_fp); } extern "C" { pub fn blst_fp_inverse(ret: *mut blst_fp, a: *const blst_fp); } extern "C" { pub fn blst_fp_sqrt(ret: *mut blst_fp, a: *const blst_fp) -> bool; } extern "C" { pub fn blst_fp_from_uint32(ret: *mut blst_fp, a: *const u32); } extern "C" { pub fn blst_uint32_from_fp(ret: *mut u32, a: *const blst_fp); } extern "C" { pub fn blst_fp_from_uint64(ret: *mut blst_fp, a: *const u64); } extern "C" { pub fn blst_uint64_from_fp(ret: *mut u64, a: *const blst_fp); } extern "C" { pub fn blst_fp_from_bendian(ret: *mut blst_fp, a: *const byte); } extern "C" { pub fn blst_bendian_from_fp(ret: *mut byte, a: *const blst_fp); } extern "C" { pub fn blst_fp_from_lendian(ret: *mut blst_fp, a: *const byte); } extern "C" { pub fn blst_lendian_from_fp(ret: *mut byte, a: *const blst_fp); } extern "C" { pub fn blst_fp2_add(ret: *mut blst_fp2, a: *const blst_fp2, b: *const blst_fp2); } extern "C" { pub fn blst_fp2_sub(ret: *mut blst_fp2, a: *const blst_fp2, b: *const blst_fp2); } extern "C" { pub fn blst_fp2_mul_by_3(ret: *mut blst_fp2, a: *const blst_fp2); } extern "C" { pub fn blst_fp2_mul_by_8(ret: *mut blst_fp2, a: *const blst_fp2); } extern "C" { pub fn blst_fp2_lshift(ret: *mut blst_fp2, a: *const blst_fp2, count: usize); } extern "C" { pub fn blst_fp2_mul(ret: *mut blst_fp2, a: *const blst_fp2, b: *const blst_fp2); } extern "C" { pub fn blst_fp2_sqr(ret: *mut blst_fp2, a: *const blst_fp2); } extern "C" { pub fn blst_fp2_cneg(ret: *mut blst_fp2, a: *const blst_fp2, flag: bool); } extern "C" { pub fn blst_fp2_eucl_inverse(ret: *mut blst_fp2, a: *const blst_fp2); } extern "C" { pub fn blst_fp2_inverse(ret: *mut blst_fp2, a: *const blst_fp2); } extern "C" { pub fn blst_fp2_sqrt(ret: *mut blst_fp2, a: *const blst_fp2) -> bool; } extern "C" { pub fn blst_fp12_sqr(ret: *mut blst_fp12, a: *const blst_fp12); } extern "C" { pub fn blst_fp12_cyclotomic_sqr(ret: *mut blst_fp12, a: *const blst_fp12); } extern "C" { pub fn blst_fp12_mul(ret: *mut blst_fp12, a: *const blst_fp12, b: *const blst_fp12); } extern "C" { pub fn blst_fp12_mul_by_xy00z0( ret: *mut blst_fp12, a: *const blst_fp12, xy00z0: *const blst_fp6, ); } extern "C" { pub fn blst_fp12_conjugate(a: *mut blst_fp12); } extern "C" { pub fn blst_fp12_inverse(ret: *mut blst_fp12, a: *const blst_fp12); } extern "C" { pub fn blst_fp12_frobenius_map(ret: *mut blst_fp12, a: *const blst_fp12, n: usize); } extern "C" { pub fn blst_fp12_is_equal(a: *const blst_fp12, b: *const blst_fp12) -> bool; } extern "C" { pub fn blst_fp12_is_one(a: *const blst_fp12) -> bool; } extern "C" { pub fn blst_fp12_in_group(a: *const blst_fp12) -> bool; } extern "C" { pub fn blst_fp12_one() -> *const blst_fp12; } #[repr(C)] #[derive(Debug, 
Default, Copy, Clone, Eq)] pub struct blst_p1 { pub x: blst_fp, pub y: blst_fp, pub z: blst_fp, } #[test] fn bindgen_test_layout_blst_p1() { const UNINIT: ::core::mem::MaybeUninit = ::core::mem::MaybeUninit::uninit(); let ptr = UNINIT.as_ptr(); assert_eq!( ::core::mem::size_of::(), 144usize, concat!("Size of: ", stringify!(blst_p1)) ); assert_eq!( ::core::mem::align_of::(), 8usize, concat!("Alignment of ", stringify!(blst_p1)) ); assert_eq!( unsafe { ::core::ptr::addr_of!((*ptr).x) as usize - ptr as usize }, 0usize, concat!( "Offset of field: ", stringify!(blst_p1), "::", stringify!(x) ) ); assert_eq!( unsafe { ::core::ptr::addr_of!((*ptr).y) as usize - ptr as usize }, 48usize, concat!( "Offset of field: ", stringify!(blst_p1), "::", stringify!(y) ) ); assert_eq!( unsafe { ::core::ptr::addr_of!((*ptr).z) as usize - ptr as usize }, 96usize, concat!( "Offset of field: ", stringify!(blst_p1), "::", stringify!(z) ) ); } #[repr(C)] #[derive(Debug, Default, Copy, Clone, Eq)] pub struct blst_p1_affine { pub x: blst_fp, pub y: blst_fp, } #[test] fn bindgen_test_layout_blst_p1_affine() { const UNINIT: ::core::mem::MaybeUninit = ::core::mem::MaybeUninit::uninit(); let ptr = UNINIT.as_ptr(); assert_eq!( ::core::mem::size_of::(), 96usize, concat!("Size of: ", stringify!(blst_p1_affine)) ); assert_eq!( ::core::mem::align_of::(), 8usize, concat!("Alignment of ", stringify!(blst_p1_affine)) ); assert_eq!( unsafe { ::core::ptr::addr_of!((*ptr).x) as usize - ptr as usize }, 0usize, concat!( "Offset of field: ", stringify!(blst_p1_affine), "::", stringify!(x) ) ); assert_eq!( unsafe { ::core::ptr::addr_of!((*ptr).y) as usize - ptr as usize }, 48usize, concat!( "Offset of field: ", stringify!(blst_p1_affine), "::", stringify!(y) ) ); } extern "C" { pub fn blst_p1_add(out: *mut blst_p1, a: *const blst_p1, b: *const blst_p1); } extern "C" { pub fn blst_p1_add_or_double(out: *mut blst_p1, a: *const blst_p1, b: *const blst_p1); } extern "C" { pub fn blst_p1_add_affine(out: *mut blst_p1, a: *const blst_p1, b: *const blst_p1_affine); } extern "C" { pub fn blst_p1_add_or_double_affine( out: *mut blst_p1, a: *const blst_p1, b: *const blst_p1_affine, ); } extern "C" { pub fn blst_p1_double(out: *mut blst_p1, a: *const blst_p1); } extern "C" { pub fn blst_p1_mult(out: *mut blst_p1, p: *const blst_p1, scalar: *const byte, nbits: usize); } extern "C" { pub fn blst_p1_cneg(p: *mut blst_p1, cbit: bool); } extern "C" { pub fn blst_p1_to_affine(out: *mut blst_p1_affine, in_: *const blst_p1); } extern "C" { pub fn blst_p1_from_affine(out: *mut blst_p1, in_: *const blst_p1_affine); } extern "C" { pub fn blst_p1_on_curve(p: *const blst_p1) -> bool; } extern "C" { pub fn blst_p1_in_g1(p: *const blst_p1) -> bool; } extern "C" { pub fn blst_p1_is_equal(a: *const blst_p1, b: *const blst_p1) -> bool; } extern "C" { pub fn blst_p1_is_inf(a: *const blst_p1) -> bool; } extern "C" { pub fn blst_p1_generator() -> *const blst_p1; } extern "C" { pub fn blst_p1_affine_on_curve(p: *const blst_p1_affine) -> bool; } extern "C" { pub fn blst_p1_affine_in_g1(p: *const blst_p1_affine) -> bool; } extern "C" { pub fn blst_p1_affine_is_equal(a: *const blst_p1_affine, b: *const blst_p1_affine) -> bool; } extern "C" { pub fn blst_p1_affine_is_inf(a: *const blst_p1_affine) -> bool; } extern "C" { pub fn blst_p1_affine_generator() -> *const blst_p1_affine; } #[repr(C)] #[derive(Debug, Default, Copy, Clone, Eq)] pub struct blst_p2 { pub x: blst_fp2, pub y: blst_fp2, pub z: blst_fp2, } #[test] fn bindgen_test_layout_blst_p2() { const UNINIT: 
::core::mem::MaybeUninit = ::core::mem::MaybeUninit::uninit(); let ptr = UNINIT.as_ptr(); assert_eq!( ::core::mem::size_of::(), 288usize, concat!("Size of: ", stringify!(blst_p2)) ); assert_eq!( ::core::mem::align_of::(), 8usize, concat!("Alignment of ", stringify!(blst_p2)) ); assert_eq!( unsafe { ::core::ptr::addr_of!((*ptr).x) as usize - ptr as usize }, 0usize, concat!( "Offset of field: ", stringify!(blst_p2), "::", stringify!(x) ) ); assert_eq!( unsafe { ::core::ptr::addr_of!((*ptr).y) as usize - ptr as usize }, 96usize, concat!( "Offset of field: ", stringify!(blst_p2), "::", stringify!(y) ) ); assert_eq!( unsafe { ::core::ptr::addr_of!((*ptr).z) as usize - ptr as usize }, 192usize, concat!( "Offset of field: ", stringify!(blst_p2), "::", stringify!(z) ) ); } #[repr(C)] #[derive(Debug, Default, Copy, Clone, Eq)] pub struct blst_p2_affine { pub x: blst_fp2, pub y: blst_fp2, } #[test] fn bindgen_test_layout_blst_p2_affine() { const UNINIT: ::core::mem::MaybeUninit = ::core::mem::MaybeUninit::uninit(); let ptr = UNINIT.as_ptr(); assert_eq!( ::core::mem::size_of::(), 192usize, concat!("Size of: ", stringify!(blst_p2_affine)) ); assert_eq!( ::core::mem::align_of::(), 8usize, concat!("Alignment of ", stringify!(blst_p2_affine)) ); assert_eq!( unsafe { ::core::ptr::addr_of!((*ptr).x) as usize - ptr as usize }, 0usize, concat!( "Offset of field: ", stringify!(blst_p2_affine), "::", stringify!(x) ) ); assert_eq!( unsafe { ::core::ptr::addr_of!((*ptr).y) as usize - ptr as usize }, 96usize, concat!( "Offset of field: ", stringify!(blst_p2_affine), "::", stringify!(y) ) ); } extern "C" { pub fn blst_p2_add(out: *mut blst_p2, a: *const blst_p2, b: *const blst_p2); } extern "C" { pub fn blst_p2_add_or_double(out: *mut blst_p2, a: *const blst_p2, b: *const blst_p2); } extern "C" { pub fn blst_p2_add_affine(out: *mut blst_p2, a: *const blst_p2, b: *const blst_p2_affine); } extern "C" { pub fn blst_p2_add_or_double_affine( out: *mut blst_p2, a: *const blst_p2, b: *const blst_p2_affine, ); } extern "C" { pub fn blst_p2_double(out: *mut blst_p2, a: *const blst_p2); } extern "C" { pub fn blst_p2_mult(out: *mut blst_p2, p: *const blst_p2, scalar: *const byte, nbits: usize); } extern "C" { pub fn blst_p2_cneg(p: *mut blst_p2, cbit: bool); } extern "C" { pub fn blst_p2_to_affine(out: *mut blst_p2_affine, in_: *const blst_p2); } extern "C" { pub fn blst_p2_from_affine(out: *mut blst_p2, in_: *const blst_p2_affine); } extern "C" { pub fn blst_p2_on_curve(p: *const blst_p2) -> bool; } extern "C" { pub fn blst_p2_in_g2(p: *const blst_p2) -> bool; } extern "C" { pub fn blst_p2_is_equal(a: *const blst_p2, b: *const blst_p2) -> bool; } extern "C" { pub fn blst_p2_is_inf(a: *const blst_p2) -> bool; } extern "C" { pub fn blst_p2_generator() -> *const blst_p2; } extern "C" { pub fn blst_p2_affine_on_curve(p: *const blst_p2_affine) -> bool; } extern "C" { pub fn blst_p2_affine_in_g2(p: *const blst_p2_affine) -> bool; } extern "C" { pub fn blst_p2_affine_is_equal(a: *const blst_p2_affine, b: *const blst_p2_affine) -> bool; } extern "C" { pub fn blst_p2_affine_is_inf(a: *const blst_p2_affine) -> bool; } extern "C" { pub fn blst_p2_affine_generator() -> *const blst_p2_affine; } extern "C" { pub fn blst_p1s_to_affine( dst: *mut blst_p1_affine, points: *const *const blst_p1, npoints: usize, ); } extern "C" { pub fn blst_p1s_add(ret: *mut blst_p1, points: *const *const blst_p1_affine, npoints: usize); } extern "C" { pub fn blst_p1s_mult_wbits_precompute_sizeof(wbits: usize, npoints: usize) -> usize; } extern "C" { pub fn 
blst_p1s_mult_wbits_precompute( table: *mut blst_p1_affine, wbits: usize, points: *const *const blst_p1_affine, npoints: usize, ); } extern "C" { pub fn blst_p1s_mult_wbits_scratch_sizeof(npoints: usize) -> usize; } extern "C" { pub fn blst_p1s_mult_wbits( ret: *mut blst_p1, table: *const blst_p1_affine, wbits: usize, npoints: usize, scalars: *const *const byte, nbits: usize, scratch: *mut limb_t, ); } extern "C" { pub fn blst_p1s_mult_pippenger_scratch_sizeof(npoints: usize) -> usize; } extern "C" { pub fn blst_p1s_mult_pippenger( ret: *mut blst_p1, points: *const *const blst_p1_affine, npoints: usize, scalars: *const *const byte, nbits: usize, scratch: *mut limb_t, ); } extern "C" { pub fn blst_p1s_tile_pippenger( ret: *mut blst_p1, points: *const *const blst_p1_affine, npoints: usize, scalars: *const *const byte, nbits: usize, scratch: *mut limb_t, bit0: usize, window: usize, ); } extern "C" { pub fn blst_p2s_to_affine( dst: *mut blst_p2_affine, points: *const *const blst_p2, npoints: usize, ); } extern "C" { pub fn blst_p2s_add(ret: *mut blst_p2, points: *const *const blst_p2_affine, npoints: usize); } extern "C" { pub fn blst_p2s_mult_wbits_precompute_sizeof(wbits: usize, npoints: usize) -> usize; } extern "C" { pub fn blst_p2s_mult_wbits_precompute( table: *mut blst_p2_affine, wbits: usize, points: *const *const blst_p2_affine, npoints: usize, ); } extern "C" { pub fn blst_p2s_mult_wbits_scratch_sizeof(npoints: usize) -> usize; } extern "C" { pub fn blst_p2s_mult_wbits( ret: *mut blst_p2, table: *const blst_p2_affine, wbits: usize, npoints: usize, scalars: *const *const byte, nbits: usize, scratch: *mut limb_t, ); } extern "C" { pub fn blst_p2s_mult_pippenger_scratch_sizeof(npoints: usize) -> usize; } extern "C" { pub fn blst_p2s_mult_pippenger( ret: *mut blst_p2, points: *const *const blst_p2_affine, npoints: usize, scalars: *const *const byte, nbits: usize, scratch: *mut limb_t, ); } extern "C" { pub fn blst_p2s_tile_pippenger( ret: *mut blst_p2, points: *const *const blst_p2_affine, npoints: usize, scalars: *const *const byte, nbits: usize, scratch: *mut limb_t, bit0: usize, window: usize, ); } extern "C" { pub fn blst_map_to_g1(out: *mut blst_p1, u: *const blst_fp, v: *const blst_fp); } extern "C" { pub fn blst_map_to_g2(out: *mut blst_p2, u: *const blst_fp2, v: *const blst_fp2); } extern "C" { pub fn blst_encode_to_g1( out: *mut blst_p1, msg: *const byte, msg_len: usize, DST: *const byte, DST_len: usize, aug: *const byte, aug_len: usize, ); } extern "C" { pub fn blst_hash_to_g1( out: *mut blst_p1, msg: *const byte, msg_len: usize, DST: *const byte, DST_len: usize, aug: *const byte, aug_len: usize, ); } extern "C" { pub fn blst_encode_to_g2( out: *mut blst_p2, msg: *const byte, msg_len: usize, DST: *const byte, DST_len: usize, aug: *const byte, aug_len: usize, ); } extern "C" { pub fn blst_hash_to_g2( out: *mut blst_p2, msg: *const byte, msg_len: usize, DST: *const byte, DST_len: usize, aug: *const byte, aug_len: usize, ); } extern "C" { pub fn blst_p1_serialize(out: *mut byte, in_: *const blst_p1); } extern "C" { pub fn blst_p1_compress(out: *mut byte, in_: *const blst_p1); } extern "C" { pub fn blst_p1_affine_serialize(out: *mut byte, in_: *const blst_p1_affine); } extern "C" { pub fn blst_p1_affine_compress(out: *mut byte, in_: *const blst_p1_affine); } extern "C" { pub fn blst_p1_uncompress(out: *mut blst_p1_affine, in_: *const byte) -> BLST_ERROR; } extern "C" { pub fn blst_p1_deserialize(out: *mut blst_p1_affine, in_: *const byte) -> BLST_ERROR; } extern "C" { pub fn 
blst_p2_serialize(out: *mut byte, in_: *const blst_p2); } extern "C" { pub fn blst_p2_compress(out: *mut byte, in_: *const blst_p2); } extern "C" { pub fn blst_p2_affine_serialize(out: *mut byte, in_: *const blst_p2_affine); } extern "C" { pub fn blst_p2_affine_compress(out: *mut byte, in_: *const blst_p2_affine); } extern "C" { pub fn blst_p2_uncompress(out: *mut blst_p2_affine, in_: *const byte) -> BLST_ERROR; } extern "C" { pub fn blst_p2_deserialize(out: *mut blst_p2_affine, in_: *const byte) -> BLST_ERROR; } extern "C" { pub fn blst_keygen( out_SK: *mut blst_scalar, IKM: *const byte, IKM_len: usize, info: *const byte, info_len: usize, ); } extern "C" { pub fn blst_sk_to_pk_in_g1(out_pk: *mut blst_p1, SK: *const blst_scalar); } extern "C" { pub fn blst_sign_pk_in_g1(out_sig: *mut blst_p2, hash: *const blst_p2, SK: *const blst_scalar); } extern "C" { pub fn blst_sk_to_pk_in_g2(out_pk: *mut blst_p2, SK: *const blst_scalar); } extern "C" { pub fn blst_sign_pk_in_g2(out_sig: *mut blst_p1, hash: *const blst_p1, SK: *const blst_scalar); } extern "C" { pub fn blst_miller_loop( ret: *mut blst_fp12, Q: *const blst_p2_affine, P: *const blst_p1_affine, ); } extern "C" { pub fn blst_miller_loop_n( ret: *mut blst_fp12, Qs: *const *const blst_p2_affine, Ps: *const *const blst_p1_affine, n: usize, ); } extern "C" { pub fn blst_final_exp(ret: *mut blst_fp12, f: *const blst_fp12); } extern "C" { pub fn blst_precompute_lines(Qlines: *mut blst_fp6, Q: *const blst_p2_affine); } extern "C" { pub fn blst_miller_loop_lines( ret: *mut blst_fp12, Qlines: *const blst_fp6, P: *const blst_p1_affine, ); } extern "C" { pub fn blst_fp12_finalverify(gt1: *const blst_fp12, gt2: *const blst_fp12) -> bool; } #[repr(C)] #[repr(align(1))] #[derive(Debug, Default)] pub struct blst_pairing { pub _bindgen_opaque_blob: [u8; 0usize], } #[test] fn bindgen_test_layout_blst_pairing() { assert_eq!( ::core::mem::size_of::(), 0usize, concat!("Size of: ", stringify!(blst_pairing)) ); assert_eq!( ::core::mem::align_of::(), 1usize, concat!("Alignment of ", stringify!(blst_pairing)) ); } extern "C" { pub fn blst_pairing_sizeof() -> usize; } extern "C" { pub fn blst_pairing_init( new_ctx: *mut blst_pairing, hash_or_encode: bool, DST: *const byte, DST_len: usize, ); } extern "C" { pub fn blst_pairing_get_dst(ctx: *const blst_pairing) -> *const byte; } extern "C" { pub fn blst_pairing_commit(ctx: *mut blst_pairing); } extern "C" { pub fn blst_pairing_aggregate_pk_in_g2( ctx: *mut blst_pairing, PK: *const blst_p2_affine, signature: *const blst_p1_affine, msg: *const byte, msg_len: usize, aug: *const byte, aug_len: usize, ) -> BLST_ERROR; } extern "C" { pub fn blst_pairing_chk_n_aggr_pk_in_g2( ctx: *mut blst_pairing, PK: *const blst_p2_affine, pk_grpchk: bool, signature: *const blst_p1_affine, sig_grpchk: bool, msg: *const byte, msg_len: usize, aug: *const byte, aug_len: usize, ) -> BLST_ERROR; } extern "C" { pub fn blst_pairing_mul_n_aggregate_pk_in_g2( ctx: *mut blst_pairing, PK: *const blst_p2_affine, sig: *const blst_p1_affine, scalar: *const byte, nbits: usize, msg: *const byte, msg_len: usize, aug: *const byte, aug_len: usize, ) -> BLST_ERROR; } extern "C" { pub fn blst_pairing_chk_n_mul_n_aggr_pk_in_g2( ctx: *mut blst_pairing, PK: *const blst_p2_affine, pk_grpchk: bool, sig: *const blst_p1_affine, sig_grpchk: bool, scalar: *const byte, nbits: usize, msg: *const byte, msg_len: usize, aug: *const byte, aug_len: usize, ) -> BLST_ERROR; } extern "C" { pub fn blst_pairing_aggregate_pk_in_g1( ctx: *mut blst_pairing, PK: *const 
blst_p1_affine, signature: *const blst_p2_affine, msg: *const byte, msg_len: usize, aug: *const byte, aug_len: usize, ) -> BLST_ERROR; } extern "C" { pub fn blst_pairing_chk_n_aggr_pk_in_g1( ctx: *mut blst_pairing, PK: *const blst_p1_affine, pk_grpchk: bool, signature: *const blst_p2_affine, sig_grpchk: bool, msg: *const byte, msg_len: usize, aug: *const byte, aug_len: usize, ) -> BLST_ERROR; } extern "C" { pub fn blst_pairing_mul_n_aggregate_pk_in_g1( ctx: *mut blst_pairing, PK: *const blst_p1_affine, sig: *const blst_p2_affine, scalar: *const byte, nbits: usize, msg: *const byte, msg_len: usize, aug: *const byte, aug_len: usize, ) -> BLST_ERROR; } extern "C" { pub fn blst_pairing_chk_n_mul_n_aggr_pk_in_g1( ctx: *mut blst_pairing, PK: *const blst_p1_affine, pk_grpchk: bool, sig: *const blst_p2_affine, sig_grpchk: bool, scalar: *const byte, nbits: usize, msg: *const byte, msg_len: usize, aug: *const byte, aug_len: usize, ) -> BLST_ERROR; } extern "C" { pub fn blst_pairing_merge(ctx: *mut blst_pairing, ctx1: *const blst_pairing) -> BLST_ERROR; } extern "C" { pub fn blst_pairing_finalverify(ctx: *const blst_pairing, gtsig: *const blst_fp12) -> bool; } extern "C" { pub fn blst_aggregate_in_g1( out: *mut blst_p1, in_: *const blst_p1, zwire: *const byte, ) -> BLST_ERROR; } extern "C" { pub fn blst_aggregate_in_g2( out: *mut blst_p2, in_: *const blst_p2, zwire: *const byte, ) -> BLST_ERROR; } extern "C" { pub fn blst_aggregated_in_g1(out: *mut blst_fp12, signature: *const blst_p1_affine); } extern "C" { pub fn blst_aggregated_in_g2(out: *mut blst_fp12, signature: *const blst_p2_affine); } extern "C" { pub fn blst_core_verify_pk_in_g1( pk: *const blst_p1_affine, signature: *const blst_p2_affine, hash_or_encode: bool, msg: *const byte, msg_len: usize, DST: *const byte, DST_len: usize, aug: *const byte, aug_len: usize, ) -> BLST_ERROR; } extern "C" { pub fn blst_core_verify_pk_in_g2( pk: *const blst_p2_affine, signature: *const blst_p1_affine, hash_or_encode: bool, msg: *const byte, msg_len: usize, DST: *const byte, DST_len: usize, aug: *const byte, aug_len: usize, ) -> BLST_ERROR; } extern "C" { pub static BLS12_381_G1: blst_p1_affine; } extern "C" { pub static BLS12_381_NEG_G1: blst_p1_affine; } extern "C" { pub static BLS12_381_G2: blst_p2_affine; } extern "C" { pub static BLS12_381_NEG_G2: blst_p2_affine; } extern "C" { pub fn blst_fr_ct_bfly(x0: *mut blst_fr, x1: *mut blst_fr, twiddle: *const blst_fr); } extern "C" { pub fn blst_fr_gs_bfly(x0: *mut blst_fr, x1: *mut blst_fr, twiddle: *const blst_fr); } extern "C" { pub fn blst_fr_to(ret: *mut blst_fr, a: *const blst_fr); } extern "C" { pub fn blst_fr_from(ret: *mut blst_fr, a: *const blst_fr); } extern "C" { pub fn blst_fp_to(ret: *mut blst_fp, a: *const blst_fp); } extern "C" { pub fn blst_fp_from(ret: *mut blst_fp, a: *const blst_fp); } extern "C" { pub fn blst_fp_is_square(a: *const blst_fp) -> bool; } extern "C" { pub fn blst_fp2_is_square(a: *const blst_fp2) -> bool; } extern "C" { pub fn blst_p1_from_jacobian(out: *mut blst_p1, in_: *const blst_p1); } extern "C" { pub fn blst_p2_from_jacobian(out: *mut blst_p2, in_: *const blst_p2); } extern "C" { pub fn blst_sk_to_pk2_in_g1( out: *mut byte, out_pk: *mut blst_p1_affine, SK: *const blst_scalar, ); } extern "C" { pub fn blst_sign_pk2_in_g1( out: *mut byte, out_sig: *mut blst_p2_affine, hash: *const blst_p2, SK: *const blst_scalar, ); } extern "C" { pub fn blst_sk_to_pk2_in_g2( out: *mut byte, out_pk: *mut blst_p2_affine, SK: *const blst_scalar, ); } extern "C" { pub fn blst_sign_pk2_in_g2( 
out: *mut byte, out_sig: *mut blst_p1_affine, hash: *const blst_p1, SK: *const blst_scalar, ); } #[repr(C)] #[repr(align(1))] #[derive(Debug, Default)] pub struct blst_uniq { pub _bindgen_opaque_blob: [u8; 0usize], } #[test] fn bindgen_test_layout_blst_uniq() { assert_eq!( ::core::mem::size_of::(), 0usize, concat!("Size of: ", stringify!(blst_uniq)) ); assert_eq!( ::core::mem::align_of::(), 1usize, concat!("Alignment of ", stringify!(blst_uniq)) ); } extern "C" { pub fn blst_uniq_sizeof(n_nodes: usize) -> usize; } extern "C" { pub fn blst_uniq_init(tree: *mut blst_uniq); } extern "C" { pub fn blst_uniq_test(tree: *mut blst_uniq, msg: *const byte, len: usize) -> bool; } extern "C" { pub fn blst_expand_message_xmd( out: *mut byte, out_len: usize, msg: *const byte, msg_len: usize, DST: *const byte, DST_len: usize, ); } extern "C" { pub fn blst_p1_unchecked_mult( out: *mut blst_p1, p: *const blst_p1, scalar: *const byte, nbits: usize, ); } extern "C" { pub fn blst_p2_unchecked_mult( out: *mut blst_p2, p: *const blst_p2, scalar: *const byte, nbits: usize, ); } extern "C" { pub fn blst_pairing_raw_aggregate( ctx: *mut blst_pairing, q: *const blst_p2_affine, p: *const blst_p1_affine, ); } extern "C" { pub fn blst_pairing_as_fp12(ctx: *mut blst_pairing) -> *mut blst_fp12; } extern "C" { pub fn blst_bendian_from_fp12(out: *mut byte, a: *const blst_fp12); } extern "C" { pub fn blst_keygen_v3( out_SK: *mut blst_scalar, IKM: *const byte, IKM_len: usize, info: *const byte, info_len: usize, ); } extern "C" { pub fn blst_keygen_v4_5( out_SK: *mut blst_scalar, IKM: *const byte, IKM_len: usize, salt: *const byte, salt_len: usize, info: *const byte, info_len: usize, ); } extern "C" { pub fn blst_keygen_v5( out_SK: *mut blst_scalar, IKM: *const byte, IKM_len: usize, salt: *const byte, salt_len: usize, info: *const byte, info_len: usize, ); } extern "C" { pub fn blst_derive_master_eip2333(out_SK: *mut blst_scalar, IKM: *const byte, IKM_len: usize); } extern "C" { pub fn blst_derive_child_eip2333( out_SK: *mut blst_scalar, SK: *const blst_scalar, child_index: u32, ); } extern "C" { pub fn blst_scalar_from_hexascii(out: *mut blst_scalar, hex: *const byte); } extern "C" { pub fn blst_fr_from_hexascii(ret: *mut blst_fr, hex: *const byte); } extern "C" { pub fn blst_fp_from_hexascii(ret: *mut blst_fp, hex: *const byte); } extern "C" { pub fn blst_p1_sizeof() -> usize; } extern "C" { pub fn blst_p1_affine_sizeof() -> usize; } extern "C" { pub fn blst_p2_sizeof() -> usize; } extern "C" { pub fn blst_p2_affine_sizeof() -> usize; } extern "C" { pub fn blst_fp12_sizeof() -> usize; } extern "C" { pub fn blst_fp_from_le_bytes(ret: *mut blst_fp, in_: *const byte, len: usize); } extern "C" { pub fn blst_fp_from_be_bytes(ret: *mut blst_fp, in_: *const byte, len: usize); } extern "C" { pub fn blst_sha256(out: *mut byte, msg: *const byte, msg_len: usize); } #[test] fn bindgen_test_normal_types() { // from "Rust for Rustaceans" by Jon Gjengset fn is_normal() {} is_normal::(); is_normal::(); is_normal::(); is_normal::(); is_normal::(); is_normal::(); is_normal::(); is_normal::(); is_normal::(); is_normal::(); is_normal::(); is_normal::(); is_normal::(); } ================================================ FILE: bindings/rust/src/lib.rs ================================================ // Copyright Supranational LLC // Licensed under the Apache License, Version 2.0, see LICENSE for details. 
// SPDX-License-Identifier: Apache-2.0 #![cfg_attr(not(feature = "std"), no_std)] #![allow(non_upper_case_globals)] #![allow(non_camel_case_types)] #![allow(non_snake_case)] #![allow(unexpected_cfgs)] extern crate alloc; use alloc::boxed::Box; use alloc::vec; use alloc::vec::Vec; use core::any::Any; use core::mem::{transmute, MaybeUninit}; use core::ptr; use zeroize::Zeroize; #[cfg(feature = "std")] use std::sync::{atomic::*, mpsc::sync_channel, Arc}; #[cfg(feature = "serde")] use serde::{Deserialize, Deserializer, Serialize, Serializer}; #[cfg(feature = "std")] trait ThreadPoolExt { fn joined_execute<'any, F>(&self, job: F) where F: FnOnce() + Send + 'any; } #[cfg(all(not(feature = "no-threads"), feature = "std"))] mod mt { use super::*; use std::sync::{Mutex, Once}; use threadpool::ThreadPool; pub fn da_pool() -> ThreadPool { static INIT: Once = Once::new(); static mut POOL: *const Mutex = ptr::null(); INIT.call_once(|| { let pool = Mutex::new(ThreadPool::default()); unsafe { POOL = transmute::, *const _>(Box::new(pool)) }; }); unsafe { (*POOL).lock().unwrap().clone() } } type Thunk<'any> = Box; impl ThreadPoolExt for ThreadPool { fn joined_execute<'scope, F>(&self, job: F) where F: FnOnce() + Send + 'scope, { // Bypass 'lifetime limitations by brute force. It works, // because we explicitly join the threads... self.execute(unsafe { transmute::, Thunk<'static>>(Box::new(job)) }) } } } #[cfg(all(feature = "no-threads", feature = "std"))] mod mt { use super::*; pub struct EmptyPool {} pub fn da_pool() -> EmptyPool { EmptyPool {} } impl EmptyPool { pub fn max_count(&self) -> usize { 1 } } impl ThreadPoolExt for EmptyPool { fn joined_execute<'scope, F>(&self, job: F) where F: FnOnce() + Send + 'scope, { job() } } } include!("bindings.rs"); impl PartialEq for blst_p1 { fn eq(&self, other: &Self) -> bool { unsafe { blst_p1_is_equal(self, other) } } } impl PartialEq for blst_p1_affine { fn eq(&self, other: &Self) -> bool { unsafe { blst_p1_affine_is_equal(self, other) } } } impl PartialEq for blst_p2 { fn eq(&self, other: &Self) -> bool { unsafe { blst_p2_is_equal(self, other) } } } impl PartialEq for blst_p2_affine { fn eq(&self, other: &Self) -> bool { unsafe { blst_p2_affine_is_equal(self, other) } } } impl Default for blst_fp12 { fn default() -> Self { unsafe { *blst_fp12_one() } } } impl PartialEq for blst_fp12 { fn eq(&self, other: &Self) -> bool { unsafe { blst_fp12_is_equal(self, other) } } } impl core::ops::Mul for blst_fp12 { type Output = Self; fn mul(self, other: Self) -> Self { let mut out = MaybeUninit::::uninit(); unsafe { blst_fp12_mul(out.as_mut_ptr(), &self, &other); out.assume_init() } } } impl core::ops::MulAssign for blst_fp12 { fn mul_assign(&mut self, other: Self) { unsafe { blst_fp12_mul(self, self, &other) } } } impl blst_fp12 { pub fn miller_loop(q: &blst_p2_affine, p: &blst_p1_affine) -> Self { let mut out = MaybeUninit::::uninit(); unsafe { blst_miller_loop(out.as_mut_ptr(), q, p); out.assume_init() } } #[cfg(not(feature = "std"))] pub fn miller_loop_n(q: &[blst_p2_affine], p: &[blst_p1_affine]) -> Self { let n_elems = q.len(); if n_elems != p.len() || n_elems == 0 { panic!("inputs' lengths mismatch"); } let qs: [*const _; 2] = [&q[0], ptr::null()]; let ps: [*const _; 2] = [&p[0], ptr::null()]; let mut out = MaybeUninit::::uninit(); unsafe { blst_miller_loop_n(out.as_mut_ptr(), &qs[0], &ps[0], n_elems); out.assume_init() } } #[cfg(feature = "std")] pub fn miller_loop_n(q: &[blst_p2_affine], p: &[blst_p1_affine]) -> Self { let n_elems = q.len(); if n_elems != p.len() 
    #[cfg(feature = "std")]
    pub fn miller_loop_n(q: &[blst_p2_affine], p: &[blst_p1_affine]) -> Self {
        let n_elems = q.len();
        if n_elems != p.len() || n_elems == 0 {
            panic!("inputs' lengths mismatch");
        }

        let pool = mt::da_pool();
        let mut n_workers = pool.max_count();
        if n_workers == 1 {
            let qs: [*const _; 2] = [&q[0], ptr::null()];
            let ps: [*const _; 2] = [&p[0], ptr::null()];
            let mut out = MaybeUninit::<blst_fp12>::uninit();
            unsafe {
                blst_miller_loop_n(out.as_mut_ptr(), &qs[0], &ps[0], n_elems);
                return out.assume_init();
            }
        }

        let counter = Arc::new(AtomicUsize::new(0));
        let stride = core::cmp::min((n_elems + n_workers - 1) / n_workers, 16);
        n_workers = core::cmp::min((n_elems + stride - 1) / stride, n_workers);
        let (tx, rx) = sync_channel(n_workers);
        for _ in 0..n_workers {
            let tx = tx.clone();
            let counter = counter.clone();
            pool.joined_execute(move || {
                let mut acc = blst_fp12::default();
                let mut tmp = MaybeUninit::<blst_fp12>::uninit();
                let mut qs: [*const _; 2] = [ptr::null(), ptr::null()];
                let mut ps: [*const _; 2] = [ptr::null(), ptr::null()];
                loop {
                    let work = counter.fetch_add(stride, Ordering::Relaxed);
                    if work >= n_elems {
                        break;
                    }
                    let n = core::cmp::min(n_elems - work, stride);
                    qs[0] = &q[work];
                    ps[0] = &p[work];
                    unsafe {
                        blst_miller_loop_n(tmp.as_mut_ptr(), &qs[0], &ps[0], n);
                        acc *= tmp.assume_init();
                    }
                }
                tx.send(acc).expect("disaster");
            });
        }
        let mut acc = rx.recv().unwrap();
        for _ in 1..n_workers {
            acc *= rx.recv().unwrap();
        }
        acc
    }

    pub fn final_exp(&self) -> Self {
        let mut out = MaybeUninit::<blst_fp12>::uninit();
        unsafe {
            blst_final_exp(out.as_mut_ptr(), self);
            out.assume_init()
        }
    }

    pub fn in_group(&self) -> bool {
        unsafe { blst_fp12_in_group(self) }
    }

    pub fn finalverify(a: &Self, b: &Self) -> bool {
        unsafe { blst_fp12_finalverify(a, b) }
    }

    pub fn to_bendian(&self) -> [u8; 48 * 12] {
        let mut out = MaybeUninit::<[u8; 48 * 12]>::uninit();
        unsafe {
            blst_bendian_from_fp12(out.as_mut_ptr() as *mut u8, self);
            out.assume_init()
        }
    }
}

impl blst_scalar {
    pub fn hash_to(msg: &[u8], dst: &[u8]) -> Option<Self> {
        unsafe {
            let mut out = <Self>::default();
            let mut elem = [0u8; 48];
            blst_expand_message_xmd(
                elem.as_mut_ptr(),
                elem.len(),
                msg.as_ptr(),
                msg.len(),
                dst.as_ptr(),
                dst.len(),
            );
            if blst_scalar_from_be_bytes(&mut out, elem.as_ptr(), elem.len()) {
                Some(out)
            } else {
                None
            }
        }
    }
}

#[derive(Debug)]
pub struct Pairing {
    v: Box<[u64]>,
}

impl Pairing {
    pub fn new(hash_or_encode: bool, dst: &[u8]) -> Self {
        let v: Vec<u64> = vec![0; unsafe { blst_pairing_sizeof() } / 8];
        let mut obj = Self {
            v: v.into_boxed_slice(),
        };
        obj.init(hash_or_encode, dst);
        obj
    }

    pub fn init(&mut self, hash_or_encode: bool, dst: &[u8]) {
        unsafe {
            blst_pairing_init(
                self.ctx(),
                hash_or_encode,
                dst.as_ptr(),
                dst.len(),
            )
        }
    }

    fn ctx(&mut self) -> *mut blst_pairing {
        self.v.as_mut_ptr() as *mut blst_pairing
    }

    fn const_ctx(&self) -> *const blst_pairing {
        self.v.as_ptr() as *const blst_pairing
    }

    pub fn aggregate(
        &mut self,
        pk: &dyn Any,
        pk_validate: bool,
        sig: &dyn Any,
        sig_groupcheck: bool,
        msg: &[u8],
        aug: &[u8],
    ) -> BLST_ERROR {
        if pk.is::<blst_p1_affine>() {
            unsafe {
                blst_pairing_chk_n_aggr_pk_in_g1(
                    self.ctx(),
                    match pk.downcast_ref::<blst_p1_affine>() {
                        Some(pk) => pk,
                        None => ptr::null(),
                    },
                    pk_validate,
                    match sig.downcast_ref::<blst_p2_affine>() {
                        Some(sig) => sig,
                        None => ptr::null(),
                    },
                    sig_groupcheck,
                    msg.as_ptr(),
                    msg.len(),
                    aug.as_ptr(),
                    aug.len(),
                )
            }
        } else if pk.is::<blst_p2_affine>() {
            unsafe {
                blst_pairing_chk_n_aggr_pk_in_g2(
                    self.ctx(),
                    match pk.downcast_ref::<blst_p2_affine>() {
                        Some(pk) => pk,
                        None => ptr::null(),
                    },
                    pk_validate,
                    match sig.downcast_ref::<blst_p1_affine>() {
                        Some(sig) => sig,
                        None => ptr::null(),
                    },
                    sig_groupcheck,
                    msg.as_ptr(),
                    msg.len(),
                    aug.as_ptr(),
                    aug.len(),
                )
            }
        } else {
            panic!("whaaaa?")
        }
    }
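    // Illustrative usage (commented sketch, added commentary): a caller
    // drives this context as new -> aggregate -> commit -> finalverify.
    // In the MinPk orientation, with `pk_aff: blst_p1_affine` and
    // `sig_aff: blst_p2_affine` assumed to hold a valid key/signature
    // for `msg` under a hypothetical DST:
    //
    //     let mut ctx = Pairing::new(true, b"MY-APP-DST");
    //     let err = ctx.aggregate(&pk_aff, true, &sig_aff, true, msg, &[]);
    //     assert_eq!(err, BLST_ERROR::BLST_SUCCESS);
    //     ctx.commit();
    //     assert!(ctx.finalverify(None));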
    #[allow(clippy::too_many_arguments)]
    pub fn mul_n_aggregate(
        &mut self,
        pk: &dyn Any,
        pk_validate: bool,
        sig: &dyn Any,
        sig_groupcheck: bool,
        scalar: &[u8],
        nbits: usize,
        msg: &[u8],
        aug: &[u8],
    ) -> BLST_ERROR {
        if pk.is::<blst_p1_affine>() {
            unsafe {
                blst_pairing_chk_n_mul_n_aggr_pk_in_g1(
                    self.ctx(),
                    match pk.downcast_ref::<blst_p1_affine>() {
                        Some(pk) => pk,
                        None => ptr::null(),
                    },
                    pk_validate,
                    match sig.downcast_ref::<blst_p2_affine>() {
                        Some(sig) => sig,
                        None => ptr::null(),
                    },
                    sig_groupcheck,
                    scalar.as_ptr(),
                    nbits,
                    msg.as_ptr(),
                    msg.len(),
                    aug.as_ptr(),
                    aug.len(),
                )
            }
        } else if pk.is::<blst_p2_affine>() {
            unsafe {
                blst_pairing_chk_n_mul_n_aggr_pk_in_g2(
                    self.ctx(),
                    match pk.downcast_ref::<blst_p2_affine>() {
                        Some(pk) => pk,
                        None => ptr::null(),
                    },
                    pk_validate,
                    match sig.downcast_ref::<blst_p1_affine>() {
                        Some(sig) => sig,
                        None => ptr::null(),
                    },
                    sig_groupcheck,
                    scalar.as_ptr(),
                    nbits,
                    msg.as_ptr(),
                    msg.len(),
                    aug.as_ptr(),
                    aug.len(),
                )
            }
        } else {
            panic!("whaaaa?")
        }
    }

    pub fn aggregated(gtsig: &mut blst_fp12, sig: &dyn Any) {
        if sig.is::<blst_p1_affine>() {
            unsafe {
                blst_aggregated_in_g1(
                    gtsig,
                    sig.downcast_ref::<blst_p1_affine>().unwrap(),
                )
            }
        } else if sig.is::<blst_p2_affine>() {
            unsafe {
                blst_aggregated_in_g2(
                    gtsig,
                    sig.downcast_ref::<blst_p2_affine>().unwrap(),
                )
            }
        } else {
            panic!("whaaaa?")
        }
    }

    pub fn commit(&mut self) {
        unsafe { blst_pairing_commit(self.ctx()) }
    }

    pub fn merge(&mut self, ctx1: &Self) -> BLST_ERROR {
        unsafe { blst_pairing_merge(self.ctx(), ctx1.const_ctx()) }
    }

    pub fn finalverify(&self, gtsig: Option<&blst_fp12>) -> bool {
        unsafe {
            blst_pairing_finalverify(
                self.const_ctx(),
                match gtsig {
                    Some(gtsig) => gtsig,
                    None => ptr::null(),
                },
            )
        }
    }

    pub fn raw_aggregate(&mut self, q: &blst_p2_affine, p: &blst_p1_affine) {
        unsafe { blst_pairing_raw_aggregate(self.ctx(), q, p) }
    }

    pub fn as_fp12(&mut self) -> blst_fp12 {
        unsafe { *blst_pairing_as_fp12(self.ctx()) }
    }
}

pub fn uniq(msgs: &[&[u8]]) -> bool {
    let n_elems = msgs.len();

    if n_elems == 1 {
        return true;
    } else if n_elems == 2 {
        return msgs[0] != msgs[1];
    }

    let mut v: Vec<u64> = vec![0; unsafe { blst_uniq_sizeof(n_elems) } / 8];
    let ctx = v.as_mut_ptr() as *mut blst_uniq;

    unsafe { blst_uniq_init(ctx) };

    for msg in msgs.iter() {
        if !unsafe { blst_uniq_test(ctx, msg.as_ptr(), msg.len()) } {
            return false;
        }
    }

    true
}

#[cfg(feature = "std")]
pub fn print_bytes(bytes: &[u8], name: &str) {
    print!("{} ", name);
    for b in bytes.iter() {
        print!("{:02x}", b);
    }
    println!();
}

macro_rules!
sig_variant_impl { ( $name:expr, $pk:ty, $pk_aff:ty, $sig:ty, $sig_aff:ty, $sk_to_pk:ident, $hash_or_encode:expr, $hash_or_encode_to:ident, $sign:ident, $pk_eq:ident, $sig_eq:ident, $verify:ident, $pk_in_group:ident, $pk_to_aff:ident, $pk_from_aff:ident, $pk_ser:ident, $pk_comp:ident, $pk_deser:ident, $pk_uncomp:ident, $pk_comp_size:expr, $pk_ser_size:expr, $sig_in_group:ident, $sig_to_aff:ident, $sig_from_aff:ident, $sig_ser:ident, $sig_comp:ident, $sig_deser:ident, $sig_uncomp:ident, $sig_comp_size:expr, $sig_ser_size:expr, $pk_add_or_dbl:ident, $pk_add_or_dbl_aff:ident, $pk_cneg:ident, $sig_add_or_dbl:ident, $sig_add_or_dbl_aff:ident, $pk_is_inf:ident, $sig_is_inf:ident, $sig_aggr_in_group:ident, ) => { /// Secret Key #[repr(transparent)] #[derive(Default, Debug, Clone, Zeroize)] #[zeroize(drop)] pub struct SecretKey { value: blst_scalar, } impl SecretKey { /// Deterministically generate a secret key from key material pub fn key_gen( ikm: &[u8], key_info: &[u8], ) -> Result { if ikm.len() < 32 { return Err(BLST_ERROR::BLST_BAD_ENCODING); } let mut sk = SecretKey::default(); unsafe { blst_keygen( &mut sk.value, ikm.as_ptr(), ikm.len(), key_info.as_ptr(), key_info.len(), ); } Ok(sk) } pub fn key_gen_v3( ikm: &[u8], key_info: &[u8], ) -> Result { if ikm.len() < 32 { return Err(BLST_ERROR::BLST_BAD_ENCODING); } let mut sk = SecretKey::default(); unsafe { blst_keygen_v3( &mut sk.value, ikm.as_ptr(), ikm.len(), key_info.as_ptr(), key_info.len(), ); } Ok(sk) } pub fn key_gen_v4_5( ikm: &[u8], salt: &[u8], info: &[u8], ) -> Result { if ikm.len() < 32 { return Err(BLST_ERROR::BLST_BAD_ENCODING); } let mut sk = SecretKey::default(); unsafe { blst_keygen_v4_5( &mut sk.value, ikm.as_ptr(), ikm.len(), salt.as_ptr(), salt.len(), info.as_ptr(), info.len(), ); } Ok(sk) } pub fn key_gen_v5( ikm: &[u8], salt: &[u8], info: &[u8], ) -> Result { if ikm.len() < 32 { return Err(BLST_ERROR::BLST_BAD_ENCODING); } let mut sk = SecretKey::default(); unsafe { blst_keygen_v5( &mut sk.value, ikm.as_ptr(), ikm.len(), salt.as_ptr(), salt.len(), info.as_ptr(), info.len(), ); } Ok(sk) } pub fn derive_master_eip2333( ikm: &[u8], ) -> Result { if ikm.len() < 32 { return Err(BLST_ERROR::BLST_BAD_ENCODING); } let mut sk = SecretKey::default(); unsafe { blst_derive_master_eip2333( &mut sk.value, ikm.as_ptr(), ikm.len(), ); } Ok(sk) } pub fn derive_child_eip2333(&self, child_index: u32) -> Self { let mut sk = SecretKey::default(); unsafe { blst_derive_child_eip2333( &mut sk.value, &self.value, child_index, ); } sk } // sk_to_pk pub fn sk_to_pk(&self) -> PublicKey { // TODO - would the user like the serialized/compressed pk as well? let mut pk_aff = PublicKey::default(); //let mut pk_ser = [0u8; $pk_ser_size]; unsafe { $sk_to_pk( //pk_ser.as_mut_ptr(), ptr::null_mut(), &mut pk_aff.point, &self.value, ); } pk_aff } // Sign pub fn sign( &self, msg: &[u8], dst: &[u8], aug: &[u8], ) -> Signature { // TODO - would the user like the serialized/compressed sig as well? 
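            // Illustrative flow (commented sketch, added commentary): with a
            // hypothetical 32-byte `ikm` and application DST, key generation,
            // signing and verification chain together as
            //
            //     let sk = SecretKey::key_gen(&ikm, &[]).unwrap();
            //     let pk = sk.sk_to_pk();
            //     let sig = sk.sign(b"message", b"MY-APP-DST", &[]);
            //     assert_eq!(
            //         sig.verify(true, b"message", b"MY-APP-DST", &[], &pk, true),
            //         BLST_ERROR::BLST_SUCCESS,
            //     );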
let mut q = <$sig>::default(); let mut sig_aff = <$sig_aff>::default(); //let mut sig_ser = [0u8; $sig_ser_size]; unsafe { $hash_or_encode_to( &mut q, msg.as_ptr(), msg.len(), dst.as_ptr(), dst.len(), aug.as_ptr(), aug.len(), ); $sign(ptr::null_mut(), &mut sig_aff, &q, &self.value); } Signature { point: sig_aff } } // TODO - formally speaking application is entitled to have // ultimate control over secret key storage, which means that // corresponding serialization/deserialization subroutines // should accept reference to where to store the result, as // opposite to returning one. // serialize pub fn serialize(&self) -> [u8; 32] { let mut sk_out = [0; 32]; unsafe { blst_bendian_from_scalar(sk_out.as_mut_ptr(), &self.value); } sk_out } // deserialize pub fn deserialize(sk_in: &[u8]) -> Result { let mut sk = blst_scalar::default(); if sk_in.len() != 32 { return Err(BLST_ERROR::BLST_BAD_ENCODING); } unsafe { blst_scalar_from_bendian(&mut sk, sk_in.as_ptr()); if !blst_sk_check(&sk) { return Err(BLST_ERROR::BLST_BAD_ENCODING); } } Ok(Self { value: sk }) } pub fn to_bytes(&self) -> [u8; 32] { SecretKey::serialize(&self) } pub fn from_bytes(sk_in: &[u8]) -> Result { SecretKey::deserialize(sk_in) } } #[cfg(feature = "serde-secret")] impl Serialize for SecretKey { fn serialize( &self, ser: S, ) -> Result { let bytes = zeroize::Zeroizing::new(self.serialize()); ser.serialize_bytes(bytes.as_ref()) } } #[cfg(feature = "serde-secret")] impl<'de> Deserialize<'de> for SecretKey { fn deserialize>( deser: D, ) -> Result { let bytes: &[u8] = Deserialize::deserialize(deser)?; Self::deserialize(bytes).map_err(|e| { ::custom(format!("{:?}", e)) }) } } // From traits are not provided to discourage duplication // of the secret key material. impl<'a> From<&'a SecretKey> for &'a blst_scalar { fn from(sk: &'a SecretKey) -> Self { unsafe { transmute::<&SecretKey, Self>(sk) } } } impl<'a> core::convert::TryFrom<&'a blst_scalar> for &'a SecretKey { type Error = BLST_ERROR; fn try_from(sk: &'a blst_scalar) -> Result { unsafe { if !blst_sk_check(sk) { return Err(BLST_ERROR::BLST_BAD_ENCODING); } Ok(transmute::<&blst_scalar, Self>(sk)) } } } #[repr(transparent)] #[derive(Default, Debug, Clone, Copy)] pub struct PublicKey { point: $pk_aff, } impl PublicKey { // Core operations // key_validate pub fn validate(&self) -> Result<(), BLST_ERROR> { unsafe { if $pk_is_inf(&self.point) { return Err(BLST_ERROR::BLST_PK_IS_INFINITY); } if !$pk_in_group(&self.point) { return Err(BLST_ERROR::BLST_POINT_NOT_IN_GROUP); } } Ok(()) } pub fn key_validate(key: &[u8]) -> Result { let pk = PublicKey::from_bytes(key)?; pk.validate()?; Ok(pk) } pub fn from_aggregate(agg_pk: &AggregatePublicKey) -> Self { let mut pk_aff = <$pk_aff>::default(); unsafe { $pk_to_aff(&mut pk_aff, &agg_pk.point); } Self { point: pk_aff } } // Serdes pub fn compress(&self) -> [u8; $pk_comp_size] { let mut pk_comp = [0u8; $pk_comp_size]; unsafe { $pk_comp(pk_comp.as_mut_ptr(), &self.point); } pk_comp } pub fn serialize(&self) -> [u8; $pk_ser_size] { let mut pk_out = [0u8; $pk_ser_size]; unsafe { $pk_ser(pk_out.as_mut_ptr(), &self.point); } pk_out } pub fn uncompress(pk_comp: &[u8]) -> Result { if pk_comp.len() == $pk_comp_size && (pk_comp[0] & 0x80) != 0 { let mut pk = <$pk_aff>::default(); let err = unsafe { $pk_uncomp(&mut pk, pk_comp.as_ptr()) }; if err != BLST_ERROR::BLST_SUCCESS { return Err(err); } Ok(Self { point: pk }) } else { Err(BLST_ERROR::BLST_BAD_ENCODING) } } pub fn deserialize(pk_in: &[u8]) -> Result { if (pk_in.len() == $pk_ser_size && (pk_in[0] & 
0x80) == 0) || (pk_in.len() == $pk_comp_size && (pk_in[0] & 0x80) != 0) { let mut pk = <$pk_aff>::default(); let err = unsafe { $pk_deser(&mut pk, pk_in.as_ptr()) }; if err != BLST_ERROR::BLST_SUCCESS { return Err(err); } Ok(Self { point: pk }) } else { Err(BLST_ERROR::BLST_BAD_ENCODING) } } pub fn from_bytes(pk_in: &[u8]) -> Result { PublicKey::deserialize(pk_in) } pub fn to_bytes(&self) -> [u8; $pk_comp_size] { self.compress() } } // Trait for equality comparisons which are equivalence relations. // // This means, that in addition to a == b and a != b being strict // inverses, the equality must be reflexive, symmetric and transitive. impl Eq for PublicKey {} impl PartialEq for PublicKey { fn eq(&self, other: &Self) -> bool { unsafe { $pk_eq(&self.point, &other.point) } } } #[cfg(feature = "serde")] impl Serialize for PublicKey { fn serialize( &self, ser: S, ) -> Result { ser.serialize_bytes(&self.serialize()) } } #[cfg(feature = "serde")] impl<'de> Deserialize<'de> for PublicKey { fn deserialize>( deser: D, ) -> Result { let bytes: &[u8] = Deserialize::deserialize(deser)?; Self::deserialize(&bytes).map_err(|e| { ::custom(format!("{:?}", e)) }) } } impl From for $pk_aff { fn from(pk: PublicKey) -> Self { pk.point } } impl<'a> From<&'a PublicKey> for &'a $pk_aff { fn from(pk: &'a PublicKey) -> Self { &pk.point } } impl From<$pk_aff> for PublicKey { fn from(point: $pk_aff) -> Self { Self { point } } } #[repr(transparent)] #[derive(Debug, Clone, Copy)] pub struct AggregatePublicKey { point: $pk, } impl AggregatePublicKey { pub fn from_public_key(pk: &PublicKey) -> Self { let mut agg_pk = <$pk>::default(); unsafe { $pk_from_aff(&mut agg_pk, &pk.point); } Self { point: agg_pk } } pub fn to_public_key(&self) -> PublicKey { let mut pk = <$pk_aff>::default(); unsafe { $pk_to_aff(&mut pk, &self.point); } PublicKey { point: pk } } // Aggregate pub fn aggregate( pks: &[&PublicKey], pks_validate: bool, ) -> Result { if pks.len() == 0 { return Err(BLST_ERROR::BLST_AGGR_TYPE_MISMATCH); } if pks_validate { pks[0].validate()?; } let mut agg_pk = AggregatePublicKey::from_public_key(pks[0]); for s in pks.iter().skip(1) { if pks_validate { s.validate()?; } unsafe { $pk_add_or_dbl_aff( &mut agg_pk.point, &agg_pk.point, &s.point, ); } } Ok(agg_pk) } pub fn aggregate_with_randomness( pks: &[PublicKey], randomness: &[u8], nbits: usize, pks_groupcheck: bool, ) -> Result { if pks.len() == 0 { return Err(BLST_ERROR::BLST_AGGR_TYPE_MISMATCH); } if pks_groupcheck { pks.validate()?; } Ok(pks.mult(randomness, nbits)) } pub fn aggregate_serialized( pks: &[&[u8]], pks_validate: bool, ) -> Result { // TODO - threading if pks.len() == 0 { return Err(BLST_ERROR::BLST_AGGR_TYPE_MISMATCH); } let mut pk = if pks_validate { PublicKey::key_validate(pks[0])? } else { PublicKey::from_bytes(pks[0])? }; let mut agg_pk = AggregatePublicKey::from_public_key(&pk); for s in pks.iter().skip(1) { pk = if pks_validate { PublicKey::key_validate(s)? } else { PublicKey::from_bytes(s)? 
}; unsafe { $pk_add_or_dbl_aff( &mut agg_pk.point, &agg_pk.point, &pk.point, ); } } Ok(agg_pk) } pub fn add_aggregate(&mut self, agg_pk: &AggregatePublicKey) { unsafe { $pk_add_or_dbl(&mut self.point, &self.point, &agg_pk.point); } } pub fn sub_aggregate(&mut self, agg_pk: &AggregatePublicKey) { unsafe { let mut tmp = agg_pk.clone(); $pk_cneg(&mut tmp.point, true); $pk_add_or_dbl(&mut self.point, &self.point, &tmp.point); } } pub fn add_public_key( &mut self, pk: &PublicKey, pk_validate: bool, ) -> Result<(), BLST_ERROR> { if pk_validate { pk.validate()?; } unsafe { $pk_add_or_dbl_aff(&mut self.point, &self.point, &pk.point); } Ok(()) } } impl From for $pk { fn from(pk: AggregatePublicKey) -> Self { pk.point } } impl<'a> From<&'a AggregatePublicKey> for &'a $pk { fn from(pk: &'a AggregatePublicKey) -> Self { &pk.point } } impl From<$pk> for AggregatePublicKey { fn from(point: $pk) -> Self { Self { point } } } #[repr(transparent)] #[derive(Debug, Clone, Copy)] pub struct Signature { point: $sig_aff, } impl Signature { // sig_infcheck, check for infinity, is a way to avoid going // into resource-consuming verification. Passing 'false' is // always cryptographically safe, but application might want // to guard against obviously bogus individual[!] signatures. pub fn validate( &self, sig_infcheck: bool, ) -> Result<(), BLST_ERROR> { unsafe { if sig_infcheck && $sig_is_inf(&self.point) { return Err(BLST_ERROR::BLST_PK_IS_INFINITY); } if !$sig_in_group(&self.point) { return Err(BLST_ERROR::BLST_POINT_NOT_IN_GROUP); } } Ok(()) } pub fn sig_validate( sig: &[u8], sig_infcheck: bool, ) -> Result { let sig = Signature::from_bytes(sig)?; sig.validate(sig_infcheck)?; Ok(sig) } pub fn verify( &self, sig_groupcheck: bool, msg: &[u8], dst: &[u8], aug: &[u8], pk: &PublicKey, pk_validate: bool, ) -> BLST_ERROR { let aug_msg = [aug, msg].concat(); self.aggregate_verify( sig_groupcheck, &[aug_msg.as_slice()], dst, &[pk], pk_validate, ) } #[cfg(not(feature = "std"))] pub fn aggregate_verify( &self, sig_groupcheck: bool, msgs: &[&[u8]], dst: &[u8], pks: &[&PublicKey], pks_validate: bool, ) -> BLST_ERROR { let n_elems = pks.len(); if n_elems == 0 || msgs.len() != n_elems { return BLST_ERROR::BLST_VERIFY_FAIL; } let mut pairing = Pairing::new($hash_or_encode, dst); let err = pairing.aggregate( &pks[0].point, pks_validate, &self.point, sig_groupcheck, &msgs[0], &[], ); if err != BLST_ERROR::BLST_SUCCESS { return err; } for i in 1..n_elems { let err = pairing.aggregate( &pks[i].point, pks_validate, &unsafe { ptr::null::<$sig_aff>().as_ref() }, false, &msgs[i], &[], ); if err != BLST_ERROR::BLST_SUCCESS { return err; } } pairing.commit(); if pairing.finalverify(None) { BLST_ERROR::BLST_SUCCESS } else { BLST_ERROR::BLST_VERIFY_FAIL } } #[cfg(feature = "std")] pub fn aggregate_verify( &self, sig_groupcheck: bool, msgs: &[&[u8]], dst: &[u8], pks: &[&PublicKey], pks_validate: bool, ) -> BLST_ERROR { let n_elems = pks.len(); if n_elems == 0 || msgs.len() != n_elems { return BLST_ERROR::BLST_VERIFY_FAIL; } // TODO - check msg uniqueness? 
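            // Illustrative call (commented sketch, added commentary): with
            // one message per public key, paired up index-by-index,
            //
            //     let err = agg_sig.aggregate_verify(
            //         true, &msgs_refs, dst, &pks_refs, true,
            //     );
            //     assert_eq!(err, BLST_ERROR::BLST_SUCCESS);
            //
            // where `msgs_refs: Vec<&[u8]>` and `pks_refs: Vec<&PublicKey>`
            // are the caller's slices; message uniqueness is the caller's
            // responsibility (hence the TODO above).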
let pool = mt::da_pool(); let counter = Arc::new(AtomicUsize::new(0)); let valid = Arc::new(AtomicBool::new(true)); let n_workers = core::cmp::min(pool.max_count(), n_elems); let (tx, rx) = sync_channel(n_workers); for _ in 0..n_workers { let tx = tx.clone(); let counter = counter.clone(); let valid = valid.clone(); pool.joined_execute(move || { let mut pairing = Pairing::new($hash_or_encode, dst); while valid.load(Ordering::Relaxed) { let work = counter.fetch_add(1, Ordering::Relaxed); if work >= n_elems { break; } if pairing.aggregate( &pks[work].point, pks_validate, &unsafe { ptr::null::<$sig_aff>().as_ref() }, false, &msgs[work], &[], ) != BLST_ERROR::BLST_SUCCESS { valid.store(false, Ordering::Relaxed); break; } } if valid.load(Ordering::Relaxed) { pairing.commit(); } tx.send(pairing).expect("disaster"); }); } if sig_groupcheck && valid.load(Ordering::Relaxed) { match self.validate(false) { Err(_err) => valid.store(false, Ordering::Relaxed), _ => (), } } let mut gtsig = blst_fp12::default(); if valid.load(Ordering::Relaxed) { Pairing::aggregated(&mut gtsig, &self.point); } let mut acc = rx.recv().unwrap(); for _ in 1..n_workers { acc.merge(&rx.recv().unwrap()); } if valid.load(Ordering::Relaxed) && acc.finalverify(Some(>sig)) { BLST_ERROR::BLST_SUCCESS } else { BLST_ERROR::BLST_VERIFY_FAIL } } // pks are assumed to be verified for proof of possession, // which implies that they are already group-checked pub fn fast_aggregate_verify( &self, sig_groupcheck: bool, msg: &[u8], dst: &[u8], pks: &[&PublicKey], ) -> BLST_ERROR { let agg_pk = match AggregatePublicKey::aggregate(pks, false) { Ok(agg_sig) => agg_sig, Err(err) => return err, }; let pk = agg_pk.to_public_key(); self.aggregate_verify( sig_groupcheck, &[msg], dst, &[&pk], false, ) } pub fn fast_aggregate_verify_pre_aggregated( &self, sig_groupcheck: bool, msg: &[u8], dst: &[u8], pk: &PublicKey, ) -> BLST_ERROR { self.aggregate_verify(sig_groupcheck, &[msg], dst, &[pk], false) } // https://ethresear.ch/t/fast-verification-of-multiple-bls-signatures/5407 #[cfg(feature = "std")] #[allow(clippy::too_many_arguments)] pub fn verify_multiple_aggregate_signatures( msgs: &[&[u8]], dst: &[u8], pks: &[&PublicKey], pks_validate: bool, sigs: &[&Signature], sigs_groupcheck: bool, rands: &[blst_scalar], rand_bits: usize, ) -> BLST_ERROR { let n_elems = pks.len(); if n_elems == 0 || msgs.len() != n_elems || sigs.len() != n_elems || rands.len() != n_elems { return BLST_ERROR::BLST_VERIFY_FAIL; } // TODO - check msg uniqueness? let pool = mt::da_pool(); let counter = Arc::new(AtomicUsize::new(0)); let valid = Arc::new(AtomicBool::new(true)); let n_workers = core::cmp::min(pool.max_count(), n_elems); let (tx, rx) = sync_channel(n_workers); for _ in 0..n_workers { let tx = tx.clone(); let counter = counter.clone(); let valid = valid.clone(); pool.joined_execute(move || { let mut pairing = Pairing::new($hash_or_encode, dst); // TODO - engage multi-point mul-n-add for larger // amount of inputs... 
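                    // Added commentary: this is the randomized batch check
                    // from the ethresear.ch post linked above. Each
                    // signature's verification equation is folded in with an
                    // independent random scalar of `rand_bits` bits (from
                    // `rands`), so a single final exponentiation decides the
                    // whole batch, and a batch containing an invalid
                    // signature passes with probability only about
                    // 2^-rand_bits. Commented call sketch:
                    //
                    //     let err = Signature::verify_multiple_aggregate_signatures(
                    //         &msgs, dst, &pks, false, &sigs, true, &rands, 64,
                    //     );
                    //     assert_eq!(err, BLST_ERROR::BLST_SUCCESS);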
while valid.load(Ordering::Relaxed) { let work = counter.fetch_add(1, Ordering::Relaxed); if work >= n_elems { break; } if pairing.mul_n_aggregate( &pks[work].point, pks_validate, &sigs[work].point, sigs_groupcheck, &rands[work].b, rand_bits, msgs[work], &[], ) != BLST_ERROR::BLST_SUCCESS { valid.store(false, Ordering::Relaxed); break; } } if valid.load(Ordering::Relaxed) { pairing.commit(); } tx.send(pairing).expect("disaster"); }); } let mut acc = rx.recv().unwrap(); for _ in 1..n_workers { acc.merge(&rx.recv().unwrap()); } if valid.load(Ordering::Relaxed) && acc.finalverify(None) { BLST_ERROR::BLST_SUCCESS } else { BLST_ERROR::BLST_VERIFY_FAIL } } #[cfg(not(feature = "std"))] #[allow(clippy::too_many_arguments)] pub fn verify_multiple_aggregate_signatures( msgs: &[&[u8]], dst: &[u8], pks: &[&PublicKey], pks_validate: bool, sigs: &[&Signature], sigs_groupcheck: bool, rands: &[blst_scalar], rand_bits: usize, ) -> BLST_ERROR { let n_elems = pks.len(); if n_elems == 0 || msgs.len() != n_elems || sigs.len() != n_elems || rands.len() != n_elems { return BLST_ERROR::BLST_VERIFY_FAIL; } // TODO - check msg uniqueness? let mut pairing = Pairing::new($hash_or_encode, dst); for i in 0..n_elems { let err = pairing.mul_n_aggregate( &pks[i].point, pks_validate, &sigs[i].point, sigs_groupcheck, &rands[i].b, rand_bits, msgs[i], &[], ); if err != BLST_ERROR::BLST_SUCCESS { return err; } } pairing.commit(); if pairing.finalverify(None) { BLST_ERROR::BLST_SUCCESS } else { BLST_ERROR::BLST_VERIFY_FAIL } } pub fn from_aggregate(agg_sig: &AggregateSignature) -> Self { let mut sig_aff = <$sig_aff>::default(); unsafe { $sig_to_aff(&mut sig_aff, &agg_sig.point); } Self { point: sig_aff } } pub fn compress(&self) -> [u8; $sig_comp_size] { let mut sig_comp = [0; $sig_comp_size]; unsafe { $sig_comp(sig_comp.as_mut_ptr(), &self.point); } sig_comp } pub fn serialize(&self) -> [u8; $sig_ser_size] { let mut sig_out = [0; $sig_ser_size]; unsafe { $sig_ser(sig_out.as_mut_ptr(), &self.point); } sig_out } pub fn uncompress(sig_comp: &[u8]) -> Result { if sig_comp.len() == $sig_comp_size && (sig_comp[0] & 0x80) != 0 { let mut sig = <$sig_aff>::default(); let err = unsafe { $sig_uncomp(&mut sig, sig_comp.as_ptr()) }; if err != BLST_ERROR::BLST_SUCCESS { return Err(err); } Ok(Self { point: sig }) } else { Err(BLST_ERROR::BLST_BAD_ENCODING) } } pub fn deserialize(sig_in: &[u8]) -> Result { if (sig_in.len() == $sig_ser_size && (sig_in[0] & 0x80) == 0) || (sig_in.len() == $sig_comp_size && (sig_in[0] & 0x80) != 0) { let mut sig = <$sig_aff>::default(); let err = unsafe { $sig_deser(&mut sig, sig_in.as_ptr()) }; if err != BLST_ERROR::BLST_SUCCESS { return Err(err); } Ok(Self { point: sig }) } else { Err(BLST_ERROR::BLST_BAD_ENCODING) } } pub fn from_bytes(sig_in: &[u8]) -> Result { Signature::deserialize(sig_in) } pub fn to_bytes(&self) -> [u8; $sig_comp_size] { self.compress() } pub fn subgroup_check(&self) -> bool { unsafe { $sig_in_group(&self.point) } } } // Trait for equality comparisons which are equivalence relations. // // This means, that in addition to a == b and a != b being strict // inverses, the equality must be reflexive, symmetric and transitive. 
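        // Illustrative round-trip (commented sketch, added commentary):
        // compressed and uncompressed encodings are told apart by the top
        // bit of the first byte (the 0x80 checks above), so both paths
        // recover the same point:
        //
        //     let a = Signature::uncompress(&sig.compress()).unwrap();
        //     let b = Signature::deserialize(&sig.serialize()).unwrap();
        //     assert!(a == b && a == sig);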
impl Eq for Signature {} impl PartialEq for Signature { fn eq(&self, other: &Self) -> bool { unsafe { $sig_eq(&self.point, &other.point) } } } #[cfg(feature = "serde")] impl Serialize for Signature { fn serialize( &self, ser: S, ) -> Result { ser.serialize_bytes(&self.serialize()) } } #[cfg(feature = "serde")] impl<'de> Deserialize<'de> for Signature { fn deserialize>( deser: D, ) -> Result { let bytes: &[u8] = Deserialize::deserialize(deser)?; Self::deserialize(&bytes).map_err(|e| { ::custom(format!("{:?}", e)) }) } } impl From for $sig_aff { fn from(sig: Signature) -> Self { sig.point } } impl<'a> From<&'a Signature> for &'a $sig_aff { fn from(sig: &'a Signature) -> Self { &sig.point } } impl From<$sig_aff> for Signature { fn from(point: $sig_aff) -> Self { Self { point } } } #[repr(transparent)] #[derive(Debug, Clone, Copy)] pub struct AggregateSignature { point: $sig, } impl AggregateSignature { pub fn validate(&self) -> Result<(), BLST_ERROR> { unsafe { if !$sig_aggr_in_group(&self.point) { return Err(BLST_ERROR::BLST_POINT_NOT_IN_GROUP); } } Ok(()) } pub fn from_signature(sig: &Signature) -> Self { let mut agg_sig = <$sig>::default(); unsafe { $sig_from_aff(&mut agg_sig, &sig.point); } Self { point: agg_sig } } pub fn to_signature(&self) -> Signature { let mut sig = <$sig_aff>::default(); unsafe { $sig_to_aff(&mut sig, &self.point); } Signature { point: sig } } // Aggregate pub fn aggregate( sigs: &[&Signature], sigs_groupcheck: bool, ) -> Result { if sigs.len() == 0 { return Err(BLST_ERROR::BLST_AGGR_TYPE_MISMATCH); } if sigs_groupcheck { // We can't actually judge if input is individual or // aggregated signature, so we can't enforce infinity // check. sigs[0].validate(false)?; } let mut agg_sig = AggregateSignature::from_signature(sigs[0]); for s in sigs.iter().skip(1) { if sigs_groupcheck { s.validate(false)?; } unsafe { $sig_add_or_dbl_aff( &mut agg_sig.point, &agg_sig.point, &s.point, ); } } Ok(agg_sig) } pub fn aggregate_with_randomness( sigs: &[Signature], randomness: &[u8], nbits: usize, sigs_groupcheck: bool, ) -> Result { if sigs.len() == 0 { return Err(BLST_ERROR::BLST_AGGR_TYPE_MISMATCH); } if sigs_groupcheck { sigs.validate()?; } Ok(sigs.mult(randomness, nbits)) } pub fn aggregate_serialized( sigs: &[&[u8]], sigs_groupcheck: bool, ) -> Result { // TODO - threading if sigs.len() == 0 { return Err(BLST_ERROR::BLST_AGGR_TYPE_MISMATCH); } let mut sig = if sigs_groupcheck { Signature::sig_validate(sigs[0], false)? } else { Signature::from_bytes(sigs[0])? }; let mut agg_sig = AggregateSignature::from_signature(&sig); for s in sigs.iter().skip(1) { sig = if sigs_groupcheck { Signature::sig_validate(s, false)? } else { Signature::from_bytes(s)? 
}; unsafe { $sig_add_or_dbl_aff( &mut agg_sig.point, &agg_sig.point, &sig.point, ); } } Ok(agg_sig) } pub fn add_aggregate(&mut self, agg_sig: &AggregateSignature) { unsafe { $sig_add_or_dbl( &mut self.point, &self.point, &agg_sig.point, ); } } pub fn add_signature( &mut self, sig: &Signature, sig_groupcheck: bool, ) -> Result<(), BLST_ERROR> { if sig_groupcheck { sig.validate(false)?; } unsafe { $sig_add_or_dbl_aff( &mut self.point, &self.point, &sig.point, ); } Ok(()) } pub fn subgroup_check(&self) -> bool { unsafe { $sig_aggr_in_group(&self.point) } } } impl From for $sig { fn from(sig: AggregateSignature) -> Self { sig.point } } impl<'a> From<&'a AggregateSignature> for &'a $sig { fn from(sig: &'a AggregateSignature) -> Self { &sig.point } } impl From<$sig> for AggregateSignature { fn from(point: $sig) -> Self { Self { point } } } impl MultiPoint for [PublicKey] { type Output = AggregatePublicKey; fn mult(&self, scalars: &[u8], nbits: usize) -> Self::Output { Self::Output { point: unsafe { transmute::<&[_], &[$pk_aff]>(self) } .mult(scalars, nbits), } } fn add(&self) -> Self::Output { Self::Output { point: unsafe { transmute::<&[_], &[$pk_aff]>(self) } .add(), } } fn validate(&self) -> Result<(), BLST_ERROR> { unsafe { transmute::<&[_], &[$pk_aff]>(self) }.validate() } } impl MultiPoint for [Signature] { type Output = AggregateSignature; fn mult(&self, scalars: &[u8], nbits: usize) -> Self::Output { Self::Output { point: unsafe { transmute::<&[_], &[$sig_aff]>(self) } .mult(scalars, nbits), } } fn add(&self) -> Self::Output { Self::Output { point: unsafe { transmute::<&[_], &[$sig_aff]>(self) } .add(), } } fn validate(&self) -> Result<(), BLST_ERROR> { unsafe { transmute::<&[_], &[$sig_aff]>(self) }.validate() } } #[cfg(test)] mod tests { use super::*; use rand::{RngCore, SeedableRng}; use rand_chacha::ChaCha20Rng; // Testing only - do not use for production pub fn gen_random_key( rng: &mut rand_chacha::ChaCha20Rng, ) -> SecretKey { let mut ikm = [0u8; 32]; rng.fill_bytes(&mut ikm); let mut sk = ::default(); unsafe { blst_keygen(&mut sk, ikm.as_ptr(), 32, ptr::null(), 0); } SecretKey { value: sk } } #[test] fn test_sign_n_verify() { let ikm: [u8; 32] = [ 0x93, 0xad, 0x7e, 0x65, 0xde, 0xad, 0x05, 0x2a, 0x08, 0x3a, 0x91, 0x0c, 0x8b, 0x72, 0x85, 0x91, 0x46, 0x4c, 0xca, 0x56, 0x60, 0x5b, 0xb0, 0x56, 0xed, 0xfe, 0x2b, 0x60, 0xa6, 0x3c, 0x48, 0x99, ]; let sk = SecretKey::key_gen(&ikm, &[]).unwrap(); let pk = sk.sk_to_pk(); let dst = b"BLS_SIG_BLS12381G2_XMD:SHA-256_SSWU_RO_NUL_"; let msg = b"hello foo"; let sig = sk.sign(msg, dst, &[]); let err = sig.verify(true, msg, dst, &[], &pk, true); assert_eq!(err, BLST_ERROR::BLST_SUCCESS); } #[test] fn test_aggregate() { let num_msgs = 10; let dst = b"BLS_SIG_BLS12381G2_XMD:SHA-256_SSWU_RO_NUL_"; let seed = [0u8; 32]; let mut rng = ChaCha20Rng::from_seed(seed); let sks: Vec<_> = (0..num_msgs).map(|_| gen_random_key(&mut rng)).collect(); let pks = sks.iter().map(|sk| sk.sk_to_pk()).collect::>(); let pks_refs: Vec<&PublicKey> = pks.iter().map(|pk| pk).collect(); let pks_rev: Vec<&PublicKey> = pks.iter().rev().map(|pk| pk).collect(); let pk_comp = pks[0].compress(); let pk_uncomp = PublicKey::uncompress(&pk_comp); assert_eq!(pk_uncomp.is_ok(), true); let mut msgs: Vec> = vec![vec![]; num_msgs]; for i in 0..num_msgs { let msg_len = (rng.next_u64() & 0x3F) + 1; msgs[i] = vec![0u8; msg_len as usize]; rng.fill_bytes(&mut msgs[i]); } let msgs_refs: Vec<&[u8]> = msgs.iter().map(|m| m.as_slice()).collect(); let sigs = sks .iter() .zip(msgs.iter()) .map(|(sk, 
m)| (sk.sign(m, dst, &[]))) .collect::>(); let mut errs = sigs .iter() .zip(msgs.iter()) .zip(pks.iter()) .map(|((s, m), pk)| (s.verify(true, m, dst, &[], pk, true))) .collect::>(); assert_eq!(errs, vec![BLST_ERROR::BLST_SUCCESS; num_msgs]); // Swap message/public key pairs to create bad signature errs = sigs .iter() .zip(msgs.iter()) .zip(pks.iter().rev()) .map(|((s, m), pk)| (s.verify(true, m, dst, &[], pk, true))) .collect::>(); assert_ne!(errs, vec![BLST_ERROR::BLST_SUCCESS; num_msgs]); let sig_refs = sigs.iter().map(|s| s).collect::>(); let agg = match AggregateSignature::aggregate(&sig_refs, true) { Ok(agg) => agg, Err(err) => panic!("aggregate failure: {:?}", err), }; let agg_sig = agg.to_signature(); let mut result = agg_sig .aggregate_verify(false, &msgs_refs, dst, &pks_refs, false); assert_eq!(result, BLST_ERROR::BLST_SUCCESS); // Swap message/public key pairs to create bad signature result = agg_sig .aggregate_verify(false, &msgs_refs, dst, &pks_rev, false); assert_ne!(result, BLST_ERROR::BLST_SUCCESS); } #[test] fn test_multiple_agg_sigs() { let dst = b"BLS_SIG_BLS12381G2_XMD:SHA-256_SSWU_RO_POP_"; let num_pks_per_sig = 10; let num_sigs = 10; let seed = [0u8; 32]; let mut rng = ChaCha20Rng::from_seed(seed); let mut msgs: Vec> = vec![vec![]; num_sigs]; let mut sigs: Vec = Vec::with_capacity(num_sigs); let mut pks: Vec = Vec::with_capacity(num_sigs); let mut rands: Vec = Vec::with_capacity(num_sigs); for i in 0..num_sigs { // Create public keys let sks_i: Vec<_> = (0..num_pks_per_sig) .map(|_| gen_random_key(&mut rng)) .collect(); let pks_i = sks_i .iter() .map(|sk| sk.sk_to_pk()) .collect::>(); let pks_refs_i: Vec<&PublicKey> = pks_i.iter().map(|pk| pk).collect(); // Create random message for pks to all sign let msg_len = (rng.next_u64() & 0x3F) + 1; msgs[i] = vec![0u8; msg_len as usize]; rng.fill_bytes(&mut msgs[i]); // Generate signature for each key pair let sigs_i = sks_i .iter() .map(|sk| sk.sign(&msgs[i], dst, &[])) .collect::>(); // Test each current single signature let errs = sigs_i .iter() .zip(pks_i.iter()) .map(|(s, pk)| { (s.verify(true, &msgs[i], dst, &[], pk, true)) }) .collect::>(); assert_eq!( errs, vec![BLST_ERROR::BLST_SUCCESS; num_pks_per_sig] ); let sig_refs_i = sigs_i.iter().map(|s| s).collect::>(); let agg_i = match AggregateSignature::aggregate(&sig_refs_i, false) { Ok(agg_i) => agg_i, Err(err) => panic!("aggregate failure: {:?}", err), }; // Test current aggregate signature sigs.push(agg_i.to_signature()); let mut result = sigs[i].fast_aggregate_verify( false, &msgs[i], dst, &pks_refs_i, ); assert_eq!(result, BLST_ERROR::BLST_SUCCESS); // negative test if i != 0 { result = sigs[i - 1].fast_aggregate_verify( false, &msgs[i], dst, &pks_refs_i, ); assert_ne!(result, BLST_ERROR::BLST_SUCCESS); } // aggregate public keys and push into vec let agg_pk_i = match AggregatePublicKey::aggregate(&pks_refs_i, false) { Ok(agg_pk_i) => agg_pk_i, Err(err) => panic!("aggregate failure: {:?}", err), }; pks.push(agg_pk_i.to_public_key()); // Test current aggregate signature with aggregated pks result = sigs[i].fast_aggregate_verify_pre_aggregated( false, &msgs[i], dst, &pks[i], ); assert_eq!(result, BLST_ERROR::BLST_SUCCESS); // negative test if i != 0 { result = sigs[i - 1] .fast_aggregate_verify_pre_aggregated( false, &msgs[i], dst, &pks[i], ); assert_ne!(result, BLST_ERROR::BLST_SUCCESS); } // create random values let mut vals = [0u64; 4]; vals[0] = rng.next_u64(); while vals[0] == 0 { // Reject zero as it is used for multiplication. 
vals[0] = rng.next_u64(); } let mut rand_i = MaybeUninit::::uninit(); unsafe { blst_scalar_from_uint64( rand_i.as_mut_ptr(), vals.as_ptr(), ); rands.push(rand_i.assume_init()); } } let msgs_refs: Vec<&[u8]> = msgs.iter().map(|m| m.as_slice()).collect(); let sig_refs = sigs.iter().map(|s| s).collect::>(); let pks_refs: Vec<&PublicKey> = pks.iter().map(|pk| pk).collect(); let msgs_rev: Vec<&[u8]> = msgs.iter().rev().map(|m| m.as_slice()).collect(); let sig_rev = sigs.iter().rev().map(|s| s).collect::>(); let pks_rev: Vec<&PublicKey> = pks.iter().rev().map(|pk| pk).collect(); let mut result = Signature::verify_multiple_aggregate_signatures( &msgs_refs, dst, &pks_refs, false, &sig_refs, true, &rands, 64, ); assert_eq!(result, BLST_ERROR::BLST_SUCCESS); // negative tests (use reverse msgs, pks, and sigs) result = Signature::verify_multiple_aggregate_signatures( &msgs_rev, dst, &pks_refs, false, &sig_refs, true, &rands, 64, ); assert_ne!(result, BLST_ERROR::BLST_SUCCESS); result = Signature::verify_multiple_aggregate_signatures( &msgs_refs, dst, &pks_rev, false, &sig_refs, true, &rands, 64, ); assert_ne!(result, BLST_ERROR::BLST_SUCCESS); result = Signature::verify_multiple_aggregate_signatures( &msgs_refs, dst, &pks_refs, false, &sig_rev, true, &rands, 64, ); assert_ne!(result, BLST_ERROR::BLST_SUCCESS); } #[test] fn test_serialization() { let seed = [0u8; 32]; let mut rng = ChaCha20Rng::from_seed(seed); let sk = gen_random_key(&mut rng); let sk2 = gen_random_key(&mut rng); let pk = sk.sk_to_pk(); let pk_comp = pk.compress(); let pk_ser = pk.serialize(); let pk_uncomp = PublicKey::uncompress(&pk_comp); assert_eq!(pk_uncomp.is_ok(), true); assert_eq!(pk_uncomp.unwrap(), pk); let pk_deser = PublicKey::deserialize(&pk_ser); assert_eq!(pk_deser.is_ok(), true); assert_eq!(pk_deser.unwrap(), pk); let pk2 = sk2.sk_to_pk(); let pk_comp2 = pk2.compress(); let pk_ser2 = pk2.serialize(); let pk_uncomp2 = PublicKey::uncompress(&pk_comp2); assert_eq!(pk_uncomp2.is_ok(), true); assert_eq!(pk_uncomp2.unwrap(), pk2); let pk_deser2 = PublicKey::deserialize(&pk_ser2); assert_eq!(pk_deser2.is_ok(), true); assert_eq!(pk_deser2.unwrap(), pk2); assert_ne!(pk, pk2); assert_ne!(pk_uncomp.unwrap(), pk2); assert_ne!(pk_deser.unwrap(), pk2); assert_ne!(pk_uncomp2.unwrap(), pk); assert_ne!(pk_deser2.unwrap(), pk); } #[cfg(feature = "serde")] #[test] fn test_serde() { let seed = [0u8; 32]; let mut rng = ChaCha20Rng::from_seed(seed); // generate a sk, pk, and sig, and make sure it signs let sk = gen_random_key(&mut rng); let pk = sk.sk_to_pk(); let sig = sk.sign(b"asdf", b"qwer", b"zxcv"); assert_eq!( sig.verify(true, b"asdf", b"qwer", b"zxcv", &pk, true), BLST_ERROR::BLST_SUCCESS ); // roundtrip through serde let pk_ser = rmp_serde::encode::to_vec_named(&pk).expect("ser pk"); let sig_ser = rmp_serde::encode::to_vec_named(&sig).expect("ser sig"); let pk_des: PublicKey = rmp_serde::decode::from_slice(&pk_ser).expect("des pk"); let sig_des: Signature = rmp_serde::decode::from_slice(&sig_ser).expect("des sig"); // check that we got back the right things assert_eq!(pk, pk_des); assert_eq!(sig, sig_des); assert_eq!( sig.verify(true, b"asdf", b"qwer", b"zxcv", &pk_des, true), BLST_ERROR::BLST_SUCCESS ); assert_eq!( sig_des.verify(true, b"asdf", b"qwer", b"zxcv", &pk, true), BLST_ERROR::BLST_SUCCESS ); assert_eq!(sk.sign(b"asdf", b"qwer", b"zxcv"), sig_des); #[cfg(feature = "serde-secret")] if true { let sk_ser = rmp_serde::encode::to_vec_named(&sk).expect("ser sk"); let sk_des: SecretKey = 
rmp_serde::decode::from_slice(&sk_ser).expect("des sk"); // BLS signatures are deterministic, so this establishes // that sk == sk_des assert_eq!(sk_des.sign(b"asdf", b"qwer", b"zxcv"), sig); } } #[test] fn test_multi_point() { let dst = b"BLS_SIG_BLS12381G2_XMD:SHA-256_SSWU_RO_POP_"; let num_pks = 13; let seed = [0u8; 32]; let mut rng = ChaCha20Rng::from_seed(seed); // Create public keys let sks: Vec<_> = (0..num_pks).map(|_| gen_random_key(&mut rng)).collect(); let pks = sks.iter().map(|sk| sk.sk_to_pk()).collect::>(); let pks_refs: Vec<&PublicKey> = pks.iter().map(|pk| pk).collect(); // Create random message for pks to all sign let msg_len = (rng.next_u64() & 0x3F) + 1; let mut msg = vec![0u8; msg_len as usize]; rng.fill_bytes(&mut msg); // Generate signature for each key pair let sigs = sks .iter() .map(|sk| sk.sign(&msg, dst, &[])) .collect::>(); let sigs_refs: Vec<&Signature> = sigs.iter().map(|s| s).collect(); // create random values let mut rands: Vec = Vec::with_capacity(8 * num_pks); for _ in 0..num_pks { let mut r = rng.next_u64(); while r == 0 { // Reject zero as it is used for multiplication. r = rng.next_u64(); } rands.extend_from_slice(&r.to_le_bytes()); } // Sanity test each current single signature let errs = sigs .iter() .zip(pks.iter()) .map(|(s, pk)| (s.verify(true, &msg, dst, &[], pk, true))) .collect::>(); assert_eq!(errs, vec![BLST_ERROR::BLST_SUCCESS; num_pks]); // sanity test aggregated signature let agg_pk = AggregatePublicKey::aggregate(&pks_refs, false) .unwrap() .to_public_key(); let agg_sig = AggregateSignature::aggregate(&sigs_refs, false) .unwrap() .to_signature(); let err = agg_sig.verify(true, &msg, dst, &[], &agg_pk, true); assert_eq!(err, BLST_ERROR::BLST_SUCCESS); // test multi-point aggregation using add let agg_pk = pks.add().to_public_key(); let agg_sig = sigs.add().to_signature(); let err = agg_sig.verify(true, &msg, dst, &[], &agg_pk, true); assert_eq!(err, BLST_ERROR::BLST_SUCCESS); // test multi-point aggregation using mult let agg_pk = pks.mult(&rands, 64).to_public_key(); let agg_sig = sigs.mult(&rands, 64).to_signature(); let err = agg_sig.verify(true, &msg, dst, &[], &agg_pk, true); assert_eq!(err, BLST_ERROR::BLST_SUCCESS); } } }; } pub mod min_pk { use super::*; sig_variant_impl!( "MinPk", blst_p1, blst_p1_affine, blst_p2, blst_p2_affine, blst_sk_to_pk2_in_g1, true, blst_hash_to_g2, blst_sign_pk2_in_g1, blst_p1_affine_is_equal, blst_p2_affine_is_equal, blst_core_verify_pk_in_g1, blst_p1_affine_in_g1, blst_p1_to_affine, blst_p1_from_affine, blst_p1_affine_serialize, blst_p1_affine_compress, blst_p1_deserialize, blst_p1_uncompress, 48, 96, blst_p2_affine_in_g2, blst_p2_to_affine, blst_p2_from_affine, blst_p2_affine_serialize, blst_p2_affine_compress, blst_p2_deserialize, blst_p2_uncompress, 96, 192, blst_p1_add_or_double, blst_p1_add_or_double_affine, blst_p1_cneg, blst_p2_add_or_double, blst_p2_add_or_double_affine, blst_p1_affine_is_inf, blst_p2_affine_is_inf, blst_p2_in_g2, ); } pub mod min_sig { use super::*; sig_variant_impl!( "MinSig", blst_p2, blst_p2_affine, blst_p1, blst_p1_affine, blst_sk_to_pk2_in_g2, true, blst_hash_to_g1, blst_sign_pk2_in_g2, blst_p2_affine_is_equal, blst_p1_affine_is_equal, blst_core_verify_pk_in_g2, blst_p2_affine_in_g2, blst_p2_to_affine, blst_p2_from_affine, blst_p2_affine_serialize, blst_p2_affine_compress, blst_p2_deserialize, blst_p2_uncompress, 96, 192, blst_p1_affine_in_g1, blst_p1_to_affine, blst_p1_from_affine, blst_p1_affine_serialize, blst_p1_affine_compress, blst_p1_deserialize, blst_p1_uncompress, 
        48,
        96,
        blst_p2_add_or_double,
        blst_p2_add_or_double_affine,
        blst_p2_cneg,
        blst_p1_add_or_double,
        blst_p1_add_or_double_affine,
        blst_p2_affine_is_inf,
        blst_p1_affine_is_inf,
        blst_p1_in_g1,
    );
}

pub trait MultiPoint {
    type Output;

    fn mult(&self, scalars: &[u8], nbits: usize) -> Self::Output;
    fn add(&self) -> Self::Output;
    fn validate(&self) -> Result<(), BLST_ERROR> {
        Err(BLST_ERROR::BLST_POINT_NOT_IN_GROUP)
    }
}

#[cfg(feature = "std")]
include!("pippenger.rs");

#[cfg(not(feature = "std"))]
include!("pippenger-no_std.rs");

#[cfg(test)]
mod fp12_test {
    use super::*;
    use rand::{RngCore, SeedableRng};
    use rand_chacha::ChaCha20Rng;

    #[test]
    fn miller_loop_n() {
        const npoints: usize = 97;
        const nbits: usize = 64;
        const nbytes: usize = (nbits + 7) / 8;

        let mut scalars = Box::new([0u8; nbytes * npoints]);
        ChaCha20Rng::from_entropy().fill_bytes(scalars.as_mut());

        let mut p1s: Vec<blst_p1> = Vec::with_capacity(npoints);
        let mut p2s: Vec<blst_p2> = Vec::with_capacity(npoints);

        unsafe {
            p1s.set_len(npoints);
            p2s.set_len(npoints);
            for i in 0..npoints {
                blst_p1_mult(
                    &mut p1s[i],
                    blst_p1_generator(),
                    &scalars[i * nbytes],
                    32,
                );
                blst_p2_mult(
                    &mut p2s[i],
                    blst_p2_generator(),
                    &scalars[i * nbytes + 4],
                    32,
                );
            }
        }

        let ps = p1_affines::from(&p1s);
        let qs = p2_affines::from(&p2s);

        let mut naive = blst_fp12::default();
        for i in 0..npoints {
            naive *= blst_fp12::miller_loop(&qs[i], &ps[i]);
        }

        assert_eq!(
            naive,
            blst_fp12::miller_loop_n(qs.as_slice(), ps.as_slice())
        );
    }
}

#[cfg(test)]
mod sk_test {
    use super::*;
    use rand::{RngCore, SeedableRng};
    use rand_chacha::ChaCha20Rng;

    #[test]
    fn inverse() {
        let mut bytes = [0u8; 64];
        ChaCha20Rng::from_entropy().fill_bytes(bytes.as_mut());

        let mut sk = blst_scalar::default();
        let mut p1 = blst_p1::default();
        let mut p2 = blst_p2::default();

        unsafe {
            blst_scalar_from_be_bytes(&mut sk, bytes.as_ptr(), bytes.len());
            blst_p1_mult(&mut p1, blst_p1_generator(), sk.b.as_ptr(), 255);
            blst_sk_inverse(&mut sk, &sk);
            blst_p1_mult(&mut p1, &p1, sk.b.as_ptr(), 255);
            blst_p2_mult(&mut p2, blst_p2_generator(), sk.b.as_ptr(), 255);
            blst_sk_inverse(&mut sk, &sk);
            blst_p2_mult(&mut p2, &p2, sk.b.as_ptr(), 255);
        }

        assert_eq!(p1, unsafe { *blst_p1_generator() });
        assert_eq!(p2, unsafe { *blst_p2_generator() });
    }
}

================================================
FILE: bindings/rust/src/pippenger-no_std.rs
================================================
// Copyright Supranational LLC
// Licensed under the Apache License, Version 2.0, see LICENSE for details.
// SPDX-License-Identifier: Apache-2.0

use core::ops::{Index, IndexMut};
use core::slice::SliceIndex;

macro_rules!
pippenger_mult_impl { ( $points:ident, $point:ty, $point_affine:ty, $to_affines:ident, $scratch_sizeof:ident, $multi_scalar_mult:ident, $tile_mult:ident, $add_or_double:ident, $double:ident, $test_mod:ident, $generator:ident, $mult:ident, $add:ident, $is_inf:ident, $in_group:ident, ) => { pub struct $points { points: Vec<$point_affine>, } impl> Index for $points { type Output = I::Output; #[inline] fn index(&self, i: I) -> &Self::Output { &self.points[i] } } impl> IndexMut for $points { #[inline] fn index_mut(&mut self, i: I) -> &mut Self::Output { &mut self.points[i] } } impl $points { #[inline] pub fn as_slice(&self) -> &[$point_affine] { self.points.as_slice() } pub fn from(points: &[$point]) -> Self { let npoints = points.len(); let mut ret = Self { points: Vec::with_capacity(npoints), }; #[allow(clippy::uninit_vec)] unsafe { ret.points.set_len(npoints) }; let p: [*const $point; 2] = [&points[0], ptr::null()]; unsafe { $to_affines(&mut ret.points[0], &p[0], npoints) }; ret } #[inline] pub fn mult(&self, scalars: &[u8], nbits: usize) -> $point { self.as_slice().mult(scalars, nbits) } #[inline] pub fn add(&self) -> $point { self.as_slice().add() } } impl MultiPoint for [$point_affine] { type Output = $point; fn mult(&self, scalars: &[u8], nbits: usize) -> $point { let npoints = self.len(); let nbytes = (nbits + 7) / 8; if scalars.len() < nbytes * npoints { panic!("scalars length mismatch"); } let p: [*const $point_affine; 2] = [&self[0], ptr::null()]; let s: [*const u8; 2] = [&scalars[0], ptr::null()]; let mut ret = <$point>::default(); unsafe { let mut scratch: Vec = Vec::with_capacity($scratch_sizeof(npoints) / 8); #[allow(clippy::uninit_vec)] scratch.set_len(scratch.capacity()); $multi_scalar_mult( &mut ret, &p[0], npoints, &s[0], nbits, &mut scratch[0], ); } ret } fn add(&self) -> $point { let npoints = self.len(); let p: [*const _; 2] = [&self[0], ptr::null()]; let mut ret = <$point>::default(); unsafe { $add(&mut ret, &p[0], npoints) }; ret } fn validate(&self) -> Result<(), BLST_ERROR> { for i in 0..self.len() { if unsafe { $is_inf(&self[i]) } { return Err(BLST_ERROR::BLST_PK_IS_INFINITY); } if !unsafe { $in_group(&self[i]) } { return Err(BLST_ERROR::BLST_POINT_NOT_IN_GROUP); } } Ok(()) } } #[cfg(test)] pippenger_test_mod!( $test_mod, $points, $point, $add_or_double, $generator, $mult, ); }; } #[cfg(test)] include!("pippenger-test_mod.rs"); pippenger_mult_impl!( p1_affines, blst_p1, blst_p1_affine, blst_p1s_to_affine, blst_p1s_mult_pippenger_scratch_sizeof, blst_p1s_mult_pippenger, blst_p1s_tile_pippenger, blst_p1_add_or_double, blst_p1_double, p1_multi_point, blst_p1_generator, blst_p1_mult, blst_p1s_add, blst_p1_affine_is_inf, blst_p1_affine_in_g1, ); pippenger_mult_impl!( p2_affines, blst_p2, blst_p2_affine, blst_p2s_to_affine, blst_p2s_mult_pippenger_scratch_sizeof, blst_p2s_mult_pippenger, blst_p2s_tile_pippenger, blst_p2_add_or_double, blst_p2_double, p2_multi_point, blst_p2_generator, blst_p2_mult, blst_p2s_add, blst_p2_affine_is_inf, blst_p2_affine_in_g2, ); ================================================ FILE: bindings/rust/src/pippenger-test_mod.rs ================================================ // Copyright Supranational LLC // Licensed under the Apache License, Version 2.0, see LICENSE for details. // SPDX-License-Identifier: Apache-2.0 macro_rules! 
pippenger_test_mod { ( $test_mod:ident, $points:ident, $point:ty, $add_or_double:ident, $generator:ident, $mult:ident, ) => { mod $test_mod { use super::*; use rand::{RngCore, SeedableRng}; use rand_chacha::ChaCha20Rng; #[test] fn test_mult() { const npoints: usize = 2000; const nbits: usize = 160; const nbytes: usize = (nbits + 7) / 8; let mut scalars = Box::new([0u8; nbytes * npoints]); ChaCha20Rng::from_seed([0u8; 32]).fill_bytes(scalars.as_mut()); let mut points: Vec<$point> = Vec::with_capacity(npoints); unsafe { points.set_len(points.capacity()) }; let mut naive = <$point>::default(); for i in 0..npoints { unsafe { let mut t = <$point>::default(); $mult( &mut points[i], $generator(), &scalars[i * nbytes], core::cmp::min(32, nbits), ); $mult(&mut t, &points[i], &scalars[i * nbytes], nbits); $add_or_double(&mut naive, &naive, &t); } if i < 27 { let points = $points::from(&points[0..i + 1]); assert_eq!(naive, points.mult(scalars.as_ref(), nbits)); } } let points = $points::from(&points); assert_eq!(naive, points.mult(scalars.as_ref(), nbits)); } #[test] fn test_add() { const npoints: usize = 2000; const nbits: usize = 32; const nbytes: usize = (nbits + 7) / 8; let mut scalars = Box::new([0u8; nbytes * npoints]); ChaCha20Rng::from_seed([0u8; 32]).fill_bytes(scalars.as_mut()); let mut points: Vec<$point> = Vec::with_capacity(npoints); unsafe { points.set_len(points.capacity()) }; let mut naive = <$point>::default(); for i in 0..npoints { unsafe { $mult( &mut points[i], $generator(), &scalars[i * nbytes], 32, ); $add_or_double(&mut naive, &naive, &points[i]); } } let points = $points::from(&points); assert_eq!(naive, points.add()); } } }; } ================================================ FILE: bindings/rust/src/pippenger.rs ================================================ // Copyright Supranational LLC // Licensed under the Apache License, Version 2.0, see LICENSE for details. // SPDX-License-Identifier: Apache-2.0 use core::num::Wrapping; use core::ops::{Index, IndexMut}; use core::slice::SliceIndex; use std::sync::Barrier; struct tile { x: usize, dx: usize, y: usize, dy: usize, } // Minimalist core::cell::Cell stand-in, but with Sync marker, which // makes it possible to pass it to multiple threads. It works, because // *here* each Cell is written only once and by just one thread. #[repr(transparent)] struct Cell { value: T, } unsafe impl Sync for Cell {} impl Cell { pub fn as_ptr(&self) -> *mut T { &self.value as *const T as *mut T } } macro_rules! 
pippenger_mult_impl { ( $points:ident, $point:ty, $point_affine:ty, $to_affines:ident, $scratch_sizeof:ident, $multi_scalar_mult:ident, $tile_mult:ident, $add_or_double:ident, $double:ident, $test_mod:ident, $generator:ident, $mult:ident, $add:ident, $is_inf:ident, $in_group:ident, $from_affine:ident, ) => { pub struct $points { points: Vec<$point_affine>, } impl> Index for $points { type Output = I::Output; #[inline] fn index(&self, i: I) -> &Self::Output { &self.points[i] } } impl> IndexMut for $points { #[inline] fn index_mut(&mut self, i: I) -> &mut Self::Output { &mut self.points[i] } } impl $points { #[inline] pub fn as_slice(&self) -> &[$point_affine] { self.points.as_slice() } pub fn from(points: &[$point]) -> Self { let npoints = points.len(); let mut ret = Self { points: Vec::with_capacity(npoints), }; unsafe { ret.points.set_len(npoints) }; let pool = mt::da_pool(); let ncpus = pool.max_count(); if ncpus < 2 || npoints < 768 { let p: [*const $point; 2] = [&points[0], ptr::null()]; unsafe { $to_affines(&mut ret.points[0], &p[0], npoints) }; return ret; } let mut nslices = (npoints + 511) / 512; nslices = core::cmp::min(nslices, ncpus); let wg = Arc::new((Barrier::new(2), AtomicUsize::new(nslices))); let (mut delta, mut rem) = (npoints / nslices + 1, Wrapping(npoints % nslices)); let mut x = 0usize; while x < npoints { let out = &mut ret.points[x]; let inp = &points[x]; delta -= (rem == Wrapping(0)) as usize; rem -= Wrapping(1); x += delta; let wg = wg.clone(); pool.joined_execute(move || { let p: [*const $point; 2] = [inp, ptr::null()]; unsafe { $to_affines(out, &p[0], delta) }; if wg.1.fetch_sub(1, Ordering::AcqRel) == 1 { wg.0.wait(); } }); } wg.0.wait(); ret } #[inline] pub fn mult(&self, scalars: &[u8], nbits: usize) -> $point { self.as_slice().mult(scalars, nbits) } #[inline] pub fn add(&self) -> $point { self.as_slice().add() } } impl MultiPoint for [$point_affine] { type Output = $point; fn mult(&self, scalars: &[u8], nbits: usize) -> $point { let npoints = self.len(); let nbytes = (nbits + 7) / 8; if scalars.len() < nbytes * npoints { panic!("scalars length mismatch"); } let pool = mt::da_pool(); let ncpus = pool.max_count(); if ncpus < 2 { let p: [*const $point_affine; 2] = [&self[0], ptr::null()]; let s: [*const u8; 2] = [&scalars[0], ptr::null()]; unsafe { let mut scratch: Vec = Vec::with_capacity($scratch_sizeof(npoints) / 8); #[allow(clippy::uninit_vec)] scratch.set_len(scratch.capacity()); let mut ret = <$point>::default(); $multi_scalar_mult( &mut ret, &p[0], npoints, &s[0], nbits, &mut scratch[0], ); return ret; } } if npoints < 32 { let counter = Arc::new(AtomicUsize::new(0)); let n_workers = core::cmp::min(ncpus, npoints); let (tx, rx) = sync_channel(n_workers); for _ in 0..n_workers { let tx = tx.clone(); let counter = counter.clone(); pool.joined_execute(move || { let mut acc = <$point>::default(); let mut tmp = <$point>::default(); let mut first = true; loop { let work = counter.fetch_add(1, Ordering::Relaxed); if work >= npoints { break; } unsafe { $from_affine(&mut tmp, &self[work]); let scalar = &scalars[nbytes * work]; if first { $mult(&mut acc, &tmp, scalar, nbits); first = false; } else { $mult(&mut tmp, &tmp, scalar, nbits); $add_or_double(&mut acc, &acc, &tmp); } } } tx.send(acc).expect("disaster"); }); } let mut ret = rx.recv().expect("disaster"); for _ in 1..n_workers { let p = rx.recv().expect("disaster"); unsafe { $add_or_double(&mut ret, &ret, &p) }; } return ret; } let (nx, ny, window) = breakdown(nbits, pippenger_window_size(npoints), ncpus); 
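                // Added commentary: breakdown() splits the scalars' nbits
                // into ny rows of (at most) `window` bits and the points
                // into nx column chunks, so the nx*ny tiles assembled below
                // are independent bucket-method tasks; their results are
                // folded in row by row, with `window` doublings of the
                // accumulator between rows (the $double loop further down).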
// |grid[]| holds "coordinates" and place for result let mut grid: Vec<(tile, Cell<$point>)> = Vec::with_capacity(nx * ny); #[allow(clippy::uninit_vec)] unsafe { grid.set_len(grid.capacity()) }; let dx = npoints / nx; let mut y = window * (ny - 1); let mut total = 0usize; while total < nx { grid[total].0.x = total * dx; grid[total].0.dx = dx; grid[total].0.y = y; grid[total].0.dy = nbits - y; total += 1; } grid[total - 1].0.dx = npoints - grid[total - 1].0.x; while y != 0 { y -= window; for i in 0..nx { grid[total].0.x = grid[i].0.x; grid[total].0.dx = grid[i].0.dx; grid[total].0.y = y; grid[total].0.dy = window; total += 1; } } let grid = &grid[..]; let points = &self[..]; let sz = unsafe { $scratch_sizeof(0) / 8 }; let mut row_sync: Vec = Vec::with_capacity(ny); row_sync.resize_with(ny, Default::default); let row_sync = Arc::new(row_sync); let counter = Arc::new(AtomicUsize::new(0)); let n_workers = core::cmp::min(ncpus, total); let (tx, rx) = sync_channel(n_workers); for _ in 0..n_workers { let tx = tx.clone(); let counter = counter.clone(); let row_sync = row_sync.clone(); pool.joined_execute(move || { let mut scratch = vec![0u64; sz << (window - 1)]; let mut p: [*const $point_affine; 2] = [ptr::null(), ptr::null()]; let mut s: [*const u8; 2] = [ptr::null(), ptr::null()]; loop { let work = counter.fetch_add(1, Ordering::Relaxed); if work >= total { break; } let x = grid[work].0.x; let y = grid[work].0.y; p[0] = &points[x]; s[0] = &scalars[x * nbytes]; unsafe { $tile_mult( grid[work].1.as_ptr(), &p[0], grid[work].0.dx, &s[0], nbits, &mut scratch[0], y, window, ); } if row_sync[y / window] .fetch_add(1, Ordering::AcqRel) == nx - 1 { tx.send(y).expect("disaster"); } } }); } let mut ret = <$point>::default(); let mut rows = vec![false; ny]; let mut row = 0usize; for _ in 0..ny { let mut y = rx.recv().unwrap(); rows[y / window] = true; while grid[row].0.y == y { while row < total && grid[row].0.y == y { unsafe { $add_or_double( &mut ret, &ret, grid[row].1.as_ptr(), ); } row += 1; } if y == 0 { break; } for _ in 0..window { unsafe { $double(&mut ret, &ret) }; } y -= window; if !rows[y / window] { break; } } } ret } fn add(&self) -> $point { let npoints = self.len(); let pool = mt::da_pool(); let ncpus = pool.max_count(); if ncpus < 2 || npoints < 384 { let p: [*const _; 2] = [&self[0], ptr::null()]; let mut ret = <$point>::default(); unsafe { $add(&mut ret, &p[0], npoints) }; return ret; } let counter = Arc::new(AtomicUsize::new(0)); let nchunks = (npoints + 255) / 256; let chunk = npoints / nchunks + 1; let n_workers = core::cmp::min(ncpus, nchunks); let (tx, rx) = sync_channel(n_workers); for _ in 0..n_workers { let tx = tx.clone(); let counter = counter.clone(); pool.joined_execute(move || { let mut acc = <$point>::default(); let mut chunk = chunk; let mut p: [*const _; 2] = [ptr::null(), ptr::null()]; loop { let work = counter.fetch_add(chunk, Ordering::Relaxed); if work >= npoints { break; } p[0] = &self[work]; if work + chunk > npoints { chunk = npoints - work; } unsafe { let mut t = MaybeUninit::<$point>::uninit(); $add(t.as_mut_ptr(), &p[0], chunk); $add_or_double(&mut acc, &acc, t.as_ptr()); }; } tx.send(acc).expect("disaster"); }); } let mut ret = rx.recv().unwrap(); for _ in 1..n_workers { unsafe { $add_or_double(&mut ret, &ret, &rx.recv().unwrap()) }; } ret } fn validate(&self) -> Result<(), BLST_ERROR> { fn check(point: &$point_affine) -> Result<(), BLST_ERROR> { if unsafe { $is_inf(point) } { return Err(BLST_ERROR::BLST_PK_IS_INFINITY); } if !unsafe { $in_group(point) } { 
                        return Err(BLST_ERROR::BLST_POINT_NOT_IN_GROUP);
                    }
                    Ok(())
                }

                let npoints = self.len();

                let pool = mt::da_pool();
                let n_workers = core::cmp::min(npoints, pool.max_count());
                if n_workers < 2 {
                    for i in 0..npoints {
                        check(&self[i])?;
                    }
                    return Ok(());
                }

                let counter = Arc::new(AtomicUsize::new(0));
                let valid = Arc::new(AtomicBool::new(true));
                let wg =
                    Arc::new((Barrier::new(2), AtomicUsize::new(n_workers)));

                for _ in 0..n_workers {
                    let counter = counter.clone();
                    let valid = valid.clone();
                    let wg = wg.clone();

                    pool.joined_execute(move || {
                        while valid.load(Ordering::Relaxed) {
                            let work = counter.fetch_add(1, Ordering::Relaxed);
                            if work >= npoints {
                                break;
                            }
                            if check(&self[work]).is_err() {
                                valid.store(false, Ordering::Relaxed);
                                break;
                            }
                        }
                        if wg.1.fetch_sub(1, Ordering::AcqRel) == 1 {
                            wg.0.wait();
                        }
                    });
                }
                wg.0.wait();

                if valid.load(Ordering::Relaxed) {
                    Ok(())
                } else {
                    Err(BLST_ERROR::BLST_POINT_NOT_IN_GROUP)
                }
            }
        }

        #[cfg(test)]
        pippenger_test_mod!(
            $test_mod,
            $points,
            $point,
            $add_or_double,
            $generator,
            $mult,
        );
    };
}

#[cfg(test)]
include!("pippenger-test_mod.rs");

pippenger_mult_impl!(
    p1_affines,
    blst_p1,
    blst_p1_affine,
    blst_p1s_to_affine,
    blst_p1s_mult_pippenger_scratch_sizeof,
    blst_p1s_mult_pippenger,
    blst_p1s_tile_pippenger,
    blst_p1_add_or_double,
    blst_p1_double,
    p1_multi_point,
    blst_p1_generator,
    blst_p1_mult,
    blst_p1s_add,
    blst_p1_affine_is_inf,
    blst_p1_affine_in_g1,
    blst_p1_from_affine,
);

pippenger_mult_impl!(
    p2_affines,
    blst_p2,
    blst_p2_affine,
    blst_p2s_to_affine,
    blst_p2s_mult_pippenger_scratch_sizeof,
    blst_p2s_mult_pippenger,
    blst_p2s_tile_pippenger,
    blst_p2_add_or_double,
    blst_p2_double,
    p2_multi_point,
    blst_p2_generator,
    blst_p2_mult,
    blst_p2s_add,
    blst_p2_affine_is_inf,
    blst_p2_affine_in_g2,
    blst_p2_from_affine,
);

fn num_bits(l: usize) -> usize {
    8 * core::mem::size_of_val(&l) - l.leading_zeros() as usize
}

fn breakdown(
    nbits: usize,
    window: usize,
    ncpus: usize,
) -> (usize, usize, usize) {
    let mut nx: usize;
    let mut wnd: usize;

    if nbits > window * ncpus {
        nx = 1;
        wnd = num_bits(ncpus / 4);
        if (window + wnd) > 18 {
            wnd = window - wnd;
        } else {
            wnd = (nbits / window + ncpus - 1) / ncpus;
            if (nbits / (window + 1) + ncpus - 1) / ncpus < wnd {
                wnd = window + 1;
            } else {
                wnd = window;
            }
        }
    } else {
        nx = 2;
        wnd = window - 2;
        while (nbits / wnd + 1) * nx < ncpus {
            nx += 1;
            wnd = window - num_bits(3 * nx / 2);
        }
        nx -= 1;
        wnd = window - num_bits(3 * nx / 2);
    }
    let ny = nbits / wnd + 1;
    wnd = nbits / ny + 1;

    (nx, ny, wnd)
}

fn pippenger_window_size(npoints: usize) -> usize {
    let wbits = num_bits(npoints);

    if wbits > 13 {
        return wbits - 4;
    }
    if wbits > 5 {
        return wbits - 3;
    }
    2
}

================================================
FILE: bindings/vectors/hash_to_curve/BLS12381G1_XMD_SHA-256_SSWU_NU_.json
================================================
{ "L": "0x40", "Z": "0xb", "ciphersuite": "BLS12381G1_XMD:SHA-256_SSWU_NU_", "curve": "BLS12-381 G1", "dst": "QUUX-V01-CS02-with-BLS12381G1_XMD:SHA-256_SSWU_NU_", "expand": "XMD", "field": { "m": "0x1", "p": "0x1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffffffffaaab" }, "hash": "sha256", "k": "0x80", "map": { "name": "SSWU" }, "randomOracle": false, "vectors": [ { "P": { "x": "0x184bb665c37ff561a89ec2122dd343f20e0f4cbcaec84e3c3052ea81d1834e192c426074b02ed3dca4e7676ce4ce48ba", "y": "0x04407b8d35af4dacc809927071fc0405218f1401a6d15af775810e4e460064bcc9468beeba82fdc751be70476c888bf3" }, "Q": { "x":
"0x11398d3b324810a1b093f8e35aa8571cced95858207e7f49c4fd74656096d61d8a2f9a23cdb18a4dd11cd1d66f41f709", "y": "0x19316b6fb2ba7717355d5d66a361899057e1e84a6823039efc7beccefe09d023fb2713b1c415fcf278eb0c39a89b4f72" }, "msg": "", "u": [ "0x156c8a6a2c184569d69a76be144b5cdc5141d2d2ca4fe341f011e25e3969c55ad9e9b9ce2eb833c81a908e5fa4ac5f03" ] }, { "P": { "x": "0x009769f3ab59bfd551d53a5f846b9984c59b97d6842b20a2c565baa167945e3d026a3755b6345df8ec7e6acb6868ae6d", "y": "0x1532c00cf61aa3d0ce3e5aa20c3b531a2abd2c770a790a2613818303c6b830ffc0ecf6c357af3317b9575c567f11cd2c" }, "Q": { "x": "0x1998321bc27ff6d71df3051b5aec12ff47363d81a5e9d2dff55f444f6ca7e7d6af45c56fd029c58237c266ef5cda5254", "y": "0x034d274476c6307ae584f951c82e7ea85b84f72d28f4d6471732356121af8d62a49bc263e8eb913a6cf6f125995514ee" }, "msg": "abc", "u": [ "0x147e1ed29f06e4c5079b9d14fc89d2820d32419b990c1c7bb7dbea2a36a045124b31ffbde7c99329c05c559af1c6cc82" ] }, { "P": { "x": "0x1974dbb8e6b5d20b84df7e625e2fbfecb2cdb5f77d5eae5fb2955e5ce7313cae8364bc2fff520a6c25619739c6bdcb6a", "y": "0x15f9897e11c6441eaa676de141c8d83c37aab8667173cbe1dfd6de74d11861b961dccebcd9d289ac633455dfcc7013a3" }, "Q": { "x": "0x17d502fa43bd6a4cad2859049a0c3ecefd60240d129be65da271a4c03a9c38fa78163b9d2a919d2beb57df7d609b4919", "y": "0x109019902ae93a8732abecf2ff7fecd2e4e305eb91f41c9c3267f16b6c19de138c7272947f25512745da6c466cdfd1ac" }, "msg": "abcdef0123456789", "u": [ "0x04090815ad598a06897dd89bcda860f25837d54e897298ce31e6947378134d3761dc59a572154963e8c954919ecfa82d" ] }, { "P": { "x": "0x0a7a047c4a8397b3446450642c2ac64d7239b61872c9ae7a59707a8f4f950f101e766afe58223b3bff3a19a7f754027c", "y": "0x1383aebba1e4327ccff7cf9912bda0dbc77de048b71ef8c8a81111d71dc33c5e3aa6edee9cf6f5fe525d50cc50b77cc9" }, "Q": { "x": "0x112eb92dd2b3aa9cd38b08de4bef603f2f9fb0ca226030626a9a2e47ad1e9847fe0a5ed13766c339e38f514bba143b21", "y": "0x17542ce2f8d0a54f2c5ba8c4b14e10b22d5bcd7bae2af3c965c8c872b571058c720eac448276c99967ded2bf124490e1" }, "msg": "q128_qqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqq", "u": [ "0x08dccd088ca55b8bfbc96fb50bb25c592faa867a8bb78d4e94a8cc2c92306190244532e91feba2b7fed977e3c3bb5a1f" ] }, { "P": { "x": "0x0e7a16a975904f131682edbb03d9560d3e48214c9986bd50417a77108d13dc957500edf96462a3d01e62dc6cd468ef11", "y": "0x0ae89e677711d05c30a48d6d75e76ca9fb70fe06c6dd6ff988683d89ccde29ac7d46c53bb97a59b1901abf1db66052db" }, "Q": { "x": "0x1775d400a1bacc1c39c355da7e96d2d1c97baa9430c4a3476881f8521c09a01f921f592607961efc99c4cd46bd78ca19", "y": "0x1109b5d59f65964315de65a7a143e86eabc053104ed289cf480949317a5685fad7254ff8e7fe6d24d3104e5d55ad6370" }, "msg": "a512_aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "u": [ "0x0dd824886d2123a96447f6c56e3a3fa992fbfefdba17b6673f9f630ff19e4d326529db37e1c1be43f905bf9202e0278d" ] } ] } ================================================ FILE: bindings/vectors/hash_to_curve/BLS12381G1_XMD_SHA-256_SSWU_RO_.json ================================================ { "L": "0x40", "Z": "0xb", "ciphersuite": "BLS12381G1_XMD:SHA-256_SSWU_RO_", "curve": 
"BLS12-381 G1", "dst": "QUUX-V01-CS02-with-BLS12381G1_XMD:SHA-256_SSWU_RO_", "expand": "XMD", "field": { "m": "0x1", "p": "0x1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffffffffaaab" }, "hash": "sha256", "k": "0x80", "map": { "name": "SSWU" }, "randomOracle": true, "vectors": [ { "P": { "x": "0x052926add2207b76ca4fa57a8734416c8dc95e24501772c814278700eed6d1e4e8cf62d9c09db0fac349612b759e79a1", "y": "0x08ba738453bfed09cb546dbb0783dbb3a5f1f566ed67bb6be0e8c67e2e81a4cc68ee29813bb7994998f3eae0c9c6a265" }, "Q0": { "x": "0x11a3cce7e1d90975990066b2f2643b9540fa40d6137780df4e753a8054d07580db3b7f1f03396333d4a359d1fe3766fe", "y": "0x0eeaf6d794e479e270da10fdaf768db4c96b650a74518fc67b04b03927754bac66f3ac720404f339ecdcc028afa091b7" }, "Q1": { "x": "0x160003aaf1632b13396dbad518effa00fff532f604de1a7fc2082ff4cb0afa2d63b2c32da1bef2bf6c5ca62dc6b72f9c", "y": "0x0d8bb2d14e20cf9f6036152ed386d79189415b6d015a20133acb4e019139b94e9c146aaad5817f866c95d609a361735e" }, "msg": "", "u": [ "0x0ba14bd907ad64a016293ee7c2d276b8eae71f25a4b941eece7b0d89f17f75cb3ae5438a614fb61d6835ad59f29c564f", "0x019b9bd7979f12657976de2884c7cce192b82c177c80e0ec604436a7f538d231552f0d96d9f7babe5fa3b19b3ff25ac9" ] }, { "P": { "x": "0x03567bc5ef9c690c2ab2ecdf6a96ef1c139cc0b2f284dca0a9a7943388a49a3aee664ba5379a7655d3c68900be2f6903", "y": "0x0b9c15f3fe6e5cf4211f346271d7b01c8f3b28be689c8429c85b67af215533311f0b8dfaaa154fa6b88176c229f2885d" }, "Q0": { "x": "0x125435adce8e1cbd1c803e7123f45392dc6e326d292499c2c45c5865985fd74fe8f042ecdeeec5ecac80680d04317d80", "y": "0x0e8828948c989126595ee30e4f7c931cbd6f4570735624fd25aef2fa41d3f79cfb4b4ee7b7e55a8ce013af2a5ba20bf2" }, "Q1": { "x": "0x11def93719829ecda3b46aa8c31fc3ac9c34b428982b898369608e4f042babee6c77ab9218aad5c87ba785481eff8ae4", "y": "0x0007c9cef122ccf2efd233d6eb9bfc680aa276652b0661f4f820a653cec1db7ff69899f8e52b8e92b025a12c822a6ce6" }, "msg": "abc", "u": [ "0x0d921c33f2bad966478a03ca35d05719bdf92d347557ea166e5bba579eea9b83e9afa5c088573c2281410369fbd32951", "0x003574a00b109ada2f26a37a91f9d1e740dffd8d69ec0c35e1e9f4652c7dba61123e9dd2e76c655d956e2b3462611139" ] }, { "P": { "x": "0x11e0b079dea29a68f0383ee94fed1b940995272407e3bb916bbf268c263ddd57a6a27200a784cbc248e84f357ce82d98", "y": "0x03a87ae2caf14e8ee52e51fa2ed8eefe80f02457004ba4d486d6aa1f517c0889501dc7413753f9599b099ebcbbd2d709" }, "Q0": { "x": "0x08834484878c217682f6d09a4b51444802fdba3d7f2df9903a0ddadb92130ebbfa807fffa0eabf257d7b48272410afff", "y": "0x0b318f7ecf77f45a0f038e62d7098221d2dbbca2a394164e2e3fe953dc714ac2cde412d8f2d7f0c03b259e6795a2508e" }, "Q1": { "x": "0x158418ed6b27e2549f05531a8281b5822b31c3bf3144277fbb977f8d6e2694fedceb7011b3c2b192f23e2a44b2bd106e", "y": "0x1879074f344471fac5f839e2b4920789643c075792bec5af4282c73f7941cda5aa77b00085eb10e206171b9787c4169f" }, "msg": "abcdef0123456789", "u": [ "0x062d1865eb80ebfa73dcfc45db1ad4266b9f3a93219976a3790ab8d52d3e5f1e62f3b01795e36834b17b70e7b76246d4", "0x0cdc3e2f271f29c4ff75020857ce6c5d36008c9b48385ea2f2bf6f96f428a3deb798aa033cd482d1cdc8b30178b08e3a" ] }, { "P": { "x": "0x15f68eaa693b95ccb85215dc65fa81038d69629f70aeee0d0f677cf22285e7bf58d7cb86eefe8f2e9bc3f8cb84fac488", "y": "0x1807a1d50c29f430b8cafc4f8638dfeeadf51211e1602a5f184443076715f91bb90a48ba1e370edce6ae1062f5e6dd38" }, "Q0": { "x": "0x0cbd7f84ad2c99643fea7a7ac8f52d63d66cefa06d9a56148e58b984b3dd25e1f41ff47154543343949c64f88d48a710", "y": "0x052c00e4ed52d000d94881a5638ae9274d3efc8bc77bc0e5c650de04a000b2c334a9e80b85282a00f3148dfdface0865" }, "Q1": { "x": 
"0x06493fb68f0d513af08be0372f849436a787e7b701ae31cb964d968021d6ba6bd7d26a38aaa5a68e8c21a6b17dc8b579", "y": "0x02e98f2ccf5802b05ffaac7c20018bc0c0b2fd580216c4aa2275d2909dc0c92d0d0bdc979226adeb57a29933536b6bb4" }, "msg": "q128_qqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqq", "u": [ "0x010476f6a060453c0b1ad0b628f3e57c23039ee16eea5e71bb87c3b5419b1255dc0e5883322e563b84a29543823c0e86", "0x0b1a912064fb0554b180e07af7e787f1f883a0470759c03c1b6509eb8ce980d1670305ae7b928226bb58fdc0a419f46e" ] }, { "P": { "x": "0x082aabae8b7dedb0e78aeb619ad3bfd9277a2f77ba7fad20ef6aabdc6c31d19ba5a6d12283553294c1825c4b3ca2dcfe", "y": "0x05b84ae5a942248eea39e1d91030458c40153f3b654ab7872d779ad1e942856a20c438e8d99bc8abfbf74729ce1f7ac8" }, "Q0": { "x": "0x0cf97e6dbd0947857f3e578231d07b309c622ade08f2c08b32ff372bd90db19467b2563cc997d4407968d4ac80e154f8", "y": "0x127f0cddf2613058101a5701f4cb9d0861fd6c2a1b8e0afe194fccf586a3201a53874a2761a9ab6d7220c68661a35ab3" }, "Q1": { "x": "0x092f1acfa62b05f95884c6791fba989bbe58044ee6355d100973bf9553ade52b47929264e6ae770fb264582d8dce512a", "y": "0x028e6d0169a72cfedb737be45db6c401d3adfb12c58c619c82b93a5dfcccef12290de530b0480575ddc8397cda0bbebf" }, "msg": "a512_aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "u": [ "0x0a8ffa7447f6be1c5a2ea4b959c9454b431e29ccc0802bc052413a9c5b4f9aac67a93431bd480d15be1e057c8a08e8c6", "0x05d487032f602c90fa7625dbafe0f4a49ef4a6b0b33d7bb349ff4cf5410d297fd6241876e3e77b651cfc8191e40a68b7" ] } ] } ================================================ FILE: bindings/vectors/hash_to_curve/BLS12381G2_XMD_SHA-256_SSWU_NU_.json ================================================ { "L": "0x40", "Z": "0x1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffffffffaaa9,0x1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffffffffaaaa", "ciphersuite": "BLS12381G2_XMD:SHA-256_SSWU_NU_", "curve": "BLS12-381 G2", "dst": "QUUX-V01-CS02-with-BLS12381G2_XMD:SHA-256_SSWU_NU_", "expand": "XMD", "field": { "m": "0x2", "p": "0x1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffffffffaaab" }, "hash": "sha256", "k": "0x80", "map": { "name": "SSWU" }, "randomOracle": false, "vectors": [ { "P": { "x": "0x00e7f4568a82b4b7dc1f14c6aaa055edf51502319c723c4dc2688c7fe5944c213f510328082396515734b6612c4e7bb7,0x126b855e9e69b1f691f816e48ac6977664d24d99f8724868a184186469ddfd4617367e94527d4b74fc86413483afb35b", "y": "0x0caead0fd7b6176c01436833c79d305c78be307da5f6af6c133c47311def6ff1e0babf57a0fb5539fce7ee12407b0a42,0x1498aadcf7ae2b345243e281ae076df6de84455d766ab6fcdaad71fab60abb2e8b980a440043cd305db09d283c895e3d" }, "Q": { "x": "0x18ed3794ad43c781816c523776188deafba67ab773189b8f18c49bc7aa841cd81525171f7a5203b2a340579192403bef,0x0727d90785d179e7b5732c8a34b660335fed03b913710b60903cf4954b651ed3466dc3728e21855ae822d4a0f1d06587", "y": 
"0x00764a5cf6c5f61c52c838523460eb2168b5a5b43705e19cb612e006f29b717897facfd15dd1c8874c915f6d53d0342d,0x19290bb9797c12c1d275817aa2605ebe42275b66860f0e4d04487ebc2e47c50b36edd86c685a60c20a2bd584a82b011a" }, "msg": "", "u": [ "0x07355d25caf6e7f2f0cb2812ca0e513bd026ed09dda65b177500fa31714e09ea0ded3a078b526bed3307f804d4b93b04,0x02829ce3c021339ccb5caf3e187f6370e1e2a311dec9b75363117063ab2015603ff52c3d3b98f19c2f65575e99e8b78c" ] }, { "P": { "x": "0x108ed59fd9fae381abfd1d6bce2fd2fa220990f0f837fa30e0f27914ed6e1454db0d1ee957b219f61da6ff8be0d6441f,0x0296238ea82c6d4adb3c838ee3cb2346049c90b96d602d7bb1b469b905c9228be25c627bffee872def773d5b2a2eb57d", "y": "0x033f90f6057aadacae7963b0a0b379dd46750c1c94a6357c99b65f63b79e321ff50fe3053330911c56b6ceea08fee656,0x153606c417e59fb331b7ae6bce4fbf7c5190c33ce9402b5ebe2b70e44fca614f3f1382a3625ed5493843d0b0a652fc3f" }, "Q": { "x": "0x0f40e1d5025ecef0d850aa0bb7bbeceab21a3d4e85e6bee857805b09693051f5b25428c6be343edba5f14317fcc30143,0x02e0d261f2b9fee88b82804ec83db330caa75fbb12719cfa71ccce1c532dc4e1e79b0a6a281ed8d3817524286c8bc04c", "y": "0x0cf4a4adc5c66da0bca4caddc6a57ecd97c8252d7526a8ff478e0dfed816c4d321b5c3039c6683ae9b1e6a3a38c9c0ae,0x11cad1646bb3768c04be2ab2bbe1f80263b7ff6f8f9488f5bc3b6850e5a3e97e20acc583613c69cf3d2bfe8489744ebb" }, "msg": "abc", "u": [ "0x138879a9559e24cecee8697b8b4ad32cced053138ab913b99872772dc753a2967ed50aabc907937aefb2439ba06cc50c,0x0a1ae7999ea9bab1dcc9ef8887a6cb6e8f1e22566015428d220b7eec90ffa70ad1f624018a9ad11e78d588bd3617f9f2" ] }, { "P": { "x": "0x038af300ef34c7759a6caaa4e69363cafeed218a1f207e93b2c70d91a1263d375d6730bd6b6509dcac3ba5b567e85bf3,0x0da75be60fb6aa0e9e3143e40c42796edf15685cafe0279afd2a67c3dff1c82341f17effd402e4f1af240ea90f4b659b", "y": "0x19b148cbdf163cf0894f29660d2e7bfb2b68e37d54cc83fd4e6e62c020eaa48709302ef8e746736c0e19342cc1ce3df4,0x0492f4fed741b073e5a82580f7c663f9b79e036b70ab3e51162359cec4e77c78086fe879b65ca7a47d34374c8315ac5e" }, "Q": { "x": "0x13a9d4a738a85c9f917c7be36b240915434b58679980010499b9ae8d7a1bf7fbe617a15b3cd6060093f40d18e0f19456,0x16fa88754e7670366a859d6f6899ad765bf5a177abedb2740aacc9252c43f90cd0421373fbd5b2b76bb8f5c4886b5d37", "y": "0x0a7fa7d82c46797039398253e8765a4194100b330dfed6d7fbb46d6fbf01e222088779ac336e3675c7a7a0ee05bbb6e3,0x0c6ee170ab766d11fa9457cef53253f2628010b2cffc102b3b28351eb9df6c281d3cfc78e9934769d661b72a5265338d" }, "msg": "abcdef0123456789", "u": [ "0x18c16fe362b7dbdfa102e42bdfd3e2f4e6191d479437a59db4eb716986bf08ee1f42634db66bde97d6c16bbfd342b3b8,0x0e37812ce1b146d998d5f92bdd5ada2a31bfd63dfe18311aa91637b5f279dd045763166aa1615e46a50d8d8f475f184e" ] }, { "P": { "x": "0x0c5ae723be00e6c3f0efe184fdc0702b64588fe77dda152ab13099a3bacd3876767fa7bbad6d6fd90b3642e902b208f9,0x12c8c05c1d5fc7bfa847f4d7d81e294e66b9a78bc9953990c358945e1f042eedafce608b67fdd3ab0cb2e6e263b9b1ad", "y": "0x04e77ddb3ede41b5ec4396b7421dd916efc68a358a0d7425bddd253547f2fb4830522358491827265dfc5bcc1928a569,0x11c624c56dbe154d759d021eec60fab3d8b852395a89de497e48504366feedd4662d023af447d66926a28076813dd646" }, "Q": { "x": "0x0a08b2f639855dfdeaaed972702b109e2241a54de198b2b4cd12ad9f88fa419a6086a58d91fc805de812ea29bee427c2,0x04a7442e4cb8b42ef0f41dac9ee74e65ecad3ce0851f0746dc47568b0e7a8134121ed09ba054509232c49148aef62cda", "y": "0x05d60b1f04212b2c87607458f71d770f43973511c260f0540eef3a565f42c7ce59aa1cea684bb2a7bcab84acd2f36c8c,0x1017aa5747ba15505ece266a86b0ca9c712f41a254b76ca04094ca442ce45ecd224bd5544cd16685d0d1b9d156dd0531" }, "msg": 
"q128_qqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqq", "u": [ "0x08d4a0997b9d52fecf99427abb721f0fa779479963315fe21c6445250de7183e3f63bfdf86570da8929489e421d4ee95,0x16cb4ccad91ec95aab070f22043916cd6a59c4ca94097f7f510043d48515526dc8eaaea27e586f09151ae613688d5a89" ] }, { "P": { "x": "0x0ea4e7c33d43e17cc516a72f76437c4bf81d8f4eac69ac355d3bf9b71b8138d55dc10fd458be115afa798b55dac34be1,0x1565c2f625032d232f13121d3cfb476f45275c303a037faa255f9da62000c2c864ea881e2bcddd111edc4a3c0da3e88d", "y": "0x043b6f5fe4e52c839148dc66f2b3751e69a0f6ebb3d056d6465d50d4108543ecd956e10fa1640dfd9bc0030cc2558d28,0x0f8991d2a1ad662e7b6f58ab787947f1fa607fce12dde171bc17903b012091b657e15333e11701edcf5b63ba2a561247" }, "Q": { "x": "0x19592c812d5a50c5601062faba14c7d670711745311c879de1235a0a11c75aab61327bf2d1725db07ec4d6996a682886,0x0eef4fa41ddc17ed47baf447a2c498548f3c72a02381313d13bef916e240b61ce125539090d62d9fbb14a900bf1b8e90", "y": "0x1260d6e0987eae96af9ebe551e08de22b37791d53f4db9e0d59da736e66699735793e853e26362531fe4adf99c1883e3,0x0dbace5df0a4ac4ac2f45d8fdf8aee45484576fdd6efc4f98ab9b9f4112309e628255e183022d98ea5ed6e47ca00306c" }, "msg": "a512_aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "u": [ "0x03f80ce4ff0ca2f576d797a3660e3f65b274285c054feccc3215c879e2c0589d376e83ede13f93c32f05da0f68fd6a10,0x006488a837c5413746d868d1efb7232724da10eca410b07d8b505b9363bdccf0a1fc0029bad07d65b15ccfe6dd25e20d" ] } ] } ================================================ FILE: bindings/vectors/hash_to_curve/BLS12381G2_XMD_SHA-256_SSWU_RO_.json ================================================ { "L": "0x40", "Z": "0x1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffffffffaaa9,0x1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffffffffaaaa", "ciphersuite": "BLS12381G2_XMD:SHA-256_SSWU_RO_", "curve": "BLS12-381 G2", "dst": "QUUX-V01-CS02-with-BLS12381G2_XMD:SHA-256_SSWU_RO_", "expand": "XMD", "field": { "m": "0x2", "p": "0x1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffffffffaaab" }, "hash": "sha256", "k": "0x80", "map": { "name": "SSWU" }, "randomOracle": true, "vectors": [ { "P": { "x": "0x0141ebfbdca40eb85b87142e130ab689c673cf60f1a3e98d69335266f30d9b8d4ac44c1038e9dcdd5393faf5c41fb78a,0x05cb8437535e20ecffaef7752baddf98034139c38452458baeefab379ba13dff5bf5dd71b72418717047f5b0f37da03d", "y": "0x0503921d7f6a12805e72940b963c0cf3471c7b2a524950ca195d11062ee75ec076daf2d4bc358c4b190c0c98064fdd92,0x12424ac32561493f3fe3c260708a12b7c620e7be00099a974e259ddc7d1f6395c3c811cdd19f1e8dbf3e9ecfdcbab8d6" }, "Q0": { "x": "0x019ad3fc9c72425a998d7ab1ea0e646a1f6093444fc6965f1cad5a3195a7b1e099c050d57f45e3fa191cc6d75ed7458c,0x171c88b0b0efb5eb2b88913a9e74fe111a4f68867b59db252ce5868af4d1254bfab77ebde5d61cd1a86fb2fe4a5a1c1d", "y": "0x0ba10604e62bdd9eeeb4156652066167b72c8d743b050fb4c1016c31b505129374f76e03fa127d6a156213576910fef3,0x0eb22c7a543d3d376e9716a49b72e79a89c9bfe9feee8533ed931cbb5373dde1fbcd7411d8052e02693654f71e15410a" }, 
"Q1": { "x": "0x113d2b9cd4bd98aee53470b27abc658d91b47a78a51584f3d4b950677cfb8a3e99c24222c406128c91296ef6b45608be,0x13855912321c5cb793e9d1e88f6f8d342d49c0b0dbac613ee9e17e3c0b3c97dfbb5a49cc3fb45102fdbaf65e0efe2632", "y": "0x0fd3def0b7574a1d801be44fde617162aa2e89da47f464317d9bb5abc3a7071763ce74180883ad7ad9a723a9afafcdca,0x056f617902b3c0d0f78a9a8cbda43a26b65f602f8786540b9469b060db7b38417915b413ca65f875c130bebfaa59790c" }, "msg": "", "u": [ "0x03dbc2cce174e91ba93cbb08f26b917f98194a2ea08d1cce75b2b9cc9f21689d80bd79b594a613d0a68eb807dfdc1cf8,0x05a2acec64114845711a54199ea339abd125ba38253b70a92c876df10598bd1986b739cad67961eb94f7076511b3b39a", "0x02f99798e8a5acdeed60d7e18e9120521ba1f47ec090984662846bc825de191b5b7641148c0dbc237726a334473eee94,0x145a81e418d4010cc027a68f14391b30074e89e60ee7a22f87217b2f6eb0c4b94c9115b436e6fa4607e95a98de30a435" ] }, { "P": { "x": "0x02c2d18e033b960562aae3cab37a27ce00d80ccd5ba4b7fe0e7a210245129dbec7780ccc7954725f4168aff2787776e6,0x139cddbccdc5e91b9623efd38c49f81a6f83f175e80b06fc374de9eb4b41dfe4ca3a230ed250fbe3a2acf73a41177fd8", "y": "0x1787327b68159716a37440985269cf584bcb1e621d3a7202be6ea05c4cfe244aeb197642555a0645fb87bf7466b2ba48,0x00aa65dae3c8d732d10ecd2c50f8a1baf3001578f71c694e03866e9f3d49ac1e1ce70dd94a733534f106d4cec0eddd16" }, "Q0": { "x": "0x12b2e525281b5f4d2276954e84ac4f42cf4e13b6ac4228624e17760faf94ce5706d53f0ca1952f1c5ef75239aeed55ad,0x05d8a724db78e570e34100c0bc4a5fa84ad5839359b40398151f37cff5a51de945c563463c9efbdda569850ee5a53e77", "y": "0x02eacdc556d0bdb5d18d22f23dcb086dd106cad713777c7e6407943edbe0b3d1efe391eedf11e977fac55f9b94f2489c,0x04bbe48bfd5814648d0b9e30f0717b34015d45a861425fabc1ee06fdfce36384ae2c808185e693ae97dcde118f34de41" }, "Q1": { "x": "0x19f18cc5ec0c2f055e47c802acc3b0e40c337256a208001dde14b25afced146f37ea3d3ce16834c78175b3ed61f3c537,0x15b0dadc256a258b4c68ea43605dffa6d312eef215c19e6474b3e101d33b661dfee43b51abbf96fee68fc6043ac56a58", "y": "0x05e47c1781286e61c7ade887512bd9c2cb9f640d3be9cf87ea0bad24bd0ebfe946497b48a581ab6c7d4ca74b5147287f,0x19f98db2f4a1fcdf56a9ced7b320ea9deecf57c8e59236b0dc21f6ee7229aa9705ce9ac7fe7a31c72edca0d92370c096" }, "msg": "abc", "u": [ "0x15f7c0aa8f6b296ab5ff9c2c7581ade64f4ee6f1bf18f55179ff44a2cf355fa53dd2a2158c5ecb17d7c52f63e7195771,0x01c8067bf4c0ba709aa8b9abc3d1cef589a4758e09ef53732d670fd8739a7274e111ba2fcaa71b3d33df2a3a0c8529dd", "0x187111d5e088b6b9acfdfad078c4dacf72dcd17ca17c82be35e79f8c372a693f60a033b461d81b025864a0ad051a06e4,0x08b852331c96ed983e497ebc6dee9b75e373d923b729194af8e72a051ea586f3538a6ebb1e80881a082fa2b24df9f566" ] }, { "P": { "x": "0x121982811d2491fde9ba7ed31ef9ca474f0e1501297f68c298e9f4c0028add35aea8bb83d53c08cfc007c1e005723cd0,0x190d119345b94fbd15497bcba94ecf7db2cbfd1e1fe7da034d26cbba169fb3968288b3fafb265f9ebd380512a71c3f2c", "y": "0x05571a0f8d3c08d094576981f4a3b8eda0a8e771fcdcc8ecceaf1356a6acf17574518acb506e435b639353c2e14827c8,0x0bb5e7572275c567462d91807de765611490205a941a5a6af3b1691bfe596c31225d3aabdf15faff860cb4ef17c7c3be" }, "Q0": { "x": "0x0f48f1ea1318ddb713697708f7327781fb39718971d72a9245b9731faaca4dbaa7cca433d6c434a820c28b18e20ea208,0x06051467c8f85da5ba2540974758f7a1e0239a5981de441fdd87680a995649c211054869c50edbac1f3a86c561ba3162", "y": "0x168b3d6df80069dbbedb714d41b32961ad064c227355e1ce5fac8e105de5e49d77f0c64867f3834848f152497eb76333,0x134e0e8331cee8cb12f9c2d0742714ed9eee78a84d634c9a95f6a7391b37125ed48bfc6e90bf3546e99930ff67cc97bc" }, "Q1": { "x": 
"0x004fd03968cd1c99a0dd84551f44c206c84dcbdb78076c5bfee24e89a92c8508b52b88b68a92258403cbe1ea2da3495f,0x1674338ea298281b636b2eb0fe593008d03171195fd6dcd4531e8a1ed1f02a72da238a17a635de307d7d24aa2d969a47", "y": "0x0dc7fa13fff6b12558419e0a1e94bfc3cfaf67238009991c5f24ee94b632c3d09e27eca329989aee348a67b50d5e236c,0x169585e164c131103d85324f2d7747b23b91d66ae5d947c449c8194a347969fc6bbd967729768da485ba71868df8aed2" }, "msg": "abcdef0123456789", "u": [ "0x0313d9325081b415bfd4e5364efaef392ecf69b087496973b229303e1816d2080971470f7da112c4eb43053130b785e1,0x062f84cb21ed89406890c051a0e8b9cf6c575cf6e8e18ecf63ba86826b0ae02548d83b483b79e48512b82a6c0686df8f", "0x1739123845406baa7be5c5dc74492051b6d42504de008c635f3535bb831d478a341420e67dcc7b46b2e8cba5379cca97,0x01897665d9cb5db16a27657760bbea7951f67ad68f8d55f7113f24ba6ddd82caef240a9bfa627972279974894701d975" ] }, { "P": { "x": "0x19a84dd7248a1066f737cc34502ee5555bd3c19f2ecdb3c7d9e24dc65d4e25e50d83f0f77105e955d78f4762d33c17da,0x0934aba516a52d8ae479939a91998299c76d39cc0c035cd18813bec433f587e2d7a4fef038260eef0cef4d02aae3eb91", "y": "0x14f81cd421617428bc3b9fe25afbb751d934a00493524bc4e065635b0555084dd54679df1536101b2c979c0152d09192,0x09bcccfa036b4847c9950780733633f13619994394c23ff0b32fa6b795844f4a0673e20282d07bc69641cee04f5e5662" }, "Q0": { "x": "0x09eccbc53df677f0e5814e3f86e41e146422834854a224bf5a83a50e4cc0a77bfc56718e8166ad180f53526ea9194b57,0x0c3633943f91daee715277bd644fba585168a72f96ded64fc5a384cce4ec884a4c3c30f08e09cd2129335dc8f67840ec", "y": "0x0eb6186a0457d5b12d132902d4468bfeb7315d83320b6c32f1c875f344efcba979952b4aa418589cb01af712f98cc555,0x119e3cf167e69eb16c1c7830e8df88856d48be12e3ff0a40791a5cd2f7221311d4bf13b1847f371f467357b3f3c0b4c7" }, "Q1": { "x": "0x0eb3aabc1ddfce17ff18455fcc7167d15ce6b60ddc9eb9b59f8d40ab49420d35558686293d046fc1e42f864b7f60e381,0x198bdfb19d7441ebcca61e8ff774b29d17da16547d2c10c273227a635cacea3f16826322ae85717630f0867539b5ed8b", "y": "0x0aaf1dee3adf3ed4c80e481c09b57ea4c705e1b8d25b897f0ceeec3990748716575f92abff22a1c8f4582aff7b872d52,0x0d058d9061ed27d4259848a06c96c5ca68921a5d269b078650c882cb3c2bd424a8702b7a6ee4e0ead9982baf6843e924" }, "msg": "q128_qqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqq", "u": [ "0x025820cefc7d06fd38de7d8e370e0da8a52498be9b53cba9927b2ef5c6de1e12e12f188bbc7bc923864883c57e49e253,0x034147b77ce337a52e5948f66db0bab47a8d038e712123bb381899b6ab5ad20f02805601e6104c29df18c254b8618c7b", "0x0930315cae1f9a6017c3f0c8f2314baa130e1cf13f6532bff0a8a1790cd70af918088c3db94bda214e896e1543629795,0x10c4df2cacf67ea3cb3108b00d4cbd0b3968031ebc8eac4b1ebcefe84d6b715fde66bef0219951ece29d1facc8a520ef" ] }, { "P": { "x": "0x01a6ba2f9a11fa5598b2d8ace0fbe0a0eacb65deceb476fbbcb64fd24557c2f4b18ecfc5663e54ae16a84f5ab7f62534,0x11fca2ff525572795a801eed17eb12785887c7b63fb77a42be46ce4a34131d71f7a73e95fee3f812aea3de78b4d01569", "y": "0x0b6798718c8aed24bc19cb27f866f1c9effcdbf92397ad6448b5c9db90d2b9da6cbabf48adc1adf59a1a28344e79d57e,0x03a47f8e6d1763ba0cad63d6114c0accbef65707825a511b251a660a9b3994249ae4e63fac38b23da0c398689ee2ab52" }, "Q0": { "x": "0x17cadf8d04a1a170f8347d42856526a24cc466cb2ddfd506cff01191666b7f944e31244d662c904de5440516a2b09004,0x0d13ba91f2a8b0051cf3279ea0ee63a9f19bc9cb8bfcc7d78b3cbd8cc4fc43ba726774b28038213acf2b0095391c523e", "y": "0x17ef19497d6d9246fa94d35575c0f8d06ee02f21a284dbeaa78768cb1e25abd564e3381de87bda26acd04f41181610c5,0x12c3c913ba4ed03c24f0721a81a6be7430f2971ffca8fd1729aafe496bb725807531b44b34b59b3ae5495e5a2dcbd5c8" }, "Q1": { "x": 
"0x16ec57b7fe04c71dfe34fb5ad84dbce5a2dbbd6ee085f1d8cd17f45e8868976fc3c51ad9eeda682c7869024d24579bfd,0x13103f7aace1ae1420d208a537f7d3a9679c287208026e4e3439ab8cd534c12856284d95e27f5e1f33eec2ce656533b0", "y": "0x0958b2c4c2c10fcef5a6c59b9e92c4a67b0fae3e2e0f1b6b5edad9c940b8f3524ba9ebbc3f2ceb3cfe377655b3163bd7,0x0ccb594ed8bd14ca64ed9cb4e0aba221be540f25dd0d6ba15a4a4be5d67bcf35df7853b2d8dad3ba245f1ea3697f66aa" }, "msg": "a512_aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "u": [ "0x190b513da3e66fc9a3587b78c76d1d132b1152174d0b83e3c1114066392579a45824c5fa17649ab89299ddd4bda54935,0x12ab625b0fe0ebd1367fe9fac57bb1168891846039b4216b9d94007b674de2d79126870e88aeef54b2ec717a887dcf39", "0x0e6a42010cf435fb5bacc156a585e1ea3294cc81d0ceb81924d95040298380b164f702275892cedd81b62de3aba3f6b5,0x117d9a0defc57a33ed208428cb84e54c85a6840e7648480ae428838989d25d97a0af8e3255be62b25c2a85630d2dddd8" ] } ] } ================================================ FILE: bindings/vectors/hash_to_curve/README ================================================ These files are downloaded from https://github.com/cfrg/draft-irtf-cfrg-hash-to-curve/tree/master/poc/vectors, commit 6d40f98. Note the file names cannot have ":" in them as this is incompatible with Windows. ================================================ FILE: bindings/vectors/hash_to_curve/expand_message_xmd_SHA256_256.json ================================================ { "DST": "QUUX-V01-CS02-with-expander-SHA256-128-long-DST-1111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111", "hash": "SHA256", "k": 128, "name": "expand_message_xmd", "tests": [ { "DST_prime": "412717974da474d0f8c420f320ff81e8432adb7c927d9bd082b4fb4d16c0a23620", "len_in_bytes": "0x20", "msg": "", "msg_prime": "00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000002000412717974da474d0f8c420f320ff81e8432adb7c927d9bd082b4fb4d16c0a23620", "uniform_bytes": "e8dc0c8b686b7ef2074086fbdd2f30e3f8bfbd3bdf177f73f04b97ce618a3ed3" }, { "DST_prime": "412717974da474d0f8c420f320ff81e8432adb7c927d9bd082b4fb4d16c0a23620", "len_in_bytes": "0x20", "msg": "abc", "msg_prime": "00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000616263002000412717974da474d0f8c420f320ff81e8432adb7c927d9bd082b4fb4d16c0a23620", "uniform_bytes": "52dbf4f36cf560fca57dedec2ad924ee9c266341d8f3d6afe5171733b16bbb12" }, { "DST_prime": "412717974da474d0f8c420f320ff81e8432adb7c927d9bd082b4fb4d16c0a23620", "len_in_bytes": "0x20", "msg": "abcdef0123456789", "msg_prime": "0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000061626364656630313233343536373839002000412717974da474d0f8c420f320ff81e8432adb7c927d9bd082b4fb4d16c0a23620", "uniform_bytes": "35387dcf22618f3728e6c686490f8b431f76550b0b2c61cbc1ce7001536f4521" }, { 
"DST_prime": "412717974da474d0f8c420f320ff81e8432adb7c927d9bd082b4fb4d16c0a23620", "len_in_bytes": "0x20", "msg": "q128_qqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqq", "msg_prime": "00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000713132385f7171717171717171717171717171717171717171717171717171717171717171717171717171717171717171717171717171717171717171717171717171717171717171717171717171717171717171717171717171717171717171717171717171717171717171717171717171717171717171717171717171717171717171002000412717974da474d0f8c420f320ff81e8432adb7c927d9bd082b4fb4d16c0a23620", "uniform_bytes": "01b637612bb18e840028be900a833a74414140dde0c4754c198532c3a0ba42bc" }, { "DST_prime": "412717974da474d0f8c420f320ff81e8432adb7c927d9bd082b4fb4d16c0a23620", "len_in_bytes": "0x20", "msg": "a512_aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "msg_prime": "00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000613531325f6161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161002000412717974da474d0f8c420f320ff81e8432adb7c927d9bd082b4fb4d16c0a23620", "uniform_bytes": "20cce7033cabc5460743180be6fa8aac5a103f56d481cf369a8accc0c374431b" }, { "DST_prime": "412717974da474d0f8c420f320ff81e8432adb7c927d9bd082b4fb4d16c0a23620", "len_in_bytes": "0x80", "msg": "", "msg_prime": "00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000008000412717974da474d0f8c420f320ff81e8432adb7c927d9bd082b4fb4d16c0a23620", "uniform_bytes": "14604d85432c68b757e485c8894db3117992fc57e0e136f71ad987f789a0abc287c47876978e2388a02af86b1e8d1342e5ce4f7aaa07a87321e691f6fba7e0072eecc1218aebb89fb14a0662322d5edbd873f0eb35260145cd4e64f748c5dfe60567e126604bcab1a3ee2dc0778102ae8a5cfd1429ebc0fa6bf1a53c36f55dfc" }, { "DST_prime": "412717974da474d0f8c420f320ff81e8432adb7c927d9bd082b4fb4d16c0a23620", "len_in_bytes": "0x80", "msg": "abc", "msg_prime": 
"00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000616263008000412717974da474d0f8c420f320ff81e8432adb7c927d9bd082b4fb4d16c0a23620", "uniform_bytes": "1a30a5e36fbdb87077552b9d18b9f0aee16e80181d5b951d0471d55b66684914aef87dbb3626eaabf5ded8cd0686567e503853e5c84c259ba0efc37f71c839da2129fe81afdaec7fbdc0ccd4c794727a17c0d20ff0ea55e1389d6982d1241cb8d165762dbc39fb0cee4474d2cbbd468a835ae5b2f20e4f959f56ab24cd6fe267" }, { "DST_prime": "412717974da474d0f8c420f320ff81e8432adb7c927d9bd082b4fb4d16c0a23620", "len_in_bytes": "0x80", "msg": "abcdef0123456789", "msg_prime": "0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000061626364656630313233343536373839008000412717974da474d0f8c420f320ff81e8432adb7c927d9bd082b4fb4d16c0a23620", "uniform_bytes": "d2ecef3635d2397f34a9f86438d772db19ffe9924e28a1caf6f1c8f15603d4028f40891044e5c7e39ebb9b31339979ff33a4249206f67d4a1e7c765410bcd249ad78d407e303675918f20f26ce6d7027ed3774512ef5b00d816e51bfcc96c3539601fa48ef1c07e494bdc37054ba96ecb9dbd666417e3de289d4f424f502a982" }, { "DST_prime": "412717974da474d0f8c420f320ff81e8432adb7c927d9bd082b4fb4d16c0a23620", "len_in_bytes": "0x80", "msg": "q128_qqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqq", "msg_prime": "00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000713132385f7171717171717171717171717171717171717171717171717171717171717171717171717171717171717171717171717171717171717171717171717171717171717171717171717171717171717171717171717171717171717171717171717171717171717171717171717171717171717171717171717171717171717171008000412717974da474d0f8c420f320ff81e8432adb7c927d9bd082b4fb4d16c0a23620", "uniform_bytes": "ed6e8c036df90111410431431a232d41a32c86e296c05d426e5f44e75b9a50d335b2412bc6c91e0a6dc131de09c43110d9180d0a70f0d6289cb4e43b05f7ee5e9b3f42a1fad0f31bac6a625b3b5c50e3a83316783b649e5ecc9d3b1d9471cb5024b7ccf40d41d1751a04ca0356548bc6e703fca02ab521b505e8e45600508d32" }, { "DST_prime": "412717974da474d0f8c420f320ff81e8432adb7c927d9bd082b4fb4d16c0a23620", "len_in_bytes": "0x80", "msg": "a512_aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "msg_prime": 
"00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000613531325f6161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161008000412717974da474d0f8c420f320ff81e8432adb7c927d9bd082b4fb4d16c0a23620", "uniform_bytes": "78b53f2413f3c688f07732c10e5ced29a17c6a16f717179ffbe38d92d6c9ec296502eb9889af83a1928cd162e845b0d3c5424e83280fed3d10cffb2f8431f14e7a23f4c68819d40617589e4c41169d0b56e0e3535be1fd71fbb08bb70c5b5ffed953d6c14bf7618b35fc1f4c4b30538236b4b08c9fbf90462447a8ada60be495" } ] } ================================================ FILE: bindings/vectors/hash_to_curve/expand_message_xmd_SHA256_38.json ================================================ { "DST": "QUUX-V01-CS02-with-expander-SHA256-128", "hash": "SHA256", "k": 128, "name": "expand_message_xmd", "tests": [ { "DST_prime": "515555582d5630312d435330322d776974682d657870616e6465722d5348413235362d31323826", "len_in_bytes": "0x20", "msg": "", "msg_prime": "00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000002000515555582d5630312d435330322d776974682d657870616e6465722d5348413235362d31323826", "uniform_bytes": "68a985b87eb6b46952128911f2a4412bbc302a9d759667f87f7a21d803f07235" }, { "DST_prime": "515555582d5630312d435330322d776974682d657870616e6465722d5348413235362d31323826", "len_in_bytes": "0x20", "msg": "abc", "msg_prime": "00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000616263002000515555582d5630312d435330322d776974682d657870616e6465722d5348413235362d31323826", "uniform_bytes": "d8ccab23b5985ccea865c6c97b6e5b8350e794e603b4b97902f53a8a0d605615" }, { "DST_prime": "515555582d5630312d435330322d776974682d657870616e6465722d5348413235362d31323826", "len_in_bytes": "0x20", "msg": "abcdef0123456789", "msg_prime": "0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000061626364656630313233343536373839002000515555582d5630312d435330322d776974682d657870616e6465722d5348413235362d31323826", "uniform_bytes": "eff31487c770a893cfb36f912fbfcbff40d5661771ca4b2cb4eafe524333f5c1" }, { "DST_prime": "515555582d5630312d435330322d776974682d657870616e6465722d5348413235362d31323826", "len_in_bytes": "0x20", "msg": "q128_qqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqq", "msg_prime": 
"00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000713132385f7171717171717171717171717171717171717171717171717171717171717171717171717171717171717171717171717171717171717171717171717171717171717171717171717171717171717171717171717171717171717171717171717171717171717171717171717171717171717171717171717171717171717171002000515555582d5630312d435330322d776974682d657870616e6465722d5348413235362d31323826", "uniform_bytes": "b23a1d2b4d97b2ef7785562a7e8bac7eed54ed6e97e29aa51bfe3f12ddad1ff9" }, { "DST_prime": "515555582d5630312d435330322d776974682d657870616e6465722d5348413235362d31323826", "len_in_bytes": "0x20", "msg": "a512_aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "msg_prime": "00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000613531325f6161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161002000515555582d5630312d435330322d776974682d657870616e6465722d5348413235362d31323826", "uniform_bytes": "4623227bcc01293b8c130bf771da8c298dede7383243dc0993d2d94823958c4c" }, { "DST_prime": "515555582d5630312d435330322d776974682d657870616e6465722d5348413235362d31323826", "len_in_bytes": "0x80", "msg": "", "msg_prime": "00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000008000515555582d5630312d435330322d776974682d657870616e6465722d5348413235362d31323826", "uniform_bytes": "af84c27ccfd45d41914fdff5df25293e221afc53d8ad2ac06d5e3e29485dadbee0d121587713a3e0dd4d5e69e93eb7cd4f5df4cd103e188cf60cb02edc3edf18eda8576c412b18ffb658e3dd6ec849469b979d444cf7b26911a08e63cf31f9dcc541708d3491184472c2c29bb749d4286b004ceb5ee6b9a7fa5b646c993f0ced" }, { "DST_prime": "515555582d5630312d435330322d776974682d657870616e6465722d5348413235362d31323826", "len_in_bytes": "0x80", "msg": "abc", "msg_prime": 
"00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000616263008000515555582d5630312d435330322d776974682d657870616e6465722d5348413235362d31323826", "uniform_bytes": "abba86a6129e366fc877aab32fc4ffc70120d8996c88aee2fe4b32d6c7b6437a647e6c3163d40b76a73cf6a5674ef1d890f95b664ee0afa5359a5c4e07985635bbecbac65d747d3d2da7ec2b8221b17b0ca9dc8a1ac1c07ea6a1e60583e2cb00058e77b7b72a298425cd1b941ad4ec65e8afc50303a22c0f99b0509b4c895f40" }, { "DST_prime": "515555582d5630312d435330322d776974682d657870616e6465722d5348413235362d31323826", "len_in_bytes": "0x80", "msg": "abcdef0123456789", "msg_prime": "0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000061626364656630313233343536373839008000515555582d5630312d435330322d776974682d657870616e6465722d5348413235362d31323826", "uniform_bytes": "ef904a29bffc4cf9ee82832451c946ac3c8f8058ae97d8d629831a74c6572bd9ebd0df635cd1f208e2038e760c4994984ce73f0d55ea9f22af83ba4734569d4bc95e18350f740c07eef653cbb9f87910d833751825f0ebefa1abe5420bb52be14cf489b37fe1a72f7de2d10be453b2c9d9eb20c7e3f6edc5a60629178d9478df" }, { "DST_prime": "515555582d5630312d435330322d776974682d657870616e6465722d5348413235362d31323826", "len_in_bytes": "0x80", "msg": "q128_qqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqq", "msg_prime": "00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000713132385f7171717171717171717171717171717171717171717171717171717171717171717171717171717171717171717171717171717171717171717171717171717171717171717171717171717171717171717171717171717171717171717171717171717171717171717171717171717171717171717171717171717171717171008000515555582d5630312d435330322d776974682d657870616e6465722d5348413235362d31323826", "uniform_bytes": "80be107d0884f0d881bb460322f0443d38bd222db8bd0b0a5312a6fedb49c1bbd88fd75d8b9a09486c60123dfa1d73c1cc3169761b17476d3c6b7cbbd727acd0e2c942f4dd96ae3da5de368d26b32286e32de7e5a8cb2949f866a0b80c58116b29fa7fabb3ea7d520ee603e0c25bcaf0b9a5e92ec6a1fe4e0391d1cdbce8c68a" }, { "DST_prime": "515555582d5630312d435330322d776974682d657870616e6465722d5348413235362d31323826", "len_in_bytes": "0x80", "msg": "a512_aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "msg_prime": 
"00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000613531325f6161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161008000515555582d5630312d435330322d776974682d657870616e6465722d5348413235362d31323826", "uniform_bytes": "546aff5444b5b79aa6148bd81728704c32decb73a3ba76e9e75885cad9def1d06d6792f8a7d12794e90efed817d96920d728896a4510864370c207f99bd4a608ea121700ef01ed879745ee3e4ceef777eda6d9e5e38b90c86ea6fb0b36504ba4a45d22e86f6db5dd43d98a294bebb9125d5b794e9d2a81181066eb954966a487" } ] } ================================================ FILE: bindings/zig/README.md ================================================ # blst for [Zig](https://ziglang.org/) The object-oriented interface is modeled after [C++ interface](../blst.hpp), but at the time of writing is a subset of it, sufficient to produce and verify individual and aggregated signatures. See [tests.zig](tests.zig) for an example. C symbols are available with `blst.c.` prefix instead of `blst_`, e.g. `blst_miller_loop` is accessible as `blst.c.miller_loop`. ## Adding dependency to your project Execute ``` zig fetch --save git+https://github.com/supranational/blst ``` and add an equivalent of the following line to your build.zig prior to `b.installArtifact(exe)`: ``` exe.root_module.addImport("blst", b.dependency("blst", .{}).module("blst")); ``` You should now be able to `@import("blst")` in your application code. The abovementioned fetch command can be used to update the git reference. ================================================ FILE: bindings/zig/blst.zig ================================================ //!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! // DO NOT EDIT THIS FILE!!! // The file is auto-generated by generate.py //!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! 
// Copyright Supranational LLC // SPDX-License-Identifier: Apache-2.0 const std = @import("std"); pub const c = @import("c.zig"); pub const Error = error{ BAD_ENCODING, POINT_NOT_ON_CURVE, POINT_NOT_IN_GROUP, AGGR_TYPE_MISMATCH, VERIFY_FAIL, PK_IS_INFINITY, BAD_SCALAR, Unknown, }; pub const ERROR = enum(c.ERROR) { SUCCESS = c.SUCCESS, BAD_ENCODING = c.BAD_ENCODING, POINT_NOT_ON_CURVE = c.POINT_NOT_ON_CURVE, POINT_NOT_IN_GROUP = c.POINT_NOT_IN_GROUP, AGGR_TYPE_MISMATCH = c.AGGR_TYPE_MISMATCH, VERIFY_FAIL = c.VERIFY_FAIL, PK_IS_INFINITY = c.PK_IS_INFINITY, BAD_SCALAR = c.BAD_SCALAR, pub fn as_error(self: ERROR) Error { return switch (self) { .BAD_ENCODING => Error.BAD_ENCODING, .POINT_NOT_ON_CURVE => Error.POINT_NOT_ON_CURVE, .POINT_NOT_IN_GROUP => Error.POINT_NOT_IN_GROUP, .AGGR_TYPE_MISMATCH => Error.AGGR_TYPE_MISMATCH, .VERIFY_FAIL => Error.VERIFY_FAIL, .PK_IS_INFINITY => Error.PK_IS_INFINITY, .BAD_SCALAR => Error.BAD_SCALAR, else => Error.Unknown, }; } }; pub const SecretKey = struct { key: c.scalar = c.scalar{}, pub fn keygen(self: *SecretKey, IKM: []const u8, info: ?[]const u8) void { const opt = info orelse &[_]u8{}; c.keygen(&self.key, @ptrCast(IKM), IKM.len, @ptrCast(opt), opt.len); } pub fn deinit(self: *SecretKey) void { self.key = c.scalar{}; } }; pub const PT = c.fp12; pub const Pairing = struct { ctx: []u64 = &[_]u64{}, allocator: std.mem.Allocator, pub fn init(hash_or_encode: bool, DST: []const u8, allocator: std.mem.Allocator) !Pairing { const nlimbs = (c.pairing_sizeof() + @sizeOf(u64) - 1) / @sizeOf(u64); const buffer = try allocator.alloc(u64, nlimbs); c.pairing_init(@ptrCast(buffer), hash_or_encode, &DST[0], DST.len); return Pairing{ .ctx = buffer, .allocator = allocator, }; } pub fn deinit(self: *Pairing) void { self.allocator.free(self.ctx); self.ctx = &[_]u64{}; } pub fn aggregate(self: *Pairing, pk: anytype, sig: anytype, msg: []const u8, aug: ?[]const u8) ERROR { const opt = aug orelse &[_]u8{}; var err: c.ERROR = undefined; switch (@TypeOf(pk)) { *const P1_Affine, *P1_Affine => { const sigp: [*c]const c.p2_affine = switch (@TypeOf(sig)) { @TypeOf(null) => null, else => &sig.point, }; err = c.pairing_aggregate_pk_in_g1(@ptrCast(self.ctx), &pk.point, sigp, @ptrCast(msg), msg.len, @ptrCast(opt), opt.len); }, *const P2_Affine, *P2_Affine => { const sigp: [*c]const c.p1_affine = switch (@TypeOf(sig)) { @TypeOf(null) => null, else => &sig.point, }; err = c.pairing_aggregate_pk_in_g2(@ptrCast(self.ctx), &pk.point, sigp, @ptrCast(msg), msg.len, @ptrCast(opt), opt.len); }, else => |T| @compileError("expected type '*const blst.P1_Affine' " ++ "or '*const blst.P2_Affine', found '" ++ @typeName(T) ++ "'"), } return @as(ERROR, @enumFromInt(err)); } pub fn commit(self: *Pairing) void { c.pairing_commit(@ptrCast(self.ctx)); } pub fn merge(self: *Pairing, second: *const Pairing) ERROR { return c.pairing_merge(@ptrCast(self.ctx), @ptrCast(second.ctx)); } pub fn finalverify(self: *Pairing, optional: ?*const PT) bool { return c.pairing_finalverify(@ptrCast(self.ctx), optional); } pub fn raw_aggregate(self: *Pairing, q: *const P2_Affine, p: *const P1_Affine) void { c.pairing_raw_aggregate(@ptrCast(self.ctx), &q.point, &p.point); } pub fn as_fp12(self: *Pairing) *const PT { return c.pairing_as_fp12(@ptrCast(self.ctx)); } }; pub const Uniq = struct { tree: []u64 = &[_]u64{}, allocator: std.mem.Allocator, pub fn init(n: usize, allocator: std.mem.Allocator) !Uniq { const nlimbs = (c.uniq_sizeof(n) + @sizeOf(u64) - 1) / @sizeOf(u64); const buffer = try allocator.alloc(u64, nlimbs); 
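        // the buffer sized by c.uniq_sizeof(n) above backs the C-side
        // message-tracking structure; c.uniq_init lays it out in place
        // before any is_uniq queries are made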
c.uniq_init(@ptrCast(buffer)); return Uniq{ .tree = buffer, .allocator = allocator, }; } pub fn deinit(self: *Uniq) void { self.allocator.free(self.tree); self.tree = &[_]u64{}; } pub fn is_uniq(self: *Uniq, msg: []const u8) bool { return c.uniq_test(@ptrCast(self.tree), @ptrCast(msg), msg.len); } }; const FP_BYTES = 384/8; pub const P1_COMPRESS_BYTES = FP_BYTES; pub const P1_SERIALIZE_BYTES = FP_BYTES*2; pub const P2_COMPRESS_BYTES = FP_BYTES*2; pub const P2_SERIALIZE_BYTES = FP_BYTES*4; pub const P1_Affine = struct { point: c.p1_affine = c.p1_affine{}, pub fn from(in: anytype) !P1_Affine { switch (@TypeOf(in)) { *const P1, *P1 => return in.to_affine(), P1 => @compileError("expected type '*const blst.P1', found 'blst.P1'"), else => |T| { switch (@typeInfo(T)) { .pointer => { const s: []const u8 = in; _ = s; }, else => @compileError("expected type '[]const u8', found '" ++ @typeName(T) ++ "'"), } var ret: P1_Affine = undefined; const err = ret.deserialize(in); return if (err == .SUCCESS) ret else err.as_error(); }, } unreachable; } pub fn deserialize(self: *P1_Affine, in: []const u8) ERROR { if (in.len == 0) { return .BAD_ENCODING; } const expected = @as(usize, if (in[0]&0x80 != 0) P1_COMPRESS_BYTES else P1_SERIALIZE_BYTES); if (in.len != expected) { return .BAD_ENCODING; } const err = c.p1_deserialize(&self.point, &in[0]); return @as(ERROR, @enumFromInt(err)); } pub fn serialize(self: *const P1_Affine) [P1_SERIALIZE_BYTES]u8 { var ret: [P1_SERIALIZE_BYTES]u8 = undefined; c.p1_affine_serialize(&ret[0], &self.point); return ret; } pub fn compress(self: *const P1_Affine) [P1_COMPRESS_BYTES]u8 { var ret: [P1_COMPRESS_BYTES]u8 = undefined; c.p1_affine_compress(&ret[0], &self.point); return ret; } pub fn dup(self: *const P1_Affine) P1_Affine { return self.*; } pub fn on_curve(self: *const P1_Affine) bool { return c.p1_affine_on_curve(&self.point); } pub fn in_group(self: *const P1_Affine) bool { return c.p1_affine_in_g1(&self.point); } pub fn is_inf(self: *const P1_Affine) bool { return c.p1_affine_is_inf(&self.point); } pub fn is_equal(self: *const P1_Affine, p: *const P1_Affine) bool { return c.p1_affine_is_equal(&self.point, &p.point); } pub fn core_verify(self: *const P1_Affine, pk: *const P2_Affine, hash_or_encode: bool, msg: []const u8, DST: []const u8, aug: ?[]const u8) ERROR { const opt = aug orelse &[_]u8{}; const err = c.core_verify_pk_in_g2(&pk.point, &self.point, hash_or_encode, @ptrCast(msg), msg.len, @ptrCast(DST), DST.len, @ptrCast(opt), opt.len); return @as(ERROR, @enumFromInt(err)); } pub fn generator() P1_Affine { return P1_Affine{ .point = c.p1_affine_generator().*, }; } pub fn to_jacobian(self: *const P1_Affine) P1 { var ret: P1 = undefined; c.p1_from_affine(&ret.point, &self.point); return ret; } }; pub const P1 = struct { point: c.p1 = c.p1{}, pub fn from(in: anytype) !P1 { switch (@TypeOf(in)) { *const SecretKey, *SecretKey => return P1.public_key(in), SecretKey => @compileError("expected type '*const blst.SecretKey', found 'blst.SecretKey'"), *const P1_Affine, *P1_Affine => return in.to_jacobian(), P1_Affine => @compileError("expected type '*const blst.P1_Affine', found 'blst.P1_Affine'"), else => |T| { switch (@typeInfo(T)) { .pointer => { const s: []const u8 = in; _ = s; }, else => @compileError("expected type '[]const u8', found '" ++ @typeName(T) ++ "'"), } var ret: P1 = undefined; const err = ret.deserialize(in); return if (err == .SUCCESS) ret else err.as_error(); }, } unreachable; } pub fn deserialize(self: *P1, in: []const u8) ERROR { if (in.len == 0) { return 
.BAD_ENCODING; } const expected = @as(usize, if (in[0]&0x80 != 0) P1_COMPRESS_BYTES else P1_SERIALIZE_BYTES); if (in.len != expected) { return .BAD_ENCODING; } const err = c.p1_deserialize(@ptrCast(&self.point), &in[0]); if (err == c.SUCCESS) { c.p1_from_affine(&self.point, @ptrCast(&self.point)); } return @as(ERROR, @enumFromInt(err)); } pub fn serialize(self: *const P1) [P1_SERIALIZE_BYTES]u8 { var ret: [P1_SERIALIZE_BYTES]u8 = undefined; c.p1_serialize(&ret[0], &self.point); return ret; } pub fn compress(self: *const P1) [P1_COMPRESS_BYTES]u8 { var ret: [P1_COMPRESS_BYTES]u8 = undefined; c.p1_compress(&ret[0], &self.point); return ret; } pub fn public_key(sk: *const SecretKey) P1 { var ret: P1 = undefined; c.sk_to_pk_in_g1(&ret.point, &sk.key); return ret; } pub fn dup(self: *const P1) P1 { return self.*; } pub fn on_curve(self: *const P1) bool { return c.p1_on_curve(&self.point); } pub fn in_group(self: *const P1) bool { return c.p1_in_g1(&self.point); } pub fn is_inf(self: *const P1) bool { return c.p1_is_inf(&self.point); } pub fn is_equal(self: *const P1, p: *const P1) bool { return c.p1_is_equal(&self.point, &p.point); } pub fn aggregate(self: *P1, p: *const P1_Affine) !void { if (!c.p1_affine_in_g1(&p.point)) { return Error.POINT_NOT_IN_GROUP; } c.p1_add_or_double_affine(&self.point, &self.point, &p.point); } pub fn hash_to(msg: []const u8, DST: []const u8, aug: ?[]const u8) P1 { const opt = aug orelse &[_]u8{}; var ret: P1 = undefined; c.hash_to_g1(&ret.point, @ptrCast(msg), msg.len, @ptrCast(DST), DST.len, @ptrCast(opt), opt.len); return ret; } pub fn encode_to(msg: []const u8, DST: []const u8, aug: ?[]const u8) P1 { const opt = aug orelse &[_]u8{}; var ret: P1 = undefined; c.encode_to_g1(&ret.point, @ptrCast(msg), msg.len, @ptrCast(DST), DST.len, @ptrCast(opt), opt.len); return ret; } pub fn sign_with(self: *const P1, sk: *const SecretKey) *P1 { c.sign_pk_in_g2(@constCast(&self.point), &self.point, &sk.key); return @constCast(self); } pub fn to_affine(self: *const P1) P1_Affine { var ret: P1_Affine = undefined; c.p1_to_affine(&ret.point, &self.point); return ret; } pub fn generator() P1 { return P1{ .point = c.p1_generator().*, }; } }; pub const P2_Affine = struct { point: c.p2_affine = c.p2_affine{}, pub fn from(in: anytype) !P2_Affine { switch (@TypeOf(in)) { *const P2, *P2 => return in.to_affine(), P2 => @compileError("expected type '*const blst.P2', found 'blst.P2'"), else => |T| { switch (@typeInfo(T)) { .pointer => { const s: []const u8 = in; _ = s; }, else => @compileError("expected type '[]const u8', found '" ++ @typeName(T) ++ "'"), } var ret: P2_Affine = undefined; const err = ret.deserialize(in); return if (err == .SUCCESS) ret else err.as_error(); }, } unreachable; } pub fn deserialize(self: *P2_Affine, in: []const u8) ERROR { if (in.len == 0) { return .BAD_ENCODING; } const expected = @as(usize, if (in[0]&0x80 != 0) P2_COMPRESS_BYTES else P2_SERIALIZE_BYTES); if (in.len != expected) { return .BAD_ENCODING; } const err = c.p2_deserialize(&self.point, &in[0]); return @as(ERROR, @enumFromInt(err)); } pub fn serialize(self: *const P2_Affine) [P2_SERIALIZE_BYTES]u8 { var ret: [P2_SERIALIZE_BYTES]u8 = undefined; c.p2_affine_serialize(&ret[0], &self.point); return ret; } pub fn compress(self: *const P2_Affine) [P2_COMPRESS_BYTES]u8 { var ret: [P2_COMPRESS_BYTES]u8 = undefined; c.p2_affine_compress(&ret[0], &self.point); return ret; } pub fn dup(self: *const P2_Affine) P2_Affine { return self.*; } pub fn on_curve(self: *const P2_Affine) bool { return 
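        // (curve-equation check only; in_group() below additionally verifies
        // membership in the prime-order subgroup, which is the check that
        // matters when consuming untrusted points)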
c.p2_affine_on_curve(&self.point); } pub fn in_group(self: *const P2_Affine) bool { return c.p2_affine_in_g2(&self.point); } pub fn is_inf(self: *const P2_Affine) bool { return c.p2_affine_is_inf(&self.point); } pub fn is_equal(self: *const P2_Affine, p: *const P2_Affine) bool { return c.p2_affine_is_equal(&self.point, &p.point); } pub fn core_verify(self: *const P2_Affine, pk: *const P1_Affine, hash_or_encode: bool, msg: []const u8, DST: []const u8, aug: ?[]const u8) ERROR { const opt = aug orelse &[_]u8{}; const err = c.core_verify_pk_in_g1(&pk.point, &self.point, hash_or_encode, @ptrCast(msg), msg.len, @ptrCast(DST), DST.len, @ptrCast(opt), opt.len); return @as(ERROR, @enumFromInt(err)); } pub fn generator() P2_Affine { return P2_Affine{ .point = c.p2_affine_generator().*, }; } pub fn to_jacobian(self: *const P2_Affine) P2 { var ret: P2 = undefined; c.p2_from_affine(&ret.point, &self.point); return ret; } }; pub const P2 = struct { point: c.p2 = c.p2{}, pub fn from(in: anytype) !P2 { switch (@TypeOf(in)) { *const SecretKey, *SecretKey => return P2.public_key(in), SecretKey => @compileError("expected type '*const blst.SecretKey', found 'blst.SecretKey'"), *const P2_Affine, *P2_Affine => return in.to_jacobian(), P2_Affine => @compileError("expected type '*const blst.P2_Affine', found 'blst.P2_Affine'"), else => |T| { switch (@typeInfo(T)) { .pointer => { const s: []const u8 = in; _ = s; }, else => @compileError("expected type '[]const u8', found '" ++ @typeName(T) ++ "'"), } var ret: P2 = undefined; const err = ret.deserialize(in); return if (err == .SUCCESS) ret else err.as_error(); }, } unreachable; } pub fn deserialize(self: *P2, in: []const u8) ERROR { if (in.len == 0) { return .BAD_ENCODING; } const expected = @as(usize, if (in[0]&0x80 != 0) P2_COMPRESS_BYTES else P2_SERIALIZE_BYTES); if (in.len != expected) { return .BAD_ENCODING; } const err = c.p2_deserialize(@ptrCast(&self.point), &in[0]); if (err == c.SUCCESS) { c.p2_from_affine(&self.point, @ptrCast(&self.point)); } return @as(ERROR, @enumFromInt(err)); } pub fn serialize(self: *const P2) [P2_SERIALIZE_BYTES]u8 { var ret: [P2_SERIALIZE_BYTES]u8 = undefined; c.p2_serialize(&ret[0], &self.point); return ret; } pub fn compress(self: *const P2) [P2_COMPRESS_BYTES]u8 { var ret: [P2_COMPRESS_BYTES]u8 = undefined; c.p2_compress(&ret[0], &self.point); return ret; } pub fn public_key(sk: *const SecretKey) P2 { var ret: P2 = undefined; c.sk_to_pk_in_g2(&ret.point, &sk.key); return ret; } pub fn dup(self: *const P2) P2 { return self.*; } pub fn on_curve(self: *const P2) bool { return c.p2_on_curve(&self.point); } pub fn in_group(self: *const P2) bool { return c.p2_in_g2(&self.point); } pub fn is_inf(self: *const P2) bool { return c.p2_is_inf(&self.point); } pub fn is_equal(self: *const P2, p: *const P2) bool { return c.p2_is_equal(&self.point, &p.point); } pub fn aggregate(self: *P2, p: *const P2_Affine) !void { if (!c.p2_affine_in_g2(&p.point)) { return Error.POINT_NOT_IN_GROUP; } c.p2_add_or_double_affine(&self.point, &self.point, &p.point); } pub fn hash_to(msg: []const u8, DST: []const u8, aug: ?[]const u8) P2 { const opt = aug orelse &[_]u8{}; var ret: P2 = undefined; c.hash_to_g2(&ret.point, @ptrCast(msg), msg.len, @ptrCast(DST), DST.len, @ptrCast(opt), opt.len); return ret; } pub fn encode_to(msg: []const u8, DST: []const u8, aug: ?[]const u8) P2 { const opt = aug orelse &[_]u8{}; var ret: P2 = undefined; c.encode_to_g2(&ret.point, @ptrCast(msg), msg.len, @ptrCast(DST), DST.len, @ptrCast(opt), opt.len); return ret; } pub fn 
sign_with(self: *const P2, sk: *const SecretKey) *P2 { c.sign_pk_in_g1(@constCast(&self.point), &self.point, &sk.key); return @constCast(self); } pub fn to_affine(self: *const P2) P2_Affine { var ret: P2_Affine = undefined; c.p2_to_affine(&ret.point, &self.point); return ret; } pub fn generator() P2 { return P2{ .point = c.p2_generator().*, }; } }; ================================================ FILE: bindings/zig/c.zig ================================================ // automatically generated with 'zig translate-c' const BLST_SUCCESS: c_int = 0; const BLST_BAD_ENCODING: c_int = 1; const BLST_POINT_NOT_ON_CURVE: c_int = 2; const BLST_POINT_NOT_IN_GROUP: c_int = 3; const BLST_AGGR_TYPE_MISMATCH: c_int = 4; const BLST_VERIFY_FAIL: c_int = 5; const BLST_PK_IS_INFINITY: c_int = 6; const BLST_BAD_SCALAR: c_int = 7; const BLST_ERROR = c_uint; pub const byte = u8; pub const limb_t = u64; const blst_scalar = extern struct { b: [32]byte = @import("std").mem.zeroes([32]byte), }; const blst_fr = extern struct { l: [4]limb_t = @import("std").mem.zeroes([4]limb_t), }; const blst_fp = extern struct { l: [6]limb_t = @import("std").mem.zeroes([6]limb_t), }; const blst_fp2 = extern struct { fp: [2]blst_fp = @import("std").mem.zeroes([2]blst_fp), }; const blst_fp6 = extern struct { fp2: [3]blst_fp2 = @import("std").mem.zeroes([3]blst_fp2), }; const blst_fp12 = extern struct { fp6: [2]blst_fp6 = @import("std").mem.zeroes([2]blst_fp6), }; extern fn blst_scalar_from_uint32(out: [*c]blst_scalar, a: [*c]const u32) void; extern fn blst_uint32_from_scalar(out: [*c]u32, a: [*c]const blst_scalar) void; extern fn blst_scalar_from_uint64(out: [*c]blst_scalar, a: [*c]const u64) void; extern fn blst_uint64_from_scalar(out: [*c]u64, a: [*c]const blst_scalar) void; extern fn blst_scalar_from_bendian(out: [*c]blst_scalar, a: [*c]const byte) void; extern fn blst_bendian_from_scalar(out: [*c]byte, a: [*c]const blst_scalar) void; extern fn blst_scalar_from_lendian(out: [*c]blst_scalar, a: [*c]const byte) void; extern fn blst_lendian_from_scalar(out: [*c]byte, a: [*c]const blst_scalar) void; extern fn blst_scalar_fr_check(a: [*c]const blst_scalar) bool; extern fn blst_sk_check(a: [*c]const blst_scalar) bool; extern fn blst_sk_add_n_check(out: [*c]blst_scalar, a: [*c]const blst_scalar, b: [*c]const blst_scalar) bool; extern fn blst_sk_sub_n_check(out: [*c]blst_scalar, a: [*c]const blst_scalar, b: [*c]const blst_scalar) bool; extern fn blst_sk_mul_n_check(out: [*c]blst_scalar, a: [*c]const blst_scalar, b: [*c]const blst_scalar) bool; extern fn blst_sk_inverse(out: [*c]blst_scalar, a: [*c]const blst_scalar) void; extern fn blst_scalar_from_le_bytes(out: [*c]blst_scalar, in: [*c]const byte, len: usize) bool; extern fn blst_scalar_from_be_bytes(out: [*c]blst_scalar, in: [*c]const byte, len: usize) bool; extern fn blst_fr_add(ret: [*c]blst_fr, a: [*c]const blst_fr, b: [*c]const blst_fr) void; extern fn blst_fr_sub(ret: [*c]blst_fr, a: [*c]const blst_fr, b: [*c]const blst_fr) void; extern fn blst_fr_mul_by_3(ret: [*c]blst_fr, a: [*c]const blst_fr) void; extern fn blst_fr_lshift(ret: [*c]blst_fr, a: [*c]const blst_fr, count: usize) void; extern fn blst_fr_rshift(ret: [*c]blst_fr, a: [*c]const blst_fr, count: usize) void; extern fn blst_fr_mul(ret: [*c]blst_fr, a: [*c]const blst_fr, b: [*c]const blst_fr) void; extern fn blst_fr_sqr(ret: [*c]blst_fr, a: [*c]const blst_fr) void; extern fn blst_fr_cneg(ret: [*c]blst_fr, a: [*c]const blst_fr, flag: bool) void; extern fn blst_fr_eucl_inverse(ret: [*c]blst_fr, a: [*c]const blst_fr) void; 
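// (sanity sketch, not part of the translate-c output: these extern structs
// cross the C ABI, so their sizes must agree with blst.h byte-for-byte;
// the comptime checks below fail the build if the limb layouts ever drift)
comptime {
    if (@sizeOf(blst_scalar) != 32) @compileError("blst_scalar must be 32 bytes");
    if (@sizeOf(blst_fp) != 6 * @sizeOf(limb_t)) @compileError("blst_fp must be six limbs");
    if (@sizeOf(blst_fp12) != 12 * @sizeOf(blst_fp)) @compileError("blst_fp12 must be twelve fp elements");
}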
extern fn blst_fr_inverse(ret: [*c]blst_fr, a: [*c]const blst_fr) void; extern fn blst_fr_from_uint64(ret: [*c]blst_fr, a: [*c]const u64) void; extern fn blst_uint64_from_fr(ret: [*c]u64, a: [*c]const blst_fr) void; extern fn blst_fr_from_scalar(ret: [*c]blst_fr, a: [*c]const blst_scalar) void; extern fn blst_scalar_from_fr(ret: [*c]blst_scalar, a: [*c]const blst_fr) void; extern fn blst_fp_add(ret: [*c]blst_fp, a: [*c]const blst_fp, b: [*c]const blst_fp) void; extern fn blst_fp_sub(ret: [*c]blst_fp, a: [*c]const blst_fp, b: [*c]const blst_fp) void; extern fn blst_fp_mul_by_3(ret: [*c]blst_fp, a: [*c]const blst_fp) void; extern fn blst_fp_mul_by_8(ret: [*c]blst_fp, a: [*c]const blst_fp) void; extern fn blst_fp_lshift(ret: [*c]blst_fp, a: [*c]const blst_fp, count: usize) void; extern fn blst_fp_mul(ret: [*c]blst_fp, a: [*c]const blst_fp, b: [*c]const blst_fp) void; extern fn blst_fp_sqr(ret: [*c]blst_fp, a: [*c]const blst_fp) void; extern fn blst_fp_cneg(ret: [*c]blst_fp, a: [*c]const blst_fp, flag: bool) void; extern fn blst_fp_eucl_inverse(ret: [*c]blst_fp, a: [*c]const blst_fp) void; extern fn blst_fp_inverse(ret: [*c]blst_fp, a: [*c]const blst_fp) void; extern fn blst_fp_sqrt(ret: [*c]blst_fp, a: [*c]const blst_fp) bool; extern fn blst_fp_from_uint32(ret: [*c]blst_fp, a: [*c]const u32) void; extern fn blst_uint32_from_fp(ret: [*c]u32, a: [*c]const blst_fp) void; extern fn blst_fp_from_uint64(ret: [*c]blst_fp, a: [*c]const u64) void; extern fn blst_uint64_from_fp(ret: [*c]u64, a: [*c]const blst_fp) void; extern fn blst_fp_from_bendian(ret: [*c]blst_fp, a: [*c]const byte) void; extern fn blst_bendian_from_fp(ret: [*c]byte, a: [*c]const blst_fp) void; extern fn blst_fp_from_lendian(ret: [*c]blst_fp, a: [*c]const byte) void; extern fn blst_lendian_from_fp(ret: [*c]byte, a: [*c]const blst_fp) void; extern fn blst_fp2_add(ret: [*c]blst_fp2, a: [*c]const blst_fp2, b: [*c]const blst_fp2) void; extern fn blst_fp2_sub(ret: [*c]blst_fp2, a: [*c]const blst_fp2, b: [*c]const blst_fp2) void; extern fn blst_fp2_mul_by_3(ret: [*c]blst_fp2, a: [*c]const blst_fp2) void; extern fn blst_fp2_mul_by_8(ret: [*c]blst_fp2, a: [*c]const blst_fp2) void; extern fn blst_fp2_lshift(ret: [*c]blst_fp2, a: [*c]const blst_fp2, count: usize) void; extern fn blst_fp2_mul(ret: [*c]blst_fp2, a: [*c]const blst_fp2, b: [*c]const blst_fp2) void; extern fn blst_fp2_sqr(ret: [*c]blst_fp2, a: [*c]const blst_fp2) void; extern fn blst_fp2_cneg(ret: [*c]blst_fp2, a: [*c]const blst_fp2, flag: bool) void; extern fn blst_fp2_eucl_inverse(ret: [*c]blst_fp2, a: [*c]const blst_fp2) void; extern fn blst_fp2_inverse(ret: [*c]blst_fp2, a: [*c]const blst_fp2) void; extern fn blst_fp2_sqrt(ret: [*c]blst_fp2, a: [*c]const blst_fp2) bool; extern fn blst_fp12_sqr(ret: [*c]blst_fp12, a: [*c]const blst_fp12) void; extern fn blst_fp12_cyclotomic_sqr(ret: [*c]blst_fp12, a: [*c]const blst_fp12) void; extern fn blst_fp12_mul(ret: [*c]blst_fp12, a: [*c]const blst_fp12, b: [*c]const blst_fp12) void; extern fn blst_fp12_mul_by_xy00z0(ret: [*c]blst_fp12, a: [*c]const blst_fp12, xy00z0: [*c]const blst_fp6) void; extern fn blst_fp12_conjugate(a: [*c]blst_fp12) void; extern fn blst_fp12_inverse(ret: [*c]blst_fp12, a: [*c]const blst_fp12) void; extern fn blst_fp12_frobenius_map(ret: [*c]blst_fp12, a: [*c]const blst_fp12, n: usize) void; extern fn blst_fp12_is_equal(a: [*c]const blst_fp12, b: [*c]const blst_fp12) bool; extern fn blst_fp12_is_one(a: [*c]const blst_fp12) bool; extern fn blst_fp12_in_group(a: [*c]const blst_fp12) bool; extern fn blst_fp12_one() 
[*c]const blst_fp12; const blst_p1 = extern struct { x: blst_fp = @import("std").mem.zeroes(blst_fp), y: blst_fp = @import("std").mem.zeroes(blst_fp), z: blst_fp = @import("std").mem.zeroes(blst_fp), }; const blst_p1_affine = extern struct { x: blst_fp = @import("std").mem.zeroes(blst_fp), y: blst_fp = @import("std").mem.zeroes(blst_fp), }; extern fn blst_p1_add(out: [*c]blst_p1, a: [*c]const blst_p1, b: [*c]const blst_p1) void; extern fn blst_p1_add_or_double(out: [*c]blst_p1, a: [*c]const blst_p1, b: [*c]const blst_p1) void; extern fn blst_p1_add_affine(out: [*c]blst_p1, a: [*c]const blst_p1, b: [*c]const blst_p1_affine) void; extern fn blst_p1_add_or_double_affine(out: [*c]blst_p1, a: [*c]const blst_p1, b: [*c]const blst_p1_affine) void; extern fn blst_p1_double(out: [*c]blst_p1, a: [*c]const blst_p1) void; extern fn blst_p1_mult(out: [*c]blst_p1, p: [*c]const blst_p1, scalar: [*c]const byte, nbits: usize) void; extern fn blst_p1_cneg(p: [*c]blst_p1, cbit: bool) void; extern fn blst_p1_to_affine(out: [*c]blst_p1_affine, in: [*c]const blst_p1) void; extern fn blst_p1_from_affine(out: [*c]blst_p1, in: [*c]const blst_p1_affine) void; extern fn blst_p1_on_curve(p: [*c]const blst_p1) bool; extern fn blst_p1_in_g1(p: [*c]const blst_p1) bool; extern fn blst_p1_is_equal(a: [*c]const blst_p1, b: [*c]const blst_p1) bool; extern fn blst_p1_is_inf(a: [*c]const blst_p1) bool; extern fn blst_p1_generator() [*c]const blst_p1; extern fn blst_p1_affine_on_curve(p: [*c]const blst_p1_affine) bool; extern fn blst_p1_affine_in_g1(p: [*c]const blst_p1_affine) bool; extern fn blst_p1_affine_is_equal(a: [*c]const blst_p1_affine, b: [*c]const blst_p1_affine) bool; extern fn blst_p1_affine_is_inf(a: [*c]const blst_p1_affine) bool; extern fn blst_p1_affine_generator() [*c]const blst_p1_affine; const blst_p2 = extern struct { x: blst_fp2 = @import("std").mem.zeroes(blst_fp2), y: blst_fp2 = @import("std").mem.zeroes(blst_fp2), z: blst_fp2 = @import("std").mem.zeroes(blst_fp2), }; const blst_p2_affine = extern struct { x: blst_fp2 = @import("std").mem.zeroes(blst_fp2), y: blst_fp2 = @import("std").mem.zeroes(blst_fp2), }; extern fn blst_p2_add(out: [*c]blst_p2, a: [*c]const blst_p2, b: [*c]const blst_p2) void; extern fn blst_p2_add_or_double(out: [*c]blst_p2, a: [*c]const blst_p2, b: [*c]const blst_p2) void; extern fn blst_p2_add_affine(out: [*c]blst_p2, a: [*c]const blst_p2, b: [*c]const blst_p2_affine) void; extern fn blst_p2_add_or_double_affine(out: [*c]blst_p2, a: [*c]const blst_p2, b: [*c]const blst_p2_affine) void; extern fn blst_p2_double(out: [*c]blst_p2, a: [*c]const blst_p2) void; extern fn blst_p2_mult(out: [*c]blst_p2, p: [*c]const blst_p2, scalar: [*c]const byte, nbits: usize) void; extern fn blst_p2_cneg(p: [*c]blst_p2, cbit: bool) void; extern fn blst_p2_to_affine(out: [*c]blst_p2_affine, in: [*c]const blst_p2) void; extern fn blst_p2_from_affine(out: [*c]blst_p2, in: [*c]const blst_p2_affine) void; extern fn blst_p2_on_curve(p: [*c]const blst_p2) bool; extern fn blst_p2_in_g2(p: [*c]const blst_p2) bool; extern fn blst_p2_is_equal(a: [*c]const blst_p2, b: [*c]const blst_p2) bool; extern fn blst_p2_is_inf(a: [*c]const blst_p2) bool; extern fn blst_p2_generator() [*c]const blst_p2; extern fn blst_p2_affine_on_curve(p: [*c]const blst_p2_affine) bool; extern fn blst_p2_affine_in_g2(p: [*c]const blst_p2_affine) bool; extern fn blst_p2_affine_is_equal(a: [*c]const blst_p2_affine, b: [*c]const blst_p2_affine) bool; extern fn blst_p2_affine_is_inf(a: [*c]const blst_p2_affine) bool; extern fn 
blst_p2_affine_generator() [*c]const blst_p2_affine; extern fn blst_p1s_to_affine(dst: [*c]blst_p1_affine, points: [*c]const [*c]const blst_p1, npoints: usize) void; extern fn blst_p1s_add(ret: [*c]blst_p1, points: [*c]const [*c]const blst_p1_affine, npoints: usize) void; extern fn blst_p1s_mult_wbits_precompute_sizeof(wbits: usize, npoints: usize) usize; extern fn blst_p1s_mult_wbits_precompute(table: [*c]blst_p1_affine, wbits: usize, points: [*c]const [*c]const blst_p1_affine, npoints: usize) void; extern fn blst_p1s_mult_wbits_scratch_sizeof(npoints: usize) usize; extern fn blst_p1s_mult_wbits(ret: [*c]blst_p1, table: [*c]const blst_p1_affine, wbits: usize, npoints: usize, scalars: [*c]const [*c]const byte, nbits: usize, scratch: [*c]limb_t) void; extern fn blst_p1s_mult_pippenger_scratch_sizeof(npoints: usize) usize; extern fn blst_p1s_mult_pippenger(ret: [*c]blst_p1, points: [*c]const [*c]const blst_p1_affine, npoints: usize, scalars: [*c]const [*c]const byte, nbits: usize, scratch: [*c]limb_t) void; extern fn blst_p1s_tile_pippenger(ret: [*c]blst_p1, points: [*c]const [*c]const blst_p1_affine, npoints: usize, scalars: [*c]const [*c]const byte, nbits: usize, scratch: [*c]limb_t, bit0: usize, window: usize) void; extern fn blst_p2s_to_affine(dst: [*c]blst_p2_affine, points: [*c]const [*c]const blst_p2, npoints: usize) void; extern fn blst_p2s_add(ret: [*c]blst_p2, points: [*c]const [*c]const blst_p2_affine, npoints: usize) void; extern fn blst_p2s_mult_wbits_precompute_sizeof(wbits: usize, npoints: usize) usize; extern fn blst_p2s_mult_wbits_precompute(table: [*c]blst_p2_affine, wbits: usize, points: [*c]const [*c]const blst_p2_affine, npoints: usize) void; extern fn blst_p2s_mult_wbits_scratch_sizeof(npoints: usize) usize; extern fn blst_p2s_mult_wbits(ret: [*c]blst_p2, table: [*c]const blst_p2_affine, wbits: usize, npoints: usize, scalars: [*c]const [*c]const byte, nbits: usize, scratch: [*c]limb_t) void; extern fn blst_p2s_mult_pippenger_scratch_sizeof(npoints: usize) usize; extern fn blst_p2s_mult_pippenger(ret: [*c]blst_p2, points: [*c]const [*c]const blst_p2_affine, npoints: usize, scalars: [*c]const [*c]const byte, nbits: usize, scratch: [*c]limb_t) void; extern fn blst_p2s_tile_pippenger(ret: [*c]blst_p2, points: [*c]const [*c]const blst_p2_affine, npoints: usize, scalars: [*c]const [*c]const byte, nbits: usize, scratch: [*c]limb_t, bit0: usize, window: usize) void; extern fn blst_map_to_g1(out: [*c]blst_p1, u: [*c]const blst_fp, v: [*c]const blst_fp) void; extern fn blst_map_to_g2(out: [*c]blst_p2, u: [*c]const blst_fp2, v: [*c]const blst_fp2) void; extern fn blst_encode_to_g1(out: [*c]blst_p1, msg: [*c]const byte, msg_len: usize, DST: [*c]const byte, DST_len: usize, aug: [*c]const byte, aug_len: usize) void; extern fn blst_hash_to_g1(out: [*c]blst_p1, msg: [*c]const byte, msg_len: usize, DST: [*c]const byte, DST_len: usize, aug: [*c]const byte, aug_len: usize) void; extern fn blst_encode_to_g2(out: [*c]blst_p2, msg: [*c]const byte, msg_len: usize, DST: [*c]const byte, DST_len: usize, aug: [*c]const byte, aug_len: usize) void; extern fn blst_hash_to_g2(out: [*c]blst_p2, msg: [*c]const byte, msg_len: usize, DST: [*c]const byte, DST_len: usize, aug: [*c]const byte, aug_len: usize) void; extern fn blst_p1_serialize(out: [*c]byte, in: [*c]const blst_p1) void; extern fn blst_p1_compress(out: [*c]byte, in: [*c]const blst_p1) void; extern fn blst_p1_affine_serialize(out: [*c]byte, in: [*c]const blst_p1_affine) void; extern fn blst_p1_affine_compress(out: [*c]byte, in: [*c]const 
blst_p1_affine) void; extern fn blst_p1_uncompress(out: [*c]blst_p1_affine, in: [*c]const byte) BLST_ERROR; extern fn blst_p1_deserialize(out: [*c]blst_p1_affine, in: [*c]const byte) BLST_ERROR; extern fn blst_p2_serialize(out: [*c]byte, in: [*c]const blst_p2) void; extern fn blst_p2_compress(out: [*c]byte, in: [*c]const blst_p2) void; extern fn blst_p2_affine_serialize(out: [*c]byte, in: [*c]const blst_p2_affine) void; extern fn blst_p2_affine_compress(out: [*c]byte, in: [*c]const blst_p2_affine) void; extern fn blst_p2_uncompress(out: [*c]blst_p2_affine, in: [*c]const byte) BLST_ERROR; extern fn blst_p2_deserialize(out: [*c]blst_p2_affine, in: [*c]const byte) BLST_ERROR; extern fn blst_keygen(out_SK: [*c]blst_scalar, IKM: [*c]const byte, IKM_len: usize, info: [*c]const byte, info_len: usize) void; extern fn blst_sk_to_pk_in_g1(out_pk: [*c]blst_p1, SK: [*c]const blst_scalar) void; extern fn blst_sign_pk_in_g1(out_sig: [*c]blst_p2, hash: [*c]const blst_p2, SK: [*c]const blst_scalar) void; extern fn blst_sk_to_pk_in_g2(out_pk: [*c]blst_p2, SK: [*c]const blst_scalar) void; extern fn blst_sign_pk_in_g2(out_sig: [*c]blst_p1, hash: [*c]const blst_p1, SK: [*c]const blst_scalar) void; extern fn blst_miller_loop(ret: [*c]blst_fp12, Q: [*c]const blst_p2_affine, P: [*c]const blst_p1_affine) void; extern fn blst_miller_loop_n(ret: [*c]blst_fp12, Qs: [*c]const [*c]const blst_p2_affine, Ps: [*c]const [*c]const blst_p1_affine, n: usize) void; extern fn blst_final_exp(ret: [*c]blst_fp12, f: [*c]const blst_fp12) void; extern fn blst_precompute_lines(Qlines: [*c]blst_fp6, Q: [*c]const blst_p2_affine) void; extern fn blst_miller_loop_lines(ret: [*c]blst_fp12, Qlines: [*c]const blst_fp6, P: [*c]const blst_p1_affine) void; extern fn blst_fp12_finalverify(gt1: [*c]const blst_fp12, gt2: [*c]const blst_fp12) bool; pub const struct_blst_opaque = opaque {}; const blst_pairing = struct_blst_opaque; extern fn blst_pairing_sizeof() usize; extern fn blst_pairing_init(new_ctx: ?*blst_pairing, hash_or_encode: bool, DST: [*c]const byte, DST_len: usize) void; extern fn blst_pairing_get_dst(ctx: ?*const blst_pairing) [*c]const byte; extern fn blst_pairing_commit(ctx: ?*blst_pairing) void; extern fn blst_pairing_aggregate_pk_in_g2(ctx: ?*blst_pairing, PK: [*c]const blst_p2_affine, signature: [*c]const blst_p1_affine, msg: [*c]const byte, msg_len: usize, aug: [*c]const byte, aug_len: usize) BLST_ERROR; extern fn blst_pairing_chk_n_aggr_pk_in_g2(ctx: ?*blst_pairing, PK: [*c]const blst_p2_affine, pk_grpchk: bool, signature: [*c]const blst_p1_affine, sig_grpchk: bool, msg: [*c]const byte, msg_len: usize, aug: [*c]const byte, aug_len: usize) BLST_ERROR; extern fn blst_pairing_mul_n_aggregate_pk_in_g2(ctx: ?*blst_pairing, PK: [*c]const blst_p2_affine, sig: [*c]const blst_p1_affine, scalar: [*c]const byte, nbits: usize, msg: [*c]const byte, msg_len: usize, aug: [*c]const byte, aug_len: usize) BLST_ERROR; extern fn blst_pairing_chk_n_mul_n_aggr_pk_in_g2(ctx: ?*blst_pairing, PK: [*c]const blst_p2_affine, pk_grpchk: bool, sig: [*c]const blst_p1_affine, sig_grpchk: bool, scalar: [*c]const byte, nbits: usize, msg: [*c]const byte, msg_len: usize, aug: [*c]const byte, aug_len: usize) BLST_ERROR; extern fn blst_pairing_aggregate_pk_in_g1(ctx: ?*blst_pairing, PK: [*c]const blst_p1_affine, signature: [*c]const blst_p2_affine, msg: [*c]const byte, msg_len: usize, aug: [*c]const byte, aug_len: usize) BLST_ERROR; extern fn blst_pairing_chk_n_aggr_pk_in_g1(ctx: ?*blst_pairing, PK: [*c]const blst_p1_affine, pk_grpchk: bool, signature: [*c]const 
blst_p2_affine, sig_grpchk: bool, msg: [*c]const byte, msg_len: usize, aug: [*c]const byte, aug_len: usize) BLST_ERROR; extern fn blst_pairing_mul_n_aggregate_pk_in_g1(ctx: ?*blst_pairing, PK: [*c]const blst_p1_affine, sig: [*c]const blst_p2_affine, scalar: [*c]const byte, nbits: usize, msg: [*c]const byte, msg_len: usize, aug: [*c]const byte, aug_len: usize) BLST_ERROR; extern fn blst_pairing_chk_n_mul_n_aggr_pk_in_g1(ctx: ?*blst_pairing, PK: [*c]const blst_p1_affine, pk_grpchk: bool, sig: [*c]const blst_p2_affine, sig_grpchk: bool, scalar: [*c]const byte, nbits: usize, msg: [*c]const byte, msg_len: usize, aug: [*c]const byte, aug_len: usize) BLST_ERROR; extern fn blst_pairing_merge(ctx: ?*blst_pairing, ctx1: ?*const blst_pairing) BLST_ERROR; extern fn blst_pairing_finalverify(ctx: ?*const blst_pairing, gtsig: [*c]const blst_fp12) bool; extern fn blst_aggregate_in_g1(out: [*c]blst_p1, in: [*c]const blst_p1, zwire: [*c]const byte) BLST_ERROR; extern fn blst_aggregate_in_g2(out: [*c]blst_p2, in: [*c]const blst_p2, zwire: [*c]const byte) BLST_ERROR; extern fn blst_aggregated_in_g1(out: [*c]blst_fp12, signature: [*c]const blst_p1_affine) void; extern fn blst_aggregated_in_g2(out: [*c]blst_fp12, signature: [*c]const blst_p2_affine) void; extern fn blst_core_verify_pk_in_g1(pk: [*c]const blst_p1_affine, signature: [*c]const blst_p2_affine, hash_or_encode: bool, msg: [*c]const byte, msg_len: usize, DST: [*c]const byte, DST_len: usize, aug: [*c]const byte, aug_len: usize) BLST_ERROR; extern fn blst_core_verify_pk_in_g2(pk: [*c]const blst_p2_affine, signature: [*c]const blst_p1_affine, hash_or_encode: bool, msg: [*c]const byte, msg_len: usize, DST: [*c]const byte, DST_len: usize, aug: [*c]const byte, aug_len: usize) BLST_ERROR; pub extern const BLS12_381_G1: blst_p1_affine; pub extern const BLS12_381_NEG_G1: blst_p1_affine; pub extern const BLS12_381_G2: blst_p2_affine; pub extern const BLS12_381_NEG_G2: blst_p2_affine; extern fn blst_fr_ct_bfly(x0: [*c]blst_fr, x1: [*c]blst_fr, twiddle: [*c]const blst_fr) void; extern fn blst_fr_gs_bfly(x0: [*c]blst_fr, x1: [*c]blst_fr, twiddle: [*c]const blst_fr) void; extern fn blst_fr_to(ret: [*c]blst_fr, a: [*c]const blst_fr) void; extern fn blst_fr_from(ret: [*c]blst_fr, a: [*c]const blst_fr) void; extern fn blst_fp_to(ret: [*c]blst_fp, a: [*c]const blst_fp) void; extern fn blst_fp_from(ret: [*c]blst_fp, a: [*c]const blst_fp) void; extern fn blst_fp_is_square(a: [*c]const blst_fp) bool; extern fn blst_fp2_is_square(a: [*c]const blst_fp2) bool; extern fn blst_p1_from_jacobian(out: [*c]blst_p1, in: [*c]const blst_p1) void; extern fn blst_p2_from_jacobian(out: [*c]blst_p2, in: [*c]const blst_p2) void; extern fn blst_sk_to_pk2_in_g1(out: [*c]byte, out_pk: [*c]blst_p1_affine, SK: [*c]const blst_scalar) void; extern fn blst_sign_pk2_in_g1(out: [*c]byte, out_sig: [*c]blst_p2_affine, hash: [*c]const blst_p2, SK: [*c]const blst_scalar) void; extern fn blst_sk_to_pk2_in_g2(out: [*c]byte, out_pk: [*c]blst_p2_affine, SK: [*c]const blst_scalar) void; extern fn blst_sign_pk2_in_g2(out: [*c]byte, out_sig: [*c]blst_p1_affine, hash: [*c]const blst_p1, SK: [*c]const blst_scalar) void; const blst_uniq = struct_blst_opaque; extern fn blst_uniq_sizeof(n_nodes: usize) usize; extern fn blst_uniq_init(tree: ?*blst_uniq) void; extern fn blst_uniq_test(tree: ?*blst_uniq, msg: [*c]const byte, len: usize) bool; extern fn blst_expand_message_xmd(out: [*c]byte, out_len: usize, msg: [*c]const byte, msg_len: usize, DST: [*c]const byte, DST_len: usize) void; extern fn 
blst_p1_unchecked_mult(out: [*c]blst_p1, p: [*c]const blst_p1, scalar: [*c]const byte, nbits: usize) void; extern fn blst_p2_unchecked_mult(out: [*c]blst_p2, p: [*c]const blst_p2, scalar: [*c]const byte, nbits: usize) void; extern fn blst_pairing_raw_aggregate(ctx: ?*blst_pairing, q: [*c]const blst_p2_affine, p: [*c]const blst_p1_affine) void; extern fn blst_pairing_as_fp12(ctx: ?*blst_pairing) [*c]blst_fp12; extern fn blst_bendian_from_fp12(out: [*c]byte, a: [*c]const blst_fp12) void; extern fn blst_keygen_v3(out_SK: [*c]blst_scalar, IKM: [*c]const byte, IKM_len: usize, info: [*c]const byte, info_len: usize) void; extern fn blst_keygen_v4_5(out_SK: [*c]blst_scalar, IKM: [*c]const byte, IKM_len: usize, salt: [*c]const byte, salt_len: usize, info: [*c]const byte, info_len: usize) void; extern fn blst_keygen_v5(out_SK: [*c]blst_scalar, IKM: [*c]const byte, IKM_len: usize, salt: [*c]const byte, salt_len: usize, info: [*c]const byte, info_len: usize) void; extern fn blst_derive_master_eip2333(out_SK: [*c]blst_scalar, IKM: [*c]const byte, IKM_len: usize) void; extern fn blst_derive_child_eip2333(out_SK: [*c]blst_scalar, SK: [*c]const blst_scalar, child_index: u32) void; extern fn blst_scalar_from_hexascii(out: [*c]blst_scalar, hex: [*c]const byte) void; extern fn blst_fr_from_hexascii(ret: [*c]blst_fr, hex: [*c]const byte) void; extern fn blst_fp_from_hexascii(ret: [*c]blst_fp, hex: [*c]const byte) void; extern fn blst_p1_sizeof() usize; extern fn blst_p1_affine_sizeof() usize; extern fn blst_p2_sizeof() usize; extern fn blst_p2_affine_sizeof() usize; extern fn blst_fp12_sizeof() usize; extern fn blst_fp_from_le_bytes(ret: [*c]blst_fp, in: [*c]const byte, len: usize) void; extern fn blst_fp_from_be_bytes(ret: [*c]blst_fp, in: [*c]const byte, len: usize) void; extern fn blst_sha256(out: [*c]byte, msg: [*c]const byte, msg_len: usize) void; // reexport symbols without blst_ prefix pub const SUCCESS = BLST_SUCCESS; pub const BAD_ENCODING = BLST_BAD_ENCODING; pub const POINT_NOT_ON_CURVE = BLST_POINT_NOT_ON_CURVE; pub const POINT_NOT_IN_GROUP = BLST_POINT_NOT_IN_GROUP; pub const AGGR_TYPE_MISMATCH = BLST_AGGR_TYPE_MISMATCH; pub const VERIFY_FAIL = BLST_VERIFY_FAIL; pub const PK_IS_INFINITY = BLST_PK_IS_INFINITY; pub const BAD_SCALAR = BLST_BAD_SCALAR; pub const ERROR = BLST_ERROR; pub const scalar = blst_scalar; pub const fr = blst_fr; pub const fp = blst_fp; pub const fp2 = blst_fp2; pub const fp6 = blst_fp6; pub const fp12 = blst_fp12; pub const scalar_from_uint32 = blst_scalar_from_uint32; pub const uint32_from_scalar = blst_uint32_from_scalar; pub const scalar_from_uint64 = blst_scalar_from_uint64; pub const uint64_from_scalar = blst_uint64_from_scalar; pub const scalar_from_bendian = blst_scalar_from_bendian; pub const bendian_from_scalar = blst_bendian_from_scalar; pub const scalar_from_lendian = blst_scalar_from_lendian; pub const lendian_from_scalar = blst_lendian_from_scalar; pub const scalar_fr_check = blst_scalar_fr_check; pub const sk_check = blst_sk_check; pub const sk_add_n_check = blst_sk_add_n_check; pub const sk_sub_n_check = blst_sk_sub_n_check; pub const sk_mul_n_check = blst_sk_mul_n_check; pub const sk_inverse = blst_sk_inverse; pub const scalar_from_le_bytes = blst_scalar_from_le_bytes; pub const scalar_from_be_bytes = blst_scalar_from_be_bytes; pub const fr_add = blst_fr_add; pub const fr_sub = blst_fr_sub; pub const fr_mul_by_3 = blst_fr_mul_by_3; pub const fr_lshift = blst_fr_lshift; pub const fr_rshift = blst_fr_rshift; pub const fr_mul = blst_fr_mul; pub const fr_sqr = 
blst_fr_sqr; pub const fr_cneg = blst_fr_cneg; pub const fr_eucl_inverse = blst_fr_eucl_inverse; pub const fr_inverse = blst_fr_inverse; pub const fr_from_uint64 = blst_fr_from_uint64; pub const uint64_from_fr = blst_uint64_from_fr; pub const fr_from_scalar = blst_fr_from_scalar; pub const scalar_from_fr = blst_scalar_from_fr; pub const fp_add = blst_fp_add; pub const fp_sub = blst_fp_sub; pub const fp_mul_by_3 = blst_fp_mul_by_3; pub const fp_mul_by_8 = blst_fp_mul_by_8; pub const fp_lshift = blst_fp_lshift; pub const fp_mul = blst_fp_mul; pub const fp_sqr = blst_fp_sqr; pub const fp_cneg = blst_fp_cneg; pub const fp_eucl_inverse = blst_fp_eucl_inverse; pub const fp_inverse = blst_fp_inverse; pub const fp_sqrt = blst_fp_sqrt; pub const fp_from_uint32 = blst_fp_from_uint32; pub const uint32_from_fp = blst_uint32_from_fp; pub const fp_from_uint64 = blst_fp_from_uint64; pub const uint64_from_fp = blst_uint64_from_fp; pub const fp_from_bendian = blst_fp_from_bendian; pub const bendian_from_fp = blst_bendian_from_fp; pub const fp_from_lendian = blst_fp_from_lendian; pub const lendian_from_fp = blst_lendian_from_fp; pub const fp2_add = blst_fp2_add; pub const fp2_sub = blst_fp2_sub; pub const fp2_mul_by_3 = blst_fp2_mul_by_3; pub const fp2_mul_by_8 = blst_fp2_mul_by_8; pub const fp2_lshift = blst_fp2_lshift; pub const fp2_mul = blst_fp2_mul; pub const fp2_sqr = blst_fp2_sqr; pub const fp2_cneg = blst_fp2_cneg; pub const fp2_eucl_inverse = blst_fp2_eucl_inverse; pub const fp2_inverse = blst_fp2_inverse; pub const fp2_sqrt = blst_fp2_sqrt; pub const fp12_sqr = blst_fp12_sqr; pub const fp12_cyclotomic_sqr = blst_fp12_cyclotomic_sqr; pub const fp12_mul = blst_fp12_mul; pub const fp12_mul_by_xy00z0 = blst_fp12_mul_by_xy00z0; pub const fp12_conjugate = blst_fp12_conjugate; pub const fp12_inverse = blst_fp12_inverse; pub const fp12_frobenius_map = blst_fp12_frobenius_map; pub const fp12_is_equal = blst_fp12_is_equal; pub const fp12_is_one = blst_fp12_is_one; pub const fp12_in_group = blst_fp12_in_group; pub const fp12_one = blst_fp12_one; pub const p1 = blst_p1; pub const p1_affine = blst_p1_affine; pub const p1_add = blst_p1_add; pub const p1_add_or_double = blst_p1_add_or_double; pub const p1_add_affine = blst_p1_add_affine; pub const p1_add_or_double_affine = blst_p1_add_or_double_affine; pub const p1_double = blst_p1_double; pub const p1_mult = blst_p1_mult; pub const p1_cneg = blst_p1_cneg; pub const p1_to_affine = blst_p1_to_affine; pub const p1_from_affine = blst_p1_from_affine; pub const p1_on_curve = blst_p1_on_curve; pub const p1_in_g1 = blst_p1_in_g1; pub const p1_is_equal = blst_p1_is_equal; pub const p1_is_inf = blst_p1_is_inf; pub const p1_generator = blst_p1_generator; pub const p1_affine_on_curve = blst_p1_affine_on_curve; pub const p1_affine_in_g1 = blst_p1_affine_in_g1; pub const p1_affine_is_equal = blst_p1_affine_is_equal; pub const p1_affine_is_inf = blst_p1_affine_is_inf; pub const p1_affine_generator = blst_p1_affine_generator; pub const p2 = blst_p2; pub const p2_affine = blst_p2_affine; pub const p2_add = blst_p2_add; pub const p2_add_or_double = blst_p2_add_or_double; pub const p2_add_affine = blst_p2_add_affine; pub const p2_add_or_double_affine = blst_p2_add_or_double_affine; pub const p2_double = blst_p2_double; pub const p2_mult = blst_p2_mult; pub const p2_cneg = blst_p2_cneg; pub const p2_to_affine = blst_p2_to_affine; pub const p2_from_affine = blst_p2_from_affine; pub const p2_on_curve = blst_p2_on_curve; pub const p2_in_g2 = blst_p2_in_g2; pub const p2_is_equal = 
blst_p2_is_equal; pub const p2_is_inf = blst_p2_is_inf; pub const p2_generator = blst_p2_generator; pub const p2_affine_on_curve = blst_p2_affine_on_curve; pub const p2_affine_in_g2 = blst_p2_affine_in_g2; pub const p2_affine_is_equal = blst_p2_affine_is_equal; pub const p2_affine_is_inf = blst_p2_affine_is_inf; pub const p2_affine_generator = blst_p2_affine_generator; pub const p1s_to_affine = blst_p1s_to_affine; pub const p1s_add = blst_p1s_add; pub const p1s_mult_wbits_precompute_sizeof = blst_p1s_mult_wbits_precompute_sizeof; pub const p1s_mult_wbits_precompute = blst_p1s_mult_wbits_precompute; pub const p1s_mult_wbits_scratch_sizeof = blst_p1s_mult_wbits_scratch_sizeof; pub const p1s_mult_wbits = blst_p1s_mult_wbits; pub const p1s_mult_pippenger_scratch_sizeof = blst_p1s_mult_pippenger_scratch_sizeof; pub const p1s_mult_pippenger = blst_p1s_mult_pippenger; pub const p1s_tile_pippenger = blst_p1s_tile_pippenger; pub const p2s_to_affine = blst_p2s_to_affine; pub const p2s_add = blst_p2s_add; pub const p2s_mult_wbits_precompute_sizeof = blst_p2s_mult_wbits_precompute_sizeof; pub const p2s_mult_wbits_precompute = blst_p2s_mult_wbits_precompute; pub const p2s_mult_wbits_scratch_sizeof = blst_p2s_mult_wbits_scratch_sizeof; pub const p2s_mult_wbits = blst_p2s_mult_wbits; pub const p2s_mult_pippenger_scratch_sizeof = blst_p2s_mult_pippenger_scratch_sizeof; pub const p2s_mult_pippenger = blst_p2s_mult_pippenger; pub const p2s_tile_pippenger = blst_p2s_tile_pippenger; pub const map_to_g1 = blst_map_to_g1; pub const map_to_g2 = blst_map_to_g2; pub const encode_to_g1 = blst_encode_to_g1; pub const hash_to_g1 = blst_hash_to_g1; pub const encode_to_g2 = blst_encode_to_g2; pub const hash_to_g2 = blst_hash_to_g2; pub const p1_serialize = blst_p1_serialize; pub const p1_compress = blst_p1_compress; pub const p1_affine_serialize = blst_p1_affine_serialize; pub const p1_affine_compress = blst_p1_affine_compress; pub const p1_uncompress = blst_p1_uncompress; pub const p1_deserialize = blst_p1_deserialize; pub const p2_serialize = blst_p2_serialize; pub const p2_compress = blst_p2_compress; pub const p2_affine_serialize = blst_p2_affine_serialize; pub const p2_affine_compress = blst_p2_affine_compress; pub const p2_uncompress = blst_p2_uncompress; pub const p2_deserialize = blst_p2_deserialize; pub const keygen = blst_keygen; pub const sk_to_pk_in_g1 = blst_sk_to_pk_in_g1; pub const sign_pk_in_g1 = blst_sign_pk_in_g1; pub const sk_to_pk_in_g2 = blst_sk_to_pk_in_g2; pub const sign_pk_in_g2 = blst_sign_pk_in_g2; pub const miller_loop = blst_miller_loop; pub const miller_loop_n = blst_miller_loop_n; pub const final_exp = blst_final_exp; pub const precompute_lines = blst_precompute_lines; pub const miller_loop_lines = blst_miller_loop_lines; pub const fp12_finalverify = blst_fp12_finalverify; pub const pairing = blst_pairing; pub const pairing_sizeof = blst_pairing_sizeof; pub const pairing_init = blst_pairing_init; pub const pairing_get_dst = blst_pairing_get_dst; pub const pairing_commit = blst_pairing_commit; pub const pairing_aggregate_pk_in_g2 = blst_pairing_aggregate_pk_in_g2; pub const pairing_chk_n_aggr_pk_in_g2 = blst_pairing_chk_n_aggr_pk_in_g2; pub const pairing_mul_n_aggregate_pk_in_g2 = blst_pairing_mul_n_aggregate_pk_in_g2; pub const pairing_chk_n_mul_n_aggr_pk_in_g2 = blst_pairing_chk_n_mul_n_aggr_pk_in_g2; pub const pairing_aggregate_pk_in_g1 = blst_pairing_aggregate_pk_in_g1; pub const pairing_chk_n_aggr_pk_in_g1 = blst_pairing_chk_n_aggr_pk_in_g1; pub const pairing_mul_n_aggregate_pk_in_g1 = 
blst_pairing_mul_n_aggregate_pk_in_g1; pub const pairing_chk_n_mul_n_aggr_pk_in_g1 = blst_pairing_chk_n_mul_n_aggr_pk_in_g1; pub const pairing_merge = blst_pairing_merge; pub const pairing_finalverify = blst_pairing_finalverify; pub const aggregate_in_g1 = blst_aggregate_in_g1; pub const aggregate_in_g2 = blst_aggregate_in_g2; pub const aggregated_in_g1 = blst_aggregated_in_g1; pub const aggregated_in_g2 = blst_aggregated_in_g2; pub const core_verify_pk_in_g1 = blst_core_verify_pk_in_g1; pub const core_verify_pk_in_g2 = blst_core_verify_pk_in_g2; pub const fr_ct_bfly = blst_fr_ct_bfly; pub const fr_gs_bfly = blst_fr_gs_bfly; pub const fr_to = blst_fr_to; pub const fr_from = blst_fr_from; pub const fp_to = blst_fp_to; pub const fp_from = blst_fp_from; pub const fp_is_square = blst_fp_is_square; pub const fp2_is_square = blst_fp2_is_square; pub const p1_from_jacobian = blst_p1_from_jacobian; pub const p2_from_jacobian = blst_p2_from_jacobian; pub const sk_to_pk2_in_g1 = blst_sk_to_pk2_in_g1; pub const sign_pk2_in_g1 = blst_sign_pk2_in_g1; pub const sk_to_pk2_in_g2 = blst_sk_to_pk2_in_g2; pub const sign_pk2_in_g2 = blst_sign_pk2_in_g2; pub const uniq = blst_uniq; pub const uniq_sizeof = blst_uniq_sizeof; pub const uniq_init = blst_uniq_init; pub const uniq_test = blst_uniq_test; pub const expand_message_xmd = blst_expand_message_xmd; pub const p1_unchecked_mult = blst_p1_unchecked_mult; pub const p2_unchecked_mult = blst_p2_unchecked_mult; pub const pairing_raw_aggregate = blst_pairing_raw_aggregate; pub const pairing_as_fp12 = blst_pairing_as_fp12; pub const bendian_from_fp12 = blst_bendian_from_fp12; pub const keygen_v3 = blst_keygen_v3; pub const keygen_v4_5 = blst_keygen_v4_5; pub const keygen_v5 = blst_keygen_v5; pub const derive_master_eip2333 = blst_derive_master_eip2333; pub const derive_child_eip2333 = blst_derive_child_eip2333; pub const scalar_from_hexascii = blst_scalar_from_hexascii; pub const fr_from_hexascii = blst_fr_from_hexascii; pub const fp_from_hexascii = blst_fp_from_hexascii; pub const p1_sizeof = blst_p1_sizeof; pub const p1_affine_sizeof = blst_p1_affine_sizeof; pub const p2_sizeof = blst_p2_sizeof; pub const p2_affine_sizeof = blst_p2_affine_sizeof; pub const fp12_sizeof = blst_fp12_sizeof; pub const fp_from_le_bytes = blst_fp_from_le_bytes; pub const fp_from_be_bytes = blst_fp_from_be_bytes; pub const sha256 = blst_sha256; ================================================ FILE: bindings/zig/generate.py ================================================ #!/usr/bin/env python3 import os, re, sys, subprocess top_zig = """ // Copyright Supranational LLC // SPDX-License-Identifier: Apache-2.0 const std = @import("std"); pub const c = @import("c.zig"); pub const Error = error{ BAD_ENCODING, POINT_NOT_ON_CURVE, POINT_NOT_IN_GROUP, AGGR_TYPE_MISMATCH, VERIFY_FAIL, PK_IS_INFINITY, BAD_SCALAR, Unknown, }; pub const ERROR = enum(c.ERROR) { SUCCESS = c.SUCCESS, BAD_ENCODING = c.BAD_ENCODING, POINT_NOT_ON_CURVE = c.POINT_NOT_ON_CURVE, POINT_NOT_IN_GROUP = c.POINT_NOT_IN_GROUP, AGGR_TYPE_MISMATCH = c.AGGR_TYPE_MISMATCH, VERIFY_FAIL = c.VERIFY_FAIL, PK_IS_INFINITY = c.PK_IS_INFINITY, BAD_SCALAR = c.BAD_SCALAR, pub fn as_error(self: ERROR) Error { return switch (self) { .BAD_ENCODING => Error.BAD_ENCODING, .POINT_NOT_ON_CURVE => Error.POINT_NOT_ON_CURVE, .POINT_NOT_IN_GROUP => Error.POINT_NOT_IN_GROUP, .AGGR_TYPE_MISMATCH => Error.AGGR_TYPE_MISMATCH, .VERIFY_FAIL => Error.VERIFY_FAIL, .PK_IS_INFINITY => Error.PK_IS_INFINITY, .BAD_SCALAR => Error.BAD_SCALAR, else => Error.Unknown, }; } 
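    // ERROR mirrors the C-level BLST_ERROR codes one-to-one; as_error()
    // above maps any non-SUCCESS code into the Zig error set.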
};

pub const SecretKey = struct {
    key: c.scalar = c.scalar{},

    pub fn keygen(self: *SecretKey, IKM: []const u8, info: ?[]const u8) void {
        const opt = info orelse &[_]u8{};
        c.keygen(&self.key, @ptrCast(IKM), IKM.len, @ptrCast(opt), opt.len);
    }

    pub fn deinit(self: *SecretKey) void {
        self.key = c.scalar{};
    }
};

pub const PT = c.fp12;

pub const Pairing = struct {
    ctx: []u64 = &[_]u64{},
    allocator: std.mem.Allocator,

    pub fn init(hash_or_encode: bool, DST: []const u8, allocator: std.mem.Allocator) !Pairing {
        const nlimbs = (c.pairing_sizeof() + @sizeOf(u64) - 1) / @sizeOf(u64);
        const buffer = try allocator.alloc(u64, nlimbs);
        c.pairing_init(@ptrCast(buffer), hash_or_encode, &DST[0], DST.len);
        return Pairing{
            .ctx = buffer,
            .allocator = allocator,
        };
    }

    pub fn deinit(self: *Pairing) void {
        self.allocator.free(self.ctx);
        self.ctx = &[_]u64{};
    }

    pub fn aggregate(self: *Pairing, pk: anytype, sig: anytype, msg: []const u8, aug: ?[]const u8) ERROR {
        const opt = aug orelse &[_]u8{};
        var err: c.ERROR = undefined;
        switch (@TypeOf(pk)) {
            *const P1_Affine, *P1_Affine => {
                const sigp: [*c]const c.p2_affine = switch (@TypeOf(sig)) {
                    @TypeOf(null) => null,
                    else => &sig.point,
                };
                err = c.pairing_aggregate_pk_in_g1(@ptrCast(self.ctx), &pk.point, sigp, @ptrCast(msg), msg.len, @ptrCast(opt), opt.len);
            },
            *const P2_Affine, *P2_Affine => {
                const sigp: [*c]const c.p1_affine = switch (@TypeOf(sig)) {
                    @TypeOf(null) => null,
                    else => &sig.point,
                };
                err = c.pairing_aggregate_pk_in_g2(@ptrCast(self.ctx), &pk.point, sigp, @ptrCast(msg), msg.len, @ptrCast(opt), opt.len);
            },
            else => |T| @compileError("expected type '*const blst.P1_Affine' " ++
                "or '*const blst.P2_Affine', found '" ++ @typeName(T) ++ "'"),
        }
        return @as(ERROR, @enumFromInt(err));
    }

    pub fn commit(self: *Pairing) void {
        c.pairing_commit(@ptrCast(self.ctx));
    }

    pub fn merge(self: *Pairing, second: *const Pairing) ERROR {
        // c.pairing_merge() returns a raw c.ERROR code, which Zig will not
        // implicitly coerce to the ERROR enum; convert it explicitly.
        const err = c.pairing_merge(@ptrCast(self.ctx), @ptrCast(second.ctx));
        return @as(ERROR, @enumFromInt(err));
    }

    pub fn finalverify(self: *Pairing, optional: ?*const PT) bool {
        return c.pairing_finalverify(@ptrCast(self.ctx), optional);
    }

    pub fn raw_aggregate(self: *Pairing, q: *const P2_Affine, p: *const P1_Affine) void {
        c.pairing_raw_aggregate(@ptrCast(self.ctx), &q.point, &p.point);
    }

    pub fn as_fp12(self: *Pairing) *const PT {
        return c.pairing_as_fp12(@ptrCast(self.ctx));
    }
};

pub const Uniq = struct {
    tree: []u64 = &[_]u64{},
    allocator: std.mem.Allocator,

    pub fn init(n: usize, allocator: std.mem.Allocator) !Uniq {
        const nlimbs = (c.uniq_sizeof(n) + @sizeOf(u64) - 1) / @sizeOf(u64);
        const buffer = try allocator.alloc(u64, nlimbs);
        c.uniq_init(@ptrCast(buffer));
        return Uniq{
            .tree = buffer,
            .allocator = allocator,
        };
    }

    pub fn deinit(self: *Uniq) void {
        self.allocator.free(self.tree);
        self.tree = &[_]u64{};
    }

    pub fn is_uniq(self: *Uniq, msg: []const u8) bool {
        return c.uniq_test(@ptrCast(self.tree), @ptrCast(msg), msg.len);
    }
};

const FP_BYTES = 384/8;
pub const P1_COMPRESS_BYTES = FP_BYTES;
pub const P1_SERIALIZE_BYTES = FP_BYTES*2;
pub const P2_COMPRESS_BYTES = FP_BYTES*2;
pub const P2_SERIALIZE_BYTES = FP_BYTES*4;
"""

p1_zig = """
pub const P1_Affine = struct {
    point: c.p1_affine = c.p1_affine{},

    pub fn from(in: anytype) !P1_Affine {
        switch (@TypeOf(in)) {
            *const P1, *P1 => return in.to_affine(),
            P1 => @compileError("expected type '*const blst.P1', found 'blst.P1'"),
            else => |T| {
                switch (@typeInfo(T)) {
                    .pointer => {
                        const s: []const u8 = in;
                        _ = s;
                    },
                    else => @compileError("expected type '[]const u8', found '" ++ @typeName(T) ++ "'"),
                }
                var ret: P1_Affine = undefined;
                const err =
ret.deserialize(in); return if (err == .SUCCESS) ret else err.as_error(); }, } unreachable; } pub fn deserialize(self: *P1_Affine, in: []const u8) ERROR { if (in.len == 0) { return .BAD_ENCODING; } const expected = @as(usize, if (in[0]&0x80 != 0) P1_COMPRESS_BYTES else P1_SERIALIZE_BYTES); if (in.len != expected) { return .BAD_ENCODING; } const err = c.p1_deserialize(&self.point, &in[0]); return @as(ERROR, @enumFromInt(err)); } pub fn serialize(self: *const P1_Affine) [P1_SERIALIZE_BYTES]u8 { var ret: [P1_SERIALIZE_BYTES]u8 = undefined; c.p1_affine_serialize(&ret[0], &self.point); return ret; } pub fn compress(self: *const P1_Affine) [P1_COMPRESS_BYTES]u8 { var ret: [P1_COMPRESS_BYTES]u8 = undefined; c.p1_affine_compress(&ret[0], &self.point); return ret; } pub fn dup(self: *const P1_Affine) P1_Affine { return self.*; } pub fn on_curve(self: *const P1_Affine) bool { return c.p1_affine_on_curve(&self.point); } pub fn in_group(self: *const P1_Affine) bool { return c.p1_affine_in_g1(&self.point); } pub fn is_inf(self: *const P1_Affine) bool { return c.p1_affine_is_inf(&self.point); } pub fn is_equal(self: *const P1_Affine, p: *const P1_Affine) bool { return c.p1_affine_is_equal(&self.point, &p.point); } pub fn core_verify(self: *const P1_Affine, pk: *const P2_Affine, hash_or_encode: bool, msg: []const u8, DST: []const u8, aug: ?[]const u8) ERROR { const opt = aug orelse &[_]u8{}; const err = c.core_verify_pk_in_g2(&pk.point, &self.point, hash_or_encode, @ptrCast(msg), msg.len, @ptrCast(DST), DST.len, @ptrCast(opt), opt.len); return @as(ERROR, @enumFromInt(err)); } pub fn generator() P1_Affine { return P1_Affine{ .point = c.p1_affine_generator().*, }; } pub fn to_jacobian(self: *const P1_Affine) P1 { var ret: P1 = undefined; c.p1_from_affine(&ret.point, &self.point); return ret; } }; pub const P1 = struct { point: c.p1 = c.p1{}, pub fn from(in: anytype) !P1 { switch (@TypeOf(in)) { *const SecretKey, *SecretKey => return P1.public_key(in), SecretKey => @compileError("expected type '*const blst.SecretKey', found 'blst.SecretKey'"), *const P1_Affine, *P1_Affine => return in.to_jacobian(), P1_Affine => @compileError("expected type '*const blst.P1_Affine', found 'blst.P1_Affine'"), else => |T| { switch (@typeInfo(T)) { .pointer => { const s: []const u8 = in; _ = s; }, else => @compileError("expected type '[]const u8', found '" ++ @typeName(T) ++ "'"), } var ret: P1 = undefined; const err = ret.deserialize(in); return if (err == .SUCCESS) ret else err.as_error(); }, } unreachable; } pub fn deserialize(self: *P1, in: []const u8) ERROR { if (in.len == 0) { return .BAD_ENCODING; } const expected = @as(usize, if (in[0]&0x80 != 0) P1_COMPRESS_BYTES else P1_SERIALIZE_BYTES); if (in.len != expected) { return .BAD_ENCODING; } const err = c.p1_deserialize(@ptrCast(&self.point), &in[0]); if (err == c.SUCCESS) { c.p1_from_affine(&self.point, @ptrCast(&self.point)); } return @as(ERROR, @enumFromInt(err)); } pub fn serialize(self: *const P1) [P1_SERIALIZE_BYTES]u8 { var ret: [P1_SERIALIZE_BYTES]u8 = undefined; c.p1_serialize(&ret[0], &self.point); return ret; } pub fn compress(self: *const P1) [P1_COMPRESS_BYTES]u8 { var ret: [P1_COMPRESS_BYTES]u8 = undefined; c.p1_compress(&ret[0], &self.point); return ret; } pub fn public_key(sk: *const SecretKey) P1 { var ret: P1 = undefined; c.sk_to_pk_in_g1(&ret.point, &sk.key); return ret; } pub fn dup(self: *const P1) P1 { return self.*; } pub fn on_curve(self: *const P1) bool { return c.p1_on_curve(&self.point); } pub fn in_group(self: *const P1) bool { return 
c.p1_in_g1(&self.point);
    }

    pub fn is_inf(self: *const P1) bool {
        return c.p1_is_inf(&self.point);
    }

    pub fn is_equal(self: *const P1, p: *const P1) bool {
        return c.p1_is_equal(&self.point, &p.point);
    }

    pub fn aggregate(self: *P1, p: *const P1_Affine) !void {
        if (!c.p1_affine_in_g1(&p.point)) {
            return Error.POINT_NOT_IN_GROUP;
        }
        c.p1_add_or_double_affine(&self.point, &self.point, &p.point);
    }

    pub fn hash_to(msg: []const u8, DST: []const u8, aug: ?[]const u8) P1 {
        const opt = aug orelse &[_]u8{};
        var ret: P1 = undefined;
        c.hash_to_g1(&ret.point, @ptrCast(msg), msg.len, @ptrCast(DST), DST.len, @ptrCast(opt), opt.len);
        return ret;
    }

    pub fn encode_to(msg: []const u8, DST: []const u8, aug: ?[]const u8) P1 {
        const opt = aug orelse &[_]u8{};
        var ret: P1 = undefined;
        c.encode_to_g1(&ret.point, @ptrCast(msg), msg.len, @ptrCast(DST), DST.len, @ptrCast(opt), opt.len);
        return ret;
    }

    pub fn sign_with(self: *const P1, sk: *const SecretKey) *P1 {
        c.sign_pk_in_g2(@constCast(&self.point), &self.point, &sk.key);
        return @constCast(self);
    }

    pub fn to_affine(self: *const P1) P1_Affine {
        var ret: P1_Affine = undefined;
        c.p1_to_affine(&ret.point, &self.point);
        return ret;
    }

    pub fn generator() P1 {
        return P1{
            .point = c.p1_generator().*,
        };
    }
};
"""

here = re.split(r'/(?=[^/]*$)', sys.argv[0])
if len(here) > 1:
    os.chdir(here[0])

def xchg_1vs2(matchobj):
    if matchobj.group(2) == '1':
        return matchobj.group(1) + '2'
    else:
        return matchobj.group(1) + '1'

print("generating blst.zig...") or sys.stdout.flush()
with open("blst.zig", "w") as fd:
    print("//!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!", file=fd)
    print("// DO NOT EDIT THIS FILE!!!", file=fd)
    print("// The file is auto-generated by " + here[-1], file=fd)
    print("//!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!", file=fd)
    print(top_zig, file=fd)
    print(p1_zig, file=fd)
    # derive the P2/P2_Affine counterparts by swapping 1<->2 after p/P/g/G
    print(re.sub(r'((?<=[\W_])[pPgG])([12])', xchg_1vs2, p1_zig), file=fd)

def newer(*files):
    assert len(files) > 1
    rh = files[-1]
    if not os.path.exists(rh):
        return True
    for lh in files[:-1]:
        if os.stat(lh).st_mtime > os.stat(rh).st_mtime:
            return True
    return False

if newer("../blst.h", "c.zig"):
    print("generating c.zig...") or sys.stdout.flush()
    ret = subprocess.run(["zig", "translate-c", "../blst.h", "-D__BLST_ZIG__"],
                         capture_output=True, text=True)
    with open("c.zig", "w") as fd:
        pubs = {}
        print("// automatically generated with 'zig translate-c'", file=fd)
        for line in ret.stdout.splitlines():
            if "no file" in line:
                break
            elif not line.startswith("pub const _"):
                m = re.match(r'^pub ([\w\s]+?
((?:blst|BLST)_(\w+)).*)', line) if m: print(m.group(1), file=fd) pubs[m.group(3)] = m.group(2) else: print(line, file=fd) print("// reexport symbols without blst_ prefix", file=fd) for key, val in pubs.items(): print("pub const {} = {};".format(key, val), file=fd) del pubs version = "0.3.16" os.chdir("../..") try: with open("build.zig.zon", "r") as fd: m = re.search(r'\.version = "([^"]+)"', fd.read()) if m and m.group(1) == version: sys.exit(0) except OSError as e: if e.errno != 2: # not "no such file or directory" raise e print("generating build.zig.zon...") or sys.stdout.flush() zon = """.{ .name = .blst, .version = "%s", .minimum_zig_version = "0.14.0", .paths = .{ "build.zig", "build.zig.zon", "bindings/zig", "src", "build", }, """ % version with open("build.zig.zon", "w") as fd: print(zon, end='', file=fd) print("}", file=fd) ret = subprocess.run(["zig", "build"], capture_output=True, text=True) match = re.search(r'suggested value:\s*(\w+)', ret.stderr) if match: with open("build.zig.zon", "w") as fd: print(zon, end='', file=fd) print(" .fingerprint = {},".format(match.group(1)), file=fd) print("}", file=fd) else: print("don't know what to do") ================================================ FILE: bindings/zig/tests.zig ================================================ const std = @import("std"); const blst = @import("blst"); test "sign/verify" { const password = [_]u8{'*'} ** 32; var SK = blst.SecretKey{}; SK.keygen(&password, null); defer SK.deinit(); const msg = "assertion"; const DST = "MY-DST"; // on the "sender" side... const pk_for_wire = (try blst.P1.from(&SK)).serialize(); const sig_for_wire = blst.P2.hash_to(msg, DST, &pk_for_wire).sign_with(&SK).serialize(); // ... and now on the "receiver" side... const sig = try blst.P2_Affine.from(&sig_for_wire); const pk = try blst.P1_Affine.from(&pk_for_wire); const ret = sig.core_verify(&pk, true, msg, DST, &pk_for_wire); try std.testing.expectEqual(ret, .SUCCESS); } test "uniq" { const msgs = &[_][]const u8 { "three", "two", "one", "three", }; var ctx = try blst.Uniq.init(msgs.len, std.testing.allocator); defer ctx.deinit(); for (msgs, 1..) |msg, next| { try std.testing.expectEqual(ctx.is_uniq(msg), next < msgs.len); } } fn box(allocator: std.mem.Allocator, src: []const u8) ![]u8 { const ret = try allocator.alloc(u8, src.len); @memcpy(ret, src); return ret; } test "aggregateverify" { const mem = std.testing.allocator; const N = 3; const password = [_]u8{'*'} ** 32; var SK = blst.SecretKey{}; defer SK.deinit(); // emulate N "senders"... const DST = "MY-DST"; var pks: [N][]const u8 = undefined; var sigs: [N][]const u8 = undefined; var msgs: [N][]const u8 = undefined; for (0..N) |i| { msgs[i] = try std.fmt.allocPrint(mem, "assertion{}", .{i}); SK.keygen(&password, msgs[i]); pks[i] = try box(mem, &(try blst.P1.from(&SK)).serialize()); sigs[i] = try box(mem, &blst.P2.hash_to(msgs[i], DST, null).sign_with(&SK).serialize()); } // ... basic scheme on the "receiver" side. var uniq = try blst.Uniq.init(msgs.len, mem); defer uniq.deinit(); // The basic scheme requires messages to be checked for uniqueness. 
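    // (the IETF BLS signature draft's "basic" scheme is only secure when
    // all signed messages are pairwise distinct, hence this explicit
    // uniqueness pass before aggregation)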
    for (0..N) |i| {
        try std.testing.expectEqual(uniq.is_uniq(msgs[i]), true);
    }

    var aggregated = try blst.P2.from(sigs[0]);
    try std.testing.expectEqual(aggregated.in_group(), true);
    for (1..N) |i| {
        try aggregated.aggregate(&try blst.P2_Affine.from(sigs[i]));
    }

    var ctx = try blst.Pairing.init(true, DST, mem);
    defer ctx.deinit();

    // The .aggregate() method below doesn't vet the public keys, the
    // rationale being that the application would cache the results of
    // the group checks. Hence they need to be vetted separately.
    var pk = try blst.P1_Affine.from(pks[0]);
    try std.testing.expectEqual(pk.in_group(), true);
    try std.testing.expectEqual(ctx.aggregate(&pk, &aggregated.to_affine(), msgs[0], null), .SUCCESS);
    for (1..N) |i| {
        pk = try blst.P1_Affine.from(pks[i]);
        try std.testing.expectEqual(pk.in_group(), true);
        try std.testing.expectEqual(ctx.aggregate(&pk, null, msgs[i], null), .SUCCESS);
    }
    ctx.commit();
    try std.testing.expectEqual(ctx.finalverify(null), true);

    for (0..N) |i| {
        mem.free(pks[i]);
        mem.free(sigs[i]);
        mem.free(msgs[i]);
    }
}


================================================
FILE: build/assembly.S
================================================
#if defined(__x86_64) || defined(__x86_64__)
# if defined(__ELF__)
#  if defined(__BLST_PORTABLE__)
#   include "elf/sha256-portable-x86_64.s"
#   define blst_sha256_block_data_order blst_sha256_block_ssse3
#  endif
#  include "elf/sha256-x86_64.s"
#  if defined(__ADX__) || defined(__BLST_PORTABLE__)
#   include "elf/ctx_inverse_mod_384-x86_64.s"
#  endif
#  if !defined(__ADX__) || defined(__BLST_PORTABLE__)
#   include "elf/ctq_inverse_mod_384-x86_64.s"
#  endif
#  include "elf/add_mod_384-x86_64.s"
#  include "elf/add_mod_384x384-x86_64.s"
#  if defined(__ADX__) || defined(__BLST_PORTABLE__)
#   include "elf/mulx_mont_384-x86_64.s"
#   include "elf/mulx_mont_256-x86_64.s"
#  endif
#  if !defined(__ADX__) || defined(__BLST_PORTABLE__)
#   include "elf/mulq_mont_384-x86_64.s"
#   include "elf/mulq_mont_256-x86_64.s"
#  endif
#  include "elf/add_mod_256-x86_64.s"
#  include "elf/ct_inverse_mod_256-x86_64.s"
#  include "elf/div3w-x86_64.s"
#  include "elf/ct_is_square_mod_384-x86_64.s"
# elif defined(_WIN64) || defined(__CYGWIN__)
#  if defined(__BLST_PORTABLE__)
#   include "coff/sha256-portable-x86_64.s"
#   define blst_sha256_block_data_order blst_sha256_block_ssse3
#   define LSEH_begin_blst_sha256_block_data_order LSEH_begin_blst_sha256_block_ssse3
#   define LSEH_body_blst_sha256_block_data_order LSEH_body_blst_sha256_block_ssse3
#   define LSEH_info_blst_sha256_block_data_order_prologue LSEH_info_blst_sha256_block_ssse3_prologue
#   define LSEH_body_blst_sha256_block_data_order LSEH_body_blst_sha256_block_ssse3
#   define LSEH_epilogue_blst_sha256_block_data_order LSEH_epilogue_blst_sha256_block_ssse3
#   define LSEH_info_blst_sha256_block_data_order_body LSEH_info_blst_sha256_block_ssse3_body
#   define LSEH_epilogue_blst_sha256_block_data_order LSEH_epilogue_blst_sha256_block_ssse3
#   define LSEH_end_blst_sha256_block_data_order LSEH_end_blst_sha256_block_ssse3
#   define LSEH_info_blst_sha256_block_data_order_epilogue LSEH_info_blst_sha256_block_ssse3_epilogue
#  endif
#  include "coff/sha256-x86_64.s"
#  if defined(__ADX__) || defined(__BLST_PORTABLE__)
#   include "coff/ctx_inverse_mod_384-x86_64.s"
#  endif
#  if !defined(__ADX__) || defined(__BLST_PORTABLE__)
#   include "coff/ctq_inverse_mod_384-x86_64.s"
#  endif
#  include "coff/add_mod_384-x86_64.s"
#  include "coff/add_mod_384x384-x86_64.s"
#  if defined(__ADX__) || defined(__BLST_PORTABLE__)
#   include "coff/mulx_mont_384-x86_64.s"
#   include "coff/mulx_mont_256-x86_64.s"
#  endif
#  if !defined(__ADX__) || defined(__BLST_PORTABLE__)
#   include "coff/mulq_mont_384-x86_64.s"
#   include "coff/mulq_mont_256-x86_64.s"
#  endif
#  include "coff/add_mod_256-x86_64.s"
#  include "coff/ct_inverse_mod_256-x86_64.s"
#  include "coff/div3w-x86_64.s"
#  include "coff/ct_is_square_mod_384-x86_64.s"
# elif defined(__APPLE__)
#  include "mach-o/sha256-x86_64.s"
#  if defined(__ADX__) || defined(__BLST_PORTABLE__)
#   include "mach-o/ctx_inverse_mod_384-x86_64.s"
#  endif
#  if !defined(__ADX__) || defined(__BLST_PORTABLE__)
#   include "mach-o/ctq_inverse_mod_384-x86_64.s"
#  endif
#  include "mach-o/add_mod_384-x86_64.s"
#  include "mach-o/add_mod_384x384-x86_64.s"
#  if defined(__ADX__) || defined(__BLST_PORTABLE__)
#   include "mach-o/mulx_mont_384-x86_64.s"
#   include "mach-o/mulx_mont_256-x86_64.s"
#  endif
#  if !defined(__ADX__) || defined(__BLST_PORTABLE__)
#   include "mach-o/mulq_mont_384-x86_64.s"
#   include "mach-o/mulq_mont_256-x86_64.s"
#  endif
#  include "mach-o/add_mod_256-x86_64.s"
#  include "mach-o/ct_inverse_mod_256-x86_64.s"
#  include "mach-o/div3w-x86_64.s"
#  include "mach-o/ct_is_square_mod_384-x86_64.s"
# endif
#elif defined(__aarch64__)
# if defined(__CHERI_PURE_CAPABILITY__)
#  include "cheri/sha256-armv8.S"
#  include "cheri/ct_inverse_mod_384-armv8.S"
#  include "cheri/add_mod_384-armv8.S"
#  define __add_mod_384 __add_mont_384
#  define __sub_mod_384 __sub_mont_384
#  include "cheri/mul_mont_384-armv8.S"
#  include "cheri/mul_mont_256-armv8.S"
#  include "cheri/add_mod_256-armv8.S"
#  include "cheri/ct_inverse_mod_256-armv8.S"
#  include "cheri/div3w-armv8.S"
#  include "cheri/ct_is_square_mod_384-armv8.S"
# elif defined(__ELF__)
#  include "elf/sha256-armv8.S"
#  include "elf/ct_inverse_mod_384-armv8.S"
#  include "elf/add_mod_384-armv8.S"
#  define __add_mod_384 __add_mont_384
#  define __sub_mod_384 __sub_mont_384
#  include "elf/mul_mont_384-armv8.S"
#  include "elf/mul_mont_256-armv8.S"
#  include "elf/add_mod_256-armv8.S"
#  include "elf/ct_inverse_mod_256-armv8.S"
#  include "elf/div3w-armv8.S"
#  include "elf/ct_is_square_mod_384-armv8.S"
# elif defined(_WIN64)
#  include "coff/sha256-armv8.S"
#  include "coff/ct_inverse_mod_384-armv8.S"
#  include "coff/add_mod_384-armv8.S"
#  define __add_mod_384 __add_mont_384
#  define __sub_mod_384 __sub_mont_384
#  include "coff/mul_mont_384-armv8.S"
#  include "coff/mul_mont_256-armv8.S"
#  include "coff/add_mod_256-armv8.S"
#  include "coff/ct_inverse_mod_256-armv8.S"
#  include "coff/div3w-armv8.S"
#  include "coff/ct_is_square_mod_384-armv8.S"
# elif defined(__APPLE__)
#  include "mach-o/sha256-armv8.S"
#  include "mach-o/ct_inverse_mod_384-armv8.S"
#  include "mach-o/add_mod_384-armv8.S"
#  define __add_mod_384 __add_mont_384
#  define __sub_mod_384 __sub_mont_384
#  include "mach-o/mul_mont_384-armv8.S"
#  include "mach-o/mul_mont_256-armv8.S"
#  include "mach-o/add_mod_256-armv8.S"
#  include "mach-o/ct_inverse_mod_256-armv8.S"
#  include "mach-o/div3w-armv8.S"
#  include "mach-o/ct_is_square_mod_384-armv8.S"
# endif
#elif defined(__BLST_NO_ASM__) || \
      (defined(__SIZEOF_POINTER__) && __SIZEOF_POINTER__==4)
/* inaccurate way to detect a 32-bit processor, but it's close enough */
#else
# error "unsupported platform"
#endif


================================================
FILE: build/bindings_trim.pl
================================================
#!/usr/bin/env perl

# read whole file
while(<>) {
    push @file, $_;
}

# traverse and remove auto-generated PartialEq for chosen types
for (my $i = 0; $i <= $#file; $i++) {
    if (@file[$i] =~ m/pub\s+(?:struct|enum)\s+(\w+)/) {
        push @structs, $1;
    }
    if (@file[$i] =~ m/struct\s+blst_p[12]/) {
        @file[$i-1] =~ s/,\s*PartialEq//;
    } elsif (@file[$i] =~ m/struct\s+blst_fp12/) {
        @file[$i-1] =~ s/,\s*(?:Default|PartialEq)//g;
    } elsif (@file[$i] =~ m/struct\s+(blst_pairing|blst_uniq)/) {
        @file[$i-1] =~ s/,\s*(?:Copy|Clone|Eq|PartialEq)//g;
    } elsif (@file[$i] =~ m/struct\s+blst_scalar/) {
        @file[$i-1] =~ s/,\s*Copy//;
        @file[$i-1] =~ s/\)/, Zeroize\)/;
        splice @file, $i, 0, "#[zeroize(drop)]\n";
        $i++;
    } else {
        @file[$i] =~ s/::std::/::core::/g;
    }
}

print @file;

print << '___';
#[test]
fn bindgen_test_normal_types() {
    // from "Rust for Rustaceans" by Jon Gjengset
    fn is_normal<T: Sized + Send + Sync + Unpin>() {}
___
for (@structs) {
    print "    is_normal::<$_>();\n";
}
print "}\n";
close STDOUT;


================================================
FILE: build/cheri/add_mod_256-armv8.S
================================================
#if defined(__ARM_FEATURE_PAC_DEFAULT) && __ARM_FEATURE_PAC_DEFAULT==2
# define PACI_HINT 27
# define AUTI_HINT 31
#else
# define PACI_HINT 25
# define AUTI_HINT 29
#endif

.text

.globl add_mod_256
.hidden add_mod_256
.type add_mod_256,%function
.align 5
add_mod_256:
        hint    #34
        ldp     x8,x9,[c1]
        ldp     x12,x13,[c2]
        ldp     x10,x11,[c1,#16]
        adds    x8,x8,x12
        ldp     x14,x15,[c2,#16]
        adcs    x9,x9,x13
        ldp     x4,x5,[c3]
        adcs    x10,x10,x14
        ldp     x6,x7,[c3,#16]
        adcs    x11,x11,x15
        adc     x3,xzr,xzr

        subs    x16,x8,x4
        sbcs    x17,x9,x5
        sbcs    x1,x10,x6
        sbcs    x2,x11,x7
        sbcs    xzr,x3,xzr

        csel    x8,x8,x16,lo
        csel    x9,x9,x17,lo
        csel    x10,x10,x1,lo
        stp     x8,x9,[c0]
        csel    x11,x11,x2,lo
        stp     x10,x11,[c0,#16]

        ret
.size add_mod_256,.-add_mod_256

.globl mul_by_3_mod_256
.hidden mul_by_3_mod_256
.type mul_by_3_mod_256,%function
.align 5
mul_by_3_mod_256:
        hint    #34
        ldp     x12,x13,[c1]
        ldp     x14,x15,[c1,#16]
        adds    x8,x12,x12
        ldp     x4,x5,[c2]
        adcs    x9,x13,x13
        ldp     x6,x7,[c2,#16]
        adcs    x10,x14,x14
        adcs    x11,x15,x15
        adc     x3,xzr,xzr

        subs    x16,x8,x4
        sbcs    x17,x9,x5
        sbcs    x1,x10,x6
        sbcs    x2,x11,x7
        sbcs    xzr,x3,xzr

        csel    x8,x8,x16,lo
        csel    x9,x9,x17,lo
        csel    x10,x10,x1,lo
        csel    x11,x11,x2,lo

        adds    x8,x8,x12
        adcs    x9,x9,x13
        adcs    x10,x10,x14
        adcs    x11,x11,x15
        adc     x3,xzr,xzr

        subs    x16,x8,x4
        sbcs    x17,x9,x5
        sbcs    x1,x10,x6
        sbcs    x2,x11,x7
        sbcs    xzr,x3,xzr

        csel    x8,x8,x16,lo
        csel    x9,x9,x17,lo
        csel    x10,x10,x1,lo
        stp     x8,x9,[c0]
        csel    x11,x11,x2,lo
        stp     x10,x11,[c0,#16]

        ret
.size mul_by_3_mod_256,.-mul_by_3_mod_256

.globl lshift_mod_256
.hidden lshift_mod_256
.type lshift_mod_256,%function
.align 5
lshift_mod_256:
        hint    #34
        ldp     x8,x9,[c1]
        ldp     x10,x11,[c1,#16]
        ldp     x4,x5,[c3]
        ldp     x6,x7,[c3,#16]

.Loop_lshift_mod_256:
        adds    x8,x8,x8
        sub     x2,x2,#1
        adcs    x9,x9,x9
        adcs    x10,x10,x10
        adcs    x11,x11,x11
        adc     x3,xzr,xzr

        subs    x12,x8,x4
        sbcs    x13,x9,x5
        sbcs    x14,x10,x6
        sbcs    x15,x11,x7
        sbcs    xzr,x3,xzr

        csel    x8,x8,x12,lo
        csel    x9,x9,x13,lo
        csel    x10,x10,x14,lo
        csel    x11,x11,x15,lo

        cbnz    x2,.Loop_lshift_mod_256

        stp     x8,x9,[c0]
        stp     x10,x11,[c0,#16]

        ret
.size lshift_mod_256,.-lshift_mod_256

.globl rshift_mod_256
.hidden rshift_mod_256
.type rshift_mod_256,%function
.align 5
rshift_mod_256:
        hint    #34
        ldp     x8,x9,[c1]
        ldp     x10,x11,[c1,#16]
        ldp     x4,x5,[c3]
        ldp     x6,x7,[c3,#16]

.Loop_rshift:
        adds    x12,x8,x4
        sub     x2,x2,#1
        adcs    x13,x9,x5
        adcs    x14,x10,x6
        adcs    x15,x11,x7
        adc     x3,xzr,xzr
        tst     x8,#1

        csel    x12,x12,x8,ne
        csel    x13,x13,x9,ne
        csel    x14,x14,x10,ne
        csel    x15,x15,x11,ne
        csel    x3,x3,xzr,ne

        extr    x8,x13,x12,#1
        extr    x9,x14,x13,#1
        extr    x10,x15,x14,#1
        extr    x11,x3,x15,#1

        cbnz    x2,.Loop_rshift

        stp     x8,x9,[c0]
        stp     x10,x11,[c0,#16]

        ret
.size rshift_mod_256,.-rshift_mod_256

.globl cneg_mod_256
.hidden cneg_mod_256
.type cneg_mod_256,%function
.align 5
cneg_mod_256:
        ldp     x8,x9,[c1]
        ldp
x4,x5,[c3] ldp x10,x11,[c1,#16] subs x12,x4,x8 ldp x6,x7,[c3,#16] orr x4,x8,x9 sbcs x13,x5,x9 orr x5,x10,x11 sbcs x14,x6,x10 orr x3,x4,x5 sbc x15,x7,x11 cmp x3,#0 csetm x3,ne ands x2,x2,x3 csel x8,x8,x12,eq csel x9,x9,x13,eq csel x10,x10,x14,eq stp x8,x9,[c0] csel x11,x11,x15,eq stp x10,x11,[c0,#16] ret .size cneg_mod_256,.-cneg_mod_256 .globl sub_mod_256 .hidden sub_mod_256 .type sub_mod_256,%function .align 5 sub_mod_256: ldp x8,x9,[c1] ldp x12,x13,[c2] ldp x10,x11,[c1,#16] subs x8,x8,x12 ldp x14,x15,[c2,#16] sbcs x9,x9,x13 ldp x4,x5,[c3] sbcs x10,x10,x14 ldp x6,x7,[c3,#16] sbcs x11,x11,x15 sbc x3,xzr,xzr and x4,x4,x3 and x5,x5,x3 adds x8,x8,x4 and x6,x6,x3 adcs x9,x9,x5 and x7,x7,x3 adcs x10,x10,x6 stp x8,x9,[c0] adc x11,x11,x7 stp x10,x11,[c0,#16] ret .size sub_mod_256,.-sub_mod_256 .globl check_mod_256 .hidden check_mod_256 .type check_mod_256,%function .align 5 check_mod_256: ldp x8,x9,[c0] ldp x10,x11,[c0,#16] ldp x4,x5,[c1] ldp x6,x7,[c1,#16] #ifdef __AARCH64EB__ rev x8,x8 rev x9,x9 rev x10,x10 rev x11,x11 #endif subs xzr,x8,x4 sbcs xzr,x9,x5 orr x8,x8,x9 sbcs xzr,x10,x6 orr x8,x8,x10 sbcs xzr,x11,x7 orr x8,x8,x11 sbc x1,xzr,xzr cmp x8,#0 mov x0,#1 csel x0,x0,xzr,ne and x0,x0,x1 ret .size check_mod_256,.-check_mod_256 .globl add_n_check_mod_256 .hidden add_n_check_mod_256 .type add_n_check_mod_256,%function .align 5 add_n_check_mod_256: ldp x8,x9,[c1] ldp x12,x13,[c2] ldp x10,x11,[c1,#16] ldp x14,x15,[c2,#16] #ifdef __AARCH64EB__ rev x8,x8 rev x12,x12 rev x9,x9 rev x13,x13 rev x10,x10 rev x14,x14 rev x11,x11 rev x15,x15 #endif adds x8,x8,x12 ldp x4,x5,[c3] adcs x9,x9,x13 ldp x6,x7,[c3,#16] adcs x10,x10,x14 adcs x11,x11,x15 adc x3,xzr,xzr subs x16,x8,x4 sbcs x17,x9,x5 sbcs x1,x10,x6 sbcs x2,x11,x7 sbcs xzr,x3,xzr csel x8,x8,x16,lo csel x9,x9,x17,lo csel x10,x10,x1,lo csel x11,x11,x2,lo orr x16, x8, x9 orr x17, x10, x11 orr x16, x16, x17 #ifdef __AARCH64EB__ rev x8,x8 rev x9,x9 rev x10,x10 rev x11,x11 #endif stp x8,x9,[c0] stp x10,x11,[c0,#16] mov x17, #1 cmp x16, #0 csel x0, x17, xzr, ne ret .size add_n_check_mod_256,.-add_n_check_mod_256 .globl sub_n_check_mod_256 .hidden sub_n_check_mod_256 .type sub_n_check_mod_256,%function .align 5 sub_n_check_mod_256: ldp x8,x9,[c1] ldp x12,x13,[c2] ldp x10,x11,[c1,#16] ldp x14,x15,[c2,#16] #ifdef __AARCH64EB__ rev x8,x8 rev x12,x12 rev x9,x9 rev x13,x13 rev x10,x10 rev x14,x14 rev x11,x11 rev x15,x15 #endif subs x8,x8,x12 sbcs x9,x9,x13 ldp x4,x5,[c3] sbcs x10,x10,x14 ldp x6,x7,[c3,#16] sbcs x11,x11,x15 sbc x3,xzr,xzr and x4,x4,x3 and x5,x5,x3 adds x8,x8,x4 and x6,x6,x3 adcs x9,x9,x5 and x7,x7,x3 adcs x10,x10,x6 adc x11,x11,x7 orr x16, x8, x9 orr x17, x10, x11 orr x16, x16, x17 #ifdef __AARCH64EB__ rev x8,x8 rev x9,x9 rev x10,x10 rev x11,x11 #endif stp x8,x9,[c0] stp x10,x11,[c0,#16] mov x17, #1 cmp x16, #0 csel x0, x17, xzr, ne ret .size sub_n_check_mod_256,.-sub_n_check_mod_256 #if defined(__ARM_FEATURE_BTI_DEFAULT) || defined(__ARM_FEATURE_PAC_DEFAULT) .section .note.GNU-stack,"",@progbits .section .note.gnu.property,"a",@note .long 4,2f-1f,5 .byte 0x47,0x4E,0x55,0 1: .long 0xc0000000,4,3 .align 3 2: #endif ================================================ FILE: build/cheri/add_mod_384-armv8.S ================================================ #if defined(__ARM_FEATURE_PAC_DEFAULT) && __ARM_FEATURE_PAC_DEFAULT==2 # define PACI_HINT 27 # define AUTI_HINT 31 #else # define PACI_HINT 25 # define AUTI_HINT 29 #endif .text .globl add_mod_384 .hidden add_mod_384 .type add_mod_384,%function .align 5 add_mod_384: hint #PACI_HINT stp 
c29,c30,[csp,#-6*__SIZEOF_POINTER__]! add c29,csp,#0 stp c19,c20,[csp,#2*__SIZEOF_POINTER__] stp c21,c22,[csp,#4*__SIZEOF_POINTER__] ldp x4,x5,[c3] ldp x6,x7,[c3,#16] ldp x8,x9,[c3,#32] bl __add_mod_384 ldr c30,[csp,#__SIZEOF_POINTER__] stp x10,x11,[c0] stp x12,x13,[c0,#16] stp x14,x15,[c0,#32] ldp c19,c20,[c29,#2*__SIZEOF_POINTER__] ldp c21,c22,[c29,#4*__SIZEOF_POINTER__] ldr c29,[csp],#6*__SIZEOF_POINTER__ hint #AUTI_HINT ret .size add_mod_384,.-add_mod_384 .type __add_mod_384,%function .align 5 __add_mod_384: ldp x10,x11,[c1] ldp x16,x17,[c2] ldp x12,x13,[c1,#16] ldp x19,x20,[c2,#16] ldp x14,x15,[c1,#32] ldp x21,x22,[c2,#32] __add_mod_384_ab_are_loaded: adds x10,x10,x16 adcs x11,x11,x17 adcs x12,x12,x19 adcs x13,x13,x20 adcs x14,x14,x21 adcs x15,x15,x22 adc x3,xzr,xzr subs x16,x10,x4 sbcs x17,x11,x5 sbcs x19,x12,x6 sbcs x20,x13,x7 sbcs x21,x14,x8 sbcs x22,x15,x9 sbcs xzr,x3,xzr csel x10,x10,x16,lo csel x11,x11,x17,lo csel x12,x12,x19,lo csel x13,x13,x20,lo csel x14,x14,x21,lo csel x15,x15,x22,lo ret .size __add_mod_384,.-__add_mod_384 .globl add_mod_384x .hidden add_mod_384x .type add_mod_384x,%function .align 5 add_mod_384x: hint #PACI_HINT stp c29,c30,[csp,#-6*__SIZEOF_POINTER__]! add c29,csp,#0 stp c19,c20,[csp,#2*__SIZEOF_POINTER__] stp c21,c22,[csp,#4*__SIZEOF_POINTER__] ldp x4,x5,[c3] ldp x6,x7,[c3,#16] ldp x8,x9,[c3,#32] bl __add_mod_384 stp x10,x11,[c0] add c1,c1,#48 stp x12,x13,[c0,#16] add c2,c2,#48 stp x14,x15,[c0,#32] bl __add_mod_384 ldr c30,[csp,#__SIZEOF_POINTER__] stp x10,x11,[c0,#48] stp x12,x13,[c0,#64] stp x14,x15,[c0,#80] ldp c19,c20,[c29,#2*__SIZEOF_POINTER__] ldp c21,c22,[c29,#4*__SIZEOF_POINTER__] ldr c29,[csp],#6*__SIZEOF_POINTER__ hint #AUTI_HINT ret .size add_mod_384x,.-add_mod_384x .globl rshift_mod_384 .hidden rshift_mod_384 .type rshift_mod_384,%function .align 5 rshift_mod_384: hint #PACI_HINT stp c29,c30,[csp,#-6*__SIZEOF_POINTER__]! add c29,csp,#0 stp c19,c20,[csp,#2*__SIZEOF_POINTER__] stp c21,c22,[csp,#4*__SIZEOF_POINTER__] ldp x10,x11,[c1] ldp x12,x13,[c1,#16] ldp x14,x15,[c1,#32] ldp x4,x5,[c3] ldp x6,x7,[c3,#16] ldp x8,x9,[c3,#32] .Loop_rshift_mod_384: sub x2,x2,#1 bl __rshift_mod_384 cbnz x2,.Loop_rshift_mod_384 ldr c30,[csp,#__SIZEOF_POINTER__] stp x10,x11,[c0] stp x12,x13,[c0,#16] stp x14,x15,[c0,#32] ldp c19,c20,[c29,#2*__SIZEOF_POINTER__] ldp c21,c22,[c29,#4*__SIZEOF_POINTER__] ldr c29,[csp],#6*__SIZEOF_POINTER__ hint #AUTI_HINT ret .size rshift_mod_384,.-rshift_mod_384 .type __rshift_mod_384,%function .align 5 __rshift_mod_384: sbfx x22,x10,#0,#1 and x16,x22,x4 and x17,x22,x5 adds x10,x10,x16 and x19,x22,x6 adcs x11,x11,x17 and x20,x22,x7 adcs x12,x12,x19 and x21,x22,x8 adcs x13,x13,x20 and x22,x22,x9 adcs x14,x14,x21 extr x10,x11,x10,#1 // a[0:5] >>= 1 adcs x15,x15,x22 extr x11,x12,x11,#1 adc x22,xzr,xzr extr x12,x13,x12,#1 extr x13,x14,x13,#1 extr x14,x15,x14,#1 extr x15,x22,x15,#1 ret .size __rshift_mod_384,.-__rshift_mod_384 .globl div_by_2_mod_384 .hidden div_by_2_mod_384 .type div_by_2_mod_384,%function .align 5 div_by_2_mod_384: hint #PACI_HINT stp c29,c30,[csp,#-6*__SIZEOF_POINTER__]! 
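// Annotation added for orientation; not part of the generated code.
// The recurring prologue/epilogue pattern above and below: c0-c30/csp
// are the Morello capability aliases of x0-x30/sp, so stp/ldp on
// c-registers spill whole capabilities, c19-c22 being the callee-saved
// temporaries these routines use.  PACI_HINT/AUTI_HINT expand to
// paciasp/autiasp (hints #25/#29), or to the B-key forms
// pacibsp/autibsp (hints #27/#31) when __ARM_FEATURE_PAC_DEFAULT==2,
// signing and later authenticating the return address; routines that
// don't spill c30 typically start with hint #34 (bti c) instead.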
add c29,csp,#0 stp c19,c20,[csp,#2*__SIZEOF_POINTER__] stp c21,c22,[csp,#4*__SIZEOF_POINTER__] ldp x10,x11,[c1] ldp x12,x13,[c1,#16] ldp x14,x15,[c1,#32] ldp x4,x5,[c2] ldp x6,x7,[c2,#16] ldp x8,x9,[c2,#32] bl __rshift_mod_384 ldr c30,[csp,#__SIZEOF_POINTER__] stp x10,x11,[c0] stp x12,x13,[c0,#16] stp x14,x15,[c0,#32] ldp c19,c20,[c29,#2*__SIZEOF_POINTER__] ldp c21,c22,[c29,#4*__SIZEOF_POINTER__] ldr c29,[csp],#6*__SIZEOF_POINTER__ hint #AUTI_HINT ret .size div_by_2_mod_384,.-div_by_2_mod_384 .globl lshift_mod_384 .hidden lshift_mod_384 .type lshift_mod_384,%function .align 5 lshift_mod_384: hint #PACI_HINT stp c29,c30,[csp,#-6*__SIZEOF_POINTER__]! add c29,csp,#0 stp c19,c20,[csp,#2*__SIZEOF_POINTER__] stp c21,c22,[csp,#4*__SIZEOF_POINTER__] ldp x10,x11,[c1] ldp x12,x13,[c1,#16] ldp x14,x15,[c1,#32] ldp x4,x5,[c3] ldp x6,x7,[c3,#16] ldp x8,x9,[c3,#32] .Loop_lshift_mod_384: sub x2,x2,#1 bl __lshift_mod_384 cbnz x2,.Loop_lshift_mod_384 ldr c30,[csp,#__SIZEOF_POINTER__] stp x10,x11,[c0] stp x12,x13,[c0,#16] stp x14,x15,[c0,#32] ldp c19,c20,[c29,#2*__SIZEOF_POINTER__] ldp c21,c22,[c29,#4*__SIZEOF_POINTER__] ldr c29,[csp],#6*__SIZEOF_POINTER__ hint #AUTI_HINT ret .size lshift_mod_384,.-lshift_mod_384 .type __lshift_mod_384,%function .align 5 __lshift_mod_384: adds x10,x10,x10 adcs x11,x11,x11 adcs x12,x12,x12 adcs x13,x13,x13 adcs x14,x14,x14 adcs x15,x15,x15 adc x3,xzr,xzr subs x16,x10,x4 sbcs x17,x11,x5 sbcs x19,x12,x6 sbcs x20,x13,x7 sbcs x21,x14,x8 sbcs x22,x15,x9 sbcs xzr,x3,xzr csel x10,x10,x16,lo csel x11,x11,x17,lo csel x12,x12,x19,lo csel x13,x13,x20,lo csel x14,x14,x21,lo csel x15,x15,x22,lo ret .size __lshift_mod_384,.-__lshift_mod_384 .globl mul_by_3_mod_384 .hidden mul_by_3_mod_384 .type mul_by_3_mod_384,%function .align 5 mul_by_3_mod_384: hint #PACI_HINT stp c29,c30,[csp,#-6*__SIZEOF_POINTER__]! add c29,csp,#0 stp c19,c20,[csp,#2*__SIZEOF_POINTER__] stp c21,c22,[csp,#4*__SIZEOF_POINTER__] ldp x10,x11,[c1] ldp x12,x13,[c1,#16] ldp x14,x15,[c1,#32] ldp x4,x5,[c2] ldp x6,x7,[c2,#16] ldp x8,x9,[c2,#32] bl __lshift_mod_384 ldp x16,x17,[c1] ldp x19,x20,[c1,#16] ldp x21,x22,[c1,#32] bl __add_mod_384_ab_are_loaded ldr c30,[csp,#__SIZEOF_POINTER__] stp x10,x11,[c0] stp x12,x13,[c0,#16] stp x14,x15,[c0,#32] ldp c19,c20,[c29,#2*__SIZEOF_POINTER__] ldp c21,c22,[c29,#4*__SIZEOF_POINTER__] ldr c29,[csp],#6*__SIZEOF_POINTER__ hint #AUTI_HINT ret .size mul_by_3_mod_384,.-mul_by_3_mod_384 .globl mul_by_8_mod_384 .hidden mul_by_8_mod_384 .type mul_by_8_mod_384,%function .align 5 mul_by_8_mod_384: hint #PACI_HINT stp c29,c30,[csp,#-6*__SIZEOF_POINTER__]! add c29,csp,#0 stp c19,c20,[csp,#2*__SIZEOF_POINTER__] stp c21,c22,[csp,#4*__SIZEOF_POINTER__] ldp x10,x11,[c1] ldp x12,x13,[c1,#16] ldp x14,x15,[c1,#32] ldp x4,x5,[c2] ldp x6,x7,[c2,#16] ldp x8,x9,[c2,#32] bl __lshift_mod_384 bl __lshift_mod_384 bl __lshift_mod_384 ldr c30,[csp,#__SIZEOF_POINTER__] stp x10,x11,[c0] stp x12,x13,[c0,#16] stp x14,x15,[c0,#32] ldp c19,c20,[c29,#2*__SIZEOF_POINTER__] ldp c21,c22,[c29,#4*__SIZEOF_POINTER__] ldr c29,[csp],#6*__SIZEOF_POINTER__ hint #AUTI_HINT ret .size mul_by_8_mod_384,.-mul_by_8_mod_384 .globl mul_by_3_mod_384x .hidden mul_by_3_mod_384x .type mul_by_3_mod_384x,%function .align 5 mul_by_3_mod_384x: hint #PACI_HINT stp c29,c30,[csp,#-6*__SIZEOF_POINTER__]! 
add c29,csp,#0 stp c19,c20,[csp,#2*__SIZEOF_POINTER__] stp c21,c22,[csp,#4*__SIZEOF_POINTER__] ldp x10,x11,[c1] ldp x12,x13,[c1,#16] ldp x14,x15,[c1,#32] ldp x4,x5,[c2] ldp x6,x7,[c2,#16] ldp x8,x9,[c2,#32] bl __lshift_mod_384 ldp x16,x17,[c1] ldp x19,x20,[c1,#16] ldp x21,x22,[c1,#32] bl __add_mod_384_ab_are_loaded stp x10,x11,[c0] ldp x10,x11,[c1,#48] stp x12,x13,[c0,#16] ldp x12,x13,[c1,#64] stp x14,x15,[c0,#32] ldp x14,x15,[c1,#80] bl __lshift_mod_384 ldp x16,x17,[c1,#48] ldp x19,x20,[c1,#64] ldp x21,x22,[c1,#80] bl __add_mod_384_ab_are_loaded ldr c30,[csp,#__SIZEOF_POINTER__] stp x10,x11,[c0,#48] stp x12,x13,[c0,#64] stp x14,x15,[c0,#80] ldp c19,c20,[c29,#2*__SIZEOF_POINTER__] ldp c21,c22,[c29,#4*__SIZEOF_POINTER__] ldr c29,[csp],#6*__SIZEOF_POINTER__ hint #AUTI_HINT ret .size mul_by_3_mod_384x,.-mul_by_3_mod_384x .globl mul_by_8_mod_384x .hidden mul_by_8_mod_384x .type mul_by_8_mod_384x,%function .align 5 mul_by_8_mod_384x: hint #PACI_HINT stp c29,c30,[csp,#-6*__SIZEOF_POINTER__]! add c29,csp,#0 stp c19,c20,[csp,#2*__SIZEOF_POINTER__] stp c21,c22,[csp,#4*__SIZEOF_POINTER__] ldp x10,x11,[c1] ldp x12,x13,[c1,#16] ldp x14,x15,[c1,#32] ldp x4,x5,[c2] ldp x6,x7,[c2,#16] ldp x8,x9,[c2,#32] bl __lshift_mod_384 bl __lshift_mod_384 bl __lshift_mod_384 stp x10,x11,[c0] ldp x10,x11,[c1,#48] stp x12,x13,[c0,#16] ldp x12,x13,[c1,#64] stp x14,x15,[c0,#32] ldp x14,x15,[c1,#80] bl __lshift_mod_384 bl __lshift_mod_384 bl __lshift_mod_384 ldr c30,[csp,#__SIZEOF_POINTER__] stp x10,x11,[c0,#48] stp x12,x13,[c0,#64] stp x14,x15,[c0,#80] ldp c19,c20,[c29,#2*__SIZEOF_POINTER__] ldp c21,c22,[c29,#4*__SIZEOF_POINTER__] ldr c29,[csp],#6*__SIZEOF_POINTER__ hint #AUTI_HINT ret .size mul_by_8_mod_384x,.-mul_by_8_mod_384x .globl cneg_mod_384 .hidden cneg_mod_384 .type cneg_mod_384,%function .align 5 cneg_mod_384: hint #PACI_HINT stp c29,c30,[csp,#-6*__SIZEOF_POINTER__]! add c29,csp,#0 stp c19,c20,[csp,#2*__SIZEOF_POINTER__] stp c21,c22,[csp,#4*__SIZEOF_POINTER__] ldp x10,x11,[c1] ldp x4,x5,[c3] ldp x12,x13,[c1,#16] ldp x6,x7,[c3,#16] subs x16,x4,x10 ldp x14,x15,[c1,#32] ldp x8,x9,[c3,#32] orr x3,x10,x11 sbcs x17,x5,x11 orr x3,x3,x12 sbcs x19,x6,x12 orr x3,x3,x13 sbcs x20,x7,x13 orr x3,x3,x14 sbcs x21,x8,x14 orr x3,x3,x15 sbc x22,x9,x15 cmp x3,#0 csetm x3,ne ands x2,x2,x3 csel x10,x10,x16,eq csel x11,x11,x17,eq csel x12,x12,x19,eq csel x13,x13,x20,eq stp x10,x11,[c0] csel x14,x14,x21,eq stp x12,x13,[c0,#16] csel x15,x15,x22,eq stp x14,x15,[c0,#32] ldp c19,c20,[c29,#2*__SIZEOF_POINTER__] ldp c21,c22,[c29,#4*__SIZEOF_POINTER__] ldr c29,[csp],#6*__SIZEOF_POINTER__ hint #AUTI_HINT ret .size cneg_mod_384,.-cneg_mod_384 .globl sub_mod_384 .hidden sub_mod_384 .type sub_mod_384,%function .align 5 sub_mod_384: hint #PACI_HINT stp c29,c30,[csp,#-6*__SIZEOF_POINTER__]! 
add c29,csp,#0 stp c19,c20,[csp,#2*__SIZEOF_POINTER__] stp c21,c22,[csp,#4*__SIZEOF_POINTER__] ldp x4,x5,[c3] ldp x6,x7,[c3,#16] ldp x8,x9,[c3,#32] bl __sub_mod_384 ldr c30,[csp,#__SIZEOF_POINTER__] stp x10,x11,[c0] stp x12,x13,[c0,#16] stp x14,x15,[c0,#32] ldp c19,c20,[c29,#2*__SIZEOF_POINTER__] ldp c21,c22,[c29,#4*__SIZEOF_POINTER__] ldr c29,[csp],#6*__SIZEOF_POINTER__ hint #AUTI_HINT ret .size sub_mod_384,.-sub_mod_384 .type __sub_mod_384,%function .align 5 __sub_mod_384: ldp x10,x11,[c1] ldp x16,x17,[c2] ldp x12,x13,[c1,#16] ldp x19,x20,[c2,#16] ldp x14,x15,[c1,#32] ldp x21,x22,[c2,#32] subs x10,x10,x16 sbcs x11,x11,x17 sbcs x12,x12,x19 sbcs x13,x13,x20 sbcs x14,x14,x21 sbcs x15,x15,x22 sbc x3,xzr,xzr and x16,x4,x3 and x17,x5,x3 adds x10,x10,x16 and x19,x6,x3 adcs x11,x11,x17 and x20,x7,x3 adcs x12,x12,x19 and x21,x8,x3 adcs x13,x13,x20 and x22,x9,x3 adcs x14,x14,x21 adc x15,x15,x22 ret .size __sub_mod_384,.-__sub_mod_384 .globl sub_mod_384x .hidden sub_mod_384x .type sub_mod_384x,%function .align 5 sub_mod_384x: hint #PACI_HINT stp c29,c30,[csp,#-6*__SIZEOF_POINTER__]! add c29,csp,#0 stp c19,c20,[csp,#2*__SIZEOF_POINTER__] stp c21,c22,[csp,#4*__SIZEOF_POINTER__] ldp x4,x5,[c3] ldp x6,x7,[c3,#16] ldp x8,x9,[c3,#32] bl __sub_mod_384 stp x10,x11,[c0] add c1,c1,#48 stp x12,x13,[c0,#16] add c2,c2,#48 stp x14,x15,[c0,#32] bl __sub_mod_384 ldr c30,[csp,#__SIZEOF_POINTER__] stp x10,x11,[c0,#48] stp x12,x13,[c0,#64] stp x14,x15,[c0,#80] ldp c19,c20,[c29,#2*__SIZEOF_POINTER__] ldp c21,c22,[c29,#4*__SIZEOF_POINTER__] ldr c29,[csp],#6*__SIZEOF_POINTER__ hint #AUTI_HINT ret .size sub_mod_384x,.-sub_mod_384x .globl mul_by_1_plus_i_mod_384x .hidden mul_by_1_plus_i_mod_384x .type mul_by_1_plus_i_mod_384x,%function .align 5 mul_by_1_plus_i_mod_384x: hint #PACI_HINT stp c29,c30,[csp,#-6*__SIZEOF_POINTER__]! 
add c29,csp,#0 stp c19,c20,[csp,#2*__SIZEOF_POINTER__] stp c21,c22,[csp,#4*__SIZEOF_POINTER__] ldp x4,x5,[c2] ldp x6,x7,[c2,#16] ldp x8,x9,[c2,#32] add c2,c1,#48 bl __sub_mod_384 // a->re - a->im ldp x16,x17,[c1] ldp x19,x20,[c1,#16] ldp x21,x22,[c1,#32] stp x10,x11,[c0] ldp x10,x11,[c1,#48] stp x12,x13,[c0,#16] ldp x12,x13,[c1,#64] stp x14,x15,[c0,#32] ldp x14,x15,[c1,#80] bl __add_mod_384_ab_are_loaded // a->re + a->im ldr c30,[csp,#__SIZEOF_POINTER__] stp x10,x11,[c0,#48] stp x12,x13,[c0,#64] stp x14,x15,[c0,#80] ldp c19,c20,[c29,#2*__SIZEOF_POINTER__] ldp c21,c22,[c29,#4*__SIZEOF_POINTER__] ldr c29,[csp],#6*__SIZEOF_POINTER__ hint #AUTI_HINT ret .size mul_by_1_plus_i_mod_384x,.-mul_by_1_plus_i_mod_384x .globl sgn0_pty_mod_384 .hidden sgn0_pty_mod_384 .type sgn0_pty_mod_384,%function .align 5 sgn0_pty_mod_384: hint #34 ldp x10,x11,[c0] ldp x12,x13,[c0,#16] ldp x14,x15,[c0,#32] ldp x4,x5,[c1] ldp x6,x7,[c1,#16] ldp x8,x9,[c1,#32] and x0,x10,#1 adds x10,x10,x10 adcs x11,x11,x11 adcs x12,x12,x12 adcs x13,x13,x13 adcs x14,x14,x14 adcs x15,x15,x15 adc x3,xzr,xzr subs x10,x10,x4 sbcs x11,x11,x5 sbcs x12,x12,x6 sbcs x13,x13,x7 sbcs x14,x14,x8 sbcs x15,x15,x9 sbc x3,x3,xzr mvn x3,x3 and x3,x3,#2 orr x0,x0,x3 ret .size sgn0_pty_mod_384,.-sgn0_pty_mod_384 .globl sgn0_pty_mod_384x .hidden sgn0_pty_mod_384x .type sgn0_pty_mod_384x,%function .align 5 sgn0_pty_mod_384x: hint #34 ldp x10,x11,[c0] ldp x12,x13,[c0,#16] ldp x14,x15,[c0,#32] ldp x4,x5,[c1] ldp x6,x7,[c1,#16] ldp x8,x9,[c1,#32] and x2,x10,#1 orr x3,x10,x11 adds x10,x10,x10 orr x3,x3,x12 adcs x11,x11,x11 orr x3,x3,x13 adcs x12,x12,x12 orr x3,x3,x14 adcs x13,x13,x13 orr x3,x3,x15 adcs x14,x14,x14 adcs x15,x15,x15 adc x16,xzr,xzr subs x10,x10,x4 sbcs x11,x11,x5 sbcs x12,x12,x6 sbcs x13,x13,x7 sbcs x14,x14,x8 sbcs x15,x15,x9 sbc x16,x16,xzr ldp x10,x11,[c0,#48] ldp x12,x13,[c0,#64] ldp x14,x15,[c0,#80] mvn x16,x16 and x16,x16,#2 orr x2,x2,x16 and x0,x10,#1 orr x1,x10,x11 adds x10,x10,x10 orr x1,x1,x12 adcs x11,x11,x11 orr x1,x1,x13 adcs x12,x12,x12 orr x1,x1,x14 adcs x13,x13,x13 orr x1,x1,x15 adcs x14,x14,x14 adcs x15,x15,x15 adc x16,xzr,xzr subs x10,x10,x4 sbcs x11,x11,x5 sbcs x12,x12,x6 sbcs x13,x13,x7 sbcs x14,x14,x8 sbcs x15,x15,x9 sbc x16,x16,xzr mvn x16,x16 and x16,x16,#2 orr x0,x0,x16 cmp x3,#0 csel x3,x0,x2,eq // a->re==0? prty(a->im) : prty(a->re) cmp x1,#0 csel x1,x0,x2,ne // a->im!=0? 
sgn0(a->im) : sgn0(a->re) and x3,x3,#1 and x1,x1,#2 orr x0,x1,x3 // pack sign and parity ret .size sgn0_pty_mod_384x,.-sgn0_pty_mod_384x .globl vec_select_32 .hidden vec_select_32 .type vec_select_32,%function .align 5 vec_select_32: hint #34 dup v6.2d, x3 ld1 {v0.2d, v1.2d}, [c1] cmeq v6.2d, v6.2d, #0 ld1 {v3.2d, v4.2d}, [c2] bit v0.16b, v3.16b, v6.16b bit v1.16b, v4.16b, v6.16b st1 {v0.2d, v1.2d}, [c0] ret .size vec_select_32,.-vec_select_32 .globl vec_select_48 .hidden vec_select_48 .type vec_select_48,%function .align 5 vec_select_48: hint #34 dup v6.2d, x3 ld1 {v0.2d, v1.2d, v2.2d}, [c1],#48 cmeq v6.2d, v6.2d, #0 ld1 {v3.2d, v4.2d, v5.2d}, [c2],#48 bit v0.16b, v3.16b, v6.16b bit v1.16b, v4.16b, v6.16b bit v2.16b, v5.16b, v6.16b st1 {v0.2d, v1.2d, v2.2d}, [c0] ret .size vec_select_48,.-vec_select_48 .globl vec_select_96 .hidden vec_select_96 .type vec_select_96,%function .align 5 vec_select_96: hint #34 dup v6.2d, x3 ld1 {v0.2d, v1.2d, v2.2d}, [c1],#48 cmeq v6.2d, v6.2d, #0 ld1 {v3.2d, v4.2d, v5.2d}, [c2],#48 bit v0.16b, v3.16b, v6.16b ld1 {v16.2d, v17.2d, v18.2d}, [c1],#48 bit v1.16b, v4.16b, v6.16b ld1 {v19.2d, v20.2d, v21.2d}, [c2],#48 bit v2.16b, v5.16b, v6.16b st1 {v0.2d, v1.2d, v2.2d}, [c0],#48 bit v16.16b, v19.16b, v6.16b bit v17.16b, v20.16b, v6.16b bit v18.16b, v21.16b, v6.16b st1 {v16.2d, v17.2d, v18.2d}, [c0] ret .size vec_select_96,.-vec_select_96 .globl vec_select_192 .hidden vec_select_192 .type vec_select_192,%function .align 5 vec_select_192: hint #34 dup v6.2d, x3 ld1 {v0.2d, v1.2d, v2.2d}, [c1],#48 cmeq v6.2d, v6.2d, #0 ld1 {v3.2d, v4.2d, v5.2d}, [c2],#48 bit v0.16b, v3.16b, v6.16b ld1 {v16.2d, v17.2d, v18.2d}, [c1],#48 bit v1.16b, v4.16b, v6.16b ld1 {v19.2d, v20.2d, v21.2d}, [c2],#48 bit v2.16b, v5.16b, v6.16b st1 {v0.2d, v1.2d, v2.2d}, [c0],#48 bit v16.16b, v19.16b, v6.16b ld1 {v0.2d, v1.2d, v2.2d}, [c1],#48 bit v17.16b, v20.16b, v6.16b ld1 {v3.2d, v4.2d, v5.2d}, [c2],#48 bit v18.16b, v21.16b, v6.16b st1 {v16.2d, v17.2d, v18.2d}, [c0],#48 bit v0.16b, v3.16b, v6.16b ld1 {v16.2d, v17.2d, v18.2d}, [c1],#48 bit v1.16b, v4.16b, v6.16b ld1 {v19.2d, v20.2d, v21.2d}, [c2],#48 bit v2.16b, v5.16b, v6.16b st1 {v0.2d, v1.2d, v2.2d}, [c0],#48 bit v16.16b, v19.16b, v6.16b bit v17.16b, v20.16b, v6.16b bit v18.16b, v21.16b, v6.16b st1 {v16.2d, v17.2d, v18.2d}, [c0] ret .size vec_select_192,.-vec_select_192 .globl vec_select_144 .hidden vec_select_144 .type vec_select_144,%function .align 5 vec_select_144: hint #34 dup v6.2d, x3 ld1 {v0.2d, v1.2d, v2.2d}, [c1],#48 cmeq v6.2d, v6.2d, #0 ld1 {v3.2d, v4.2d, v5.2d}, [c2],#48 bit v0.16b, v3.16b, v6.16b ld1 {v16.2d, v17.2d, v18.2d}, [c1],#48 bit v1.16b, v4.16b, v6.16b ld1 {v19.2d, v20.2d, v21.2d}, [c2],#48 bit v2.16b, v5.16b, v6.16b st1 {v0.2d, v1.2d, v2.2d}, [c0],#48 bit v16.16b, v19.16b, v6.16b ld1 {v0.2d, v1.2d, v2.2d}, [c1],#48 bit v17.16b, v20.16b, v6.16b ld1 {v3.2d, v4.2d, v5.2d}, [c2],#48 bit v18.16b, v21.16b, v6.16b st1 {v16.2d, v17.2d, v18.2d}, [c0],#48 bit v0.16b, v3.16b, v6.16b bit v1.16b, v4.16b, v6.16b bit v2.16b, v5.16b, v6.16b st1 {v0.2d, v1.2d, v2.2d}, [c0] ret .size vec_select_144,.-vec_select_144 .globl vec_select_288 .hidden vec_select_288 .type vec_select_288,%function .align 5 vec_select_288: hint #34 dup v6.2d, x3 ld1 {v0.2d, v1.2d, v2.2d}, [c1],#48 cmeq v6.2d, v6.2d, #0 ld1 {v3.2d, v4.2d, v5.2d}, [c2],#48 bit v0.16b, v3.16b, v6.16b ld1 {v16.2d, v17.2d, v18.2d}, [c1],#48 bit v1.16b, v4.16b, v6.16b ld1 {v19.2d, v20.2d, v21.2d}, [c2],#48 bit v2.16b, v5.16b, v6.16b st1 {v0.2d, v1.2d, v2.2d}, [c0],#48 bit v16.16b, 
v19.16b, v6.16b ld1 {v0.2d, v1.2d, v2.2d}, [c1],#48 bit v17.16b, v20.16b, v6.16b ld1 {v3.2d, v4.2d, v5.2d}, [c2],#48 bit v18.16b, v21.16b, v6.16b st1 {v16.2d, v17.2d, v18.2d}, [c0],#48 bit v0.16b, v3.16b, v6.16b ld1 {v16.2d, v17.2d, v18.2d}, [c1],#48 bit v1.16b, v4.16b, v6.16b ld1 {v19.2d, v20.2d, v21.2d}, [c2],#48 bit v2.16b, v5.16b, v6.16b st1 {v0.2d, v1.2d, v2.2d}, [c0],#48 bit v16.16b, v19.16b, v6.16b ld1 {v0.2d, v1.2d, v2.2d}, [c1],#48 bit v17.16b, v20.16b, v6.16b ld1 {v3.2d, v4.2d, v5.2d}, [c2],#48 bit v18.16b, v21.16b, v6.16b st1 {v16.2d, v17.2d, v18.2d}, [c0],#48 bit v0.16b, v3.16b, v6.16b ld1 {v16.2d, v17.2d, v18.2d}, [c1],#48 bit v1.16b, v4.16b, v6.16b ld1 {v19.2d, v20.2d, v21.2d}, [c2],#48 bit v2.16b, v5.16b, v6.16b st1 {v0.2d, v1.2d, v2.2d}, [c0],#48 bit v16.16b, v19.16b, v6.16b bit v17.16b, v20.16b, v6.16b bit v18.16b, v21.16b, v6.16b st1 {v16.2d, v17.2d, v18.2d}, [c0] ret .size vec_select_288,.-vec_select_288 .globl vec_prefetch .hidden vec_prefetch .type vec_prefetch,%function .align 5 vec_prefetch: hint #34 add x1, x1, x0 sub x1, x1, #1 mov x2, #64 prfm pldl1keep, [c0] add x0, x0, x2 cmp x0, x1 csel x0, x1, x0, hi csel x2, xzr, x2, hi prfm pldl1keep, [c0] add x0, x0, x2 cmp x0, x1 csel x0, x1, x0, hi csel x2, xzr, x2, hi prfm pldl1keep, [c0] add x0, x0, x2 cmp x0, x1 csel x0, x1, x0, hi csel x2, xzr, x2, hi prfm pldl1keep, [c0] add x0, x0, x2 cmp x0, x1 csel x0, x1, x0, hi csel x2, xzr, x2, hi prfm pldl1keep, [c0] add x0, x0, x2 cmp x0, x1 csel x0, x1, x0, hi csel x2, xzr, x2, hi prfm pldl1keep, [c0] add x0, x0, x2 cmp x0, x1 csel x0, x1, x0, hi prfm pldl1keep, [c0] ret .size vec_prefetch,.-vec_prefetch .globl vec_is_zero_16x .hidden vec_is_zero_16x .type vec_is_zero_16x,%function .align 5 vec_is_zero_16x: hint #34 ld1 {v0.2d}, [c0], #16 lsr x1, x1, #4 sub x1, x1, #1 cbz x1, .Loop_is_zero_done .Loop_is_zero: ld1 {v1.2d}, [c0], #16 orr v0.16b, v0.16b, v1.16b sub x1, x1, #1 cbnz x1, .Loop_is_zero .Loop_is_zero_done: dup v1.2d, v0.d[1] orr v0.16b, v0.16b, v1.16b umov x1, v0.d[0] mov x0, #1 cmp x1, #0 csel x0, x0, xzr, eq ret .size vec_is_zero_16x,.-vec_is_zero_16x .globl vec_is_equal_16x .hidden vec_is_equal_16x .type vec_is_equal_16x,%function .align 5 vec_is_equal_16x: hint #34 ld1 {v0.2d}, [c0], #16 ld1 {v1.2d}, [c1], #16 lsr x2, x2, #4 eor v0.16b, v0.16b, v1.16b .Loop_is_equal: sub x2, x2, #1 cbz x2, .Loop_is_equal_done ld1 {v1.2d}, [c0], #16 ld1 {v2.2d}, [c1], #16 eor v1.16b, v1.16b, v2.16b orr v0.16b, v0.16b, v1.16b b .Loop_is_equal nop .Loop_is_equal_done: dup v1.2d, v0.d[1] orr v0.16b, v0.16b, v1.16b umov x1, v0.d[0] mov x0, #1 cmp x1, #0 csel x0, x0, xzr, eq ret .size vec_is_equal_16x,.-vec_is_equal_16x #if defined(__ARM_FEATURE_BTI_DEFAULT) || defined(__ARM_FEATURE_PAC_DEFAULT) .section .note.GNU-stack,"",@progbits .section .note.gnu.property,"a",@note .long 4,2f-1f,5 .byte 0x47,0x4E,0x55,0 1: .long 0xc0000000,4,3 .align 3 2: #endif ================================================ FILE: build/cheri/ct_inverse_mod_256-armv8.S ================================================ #if defined(__ARM_FEATURE_PAC_DEFAULT) && __ARM_FEATURE_PAC_DEFAULT==2 # define PACI_HINT 27 # define AUTI_HINT 31 #else # define PACI_HINT 25 # define AUTI_HINT 29 #endif .text .globl ct_inverse_mod_256 .hidden ct_inverse_mod_256 .type ct_inverse_mod_256, %function .align 5 ct_inverse_mod_256: hint #PACI_HINT stp c29, c30, [csp,#-10*__SIZEOF_POINTER__]! 
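// Annotation added for orientation; not part of the generated code.
// ct_inverse_mod_256 computes a constant-time modular inverse along
// the lines of Bernstein-Yang divsteps: each iteration condenses |a|
// and |b| into one-register approximations (__ab_approximation_31_256),
// runs 31 divsteps on them (__inner_loop_31_256) to derive the update
// factors |f0|g0|f1|g1|, and applies those to the full-width |a|b|u|v|
// quadruple via __smul_256_n_shift_by_31 and __smul_256x63.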
add c29, csp, #0 stp c19, c20, [csp,#2*__SIZEOF_POINTER__] stp c21, c22, [csp,#4*__SIZEOF_POINTER__] stp c23, c24, [csp,#6*__SIZEOF_POINTER__] stp c25, c26, [csp,#8*__SIZEOF_POINTER__] sub csp, csp, #1040 ldp x4, x5, [c1,#8*0] ldp x6, x7, [c1,#8*2] #ifdef __CHERI_PURE_CAPABILITY__ add c1,csp,#16+511 alignd c1,c1,#9 scbnds c1,c1,#512 #else add x1, sp, #16+511 // find closest 512-byte-aligned spot and x1, x1, #-512 // in the frame... #endif str c0, [csp] // offload out_ptr ldp x8, x9, [c2,#8*0] ldp x10, x11, [c2,#8*2] stp x4, x5, [c1,#8*0] // copy input to |a| stp x6, x7, [c1,#8*2] stp x8, x9, [c1,#8*4] // copy modulus to |b| stp x10, x11, [c1,#8*6] ////////////////////////////////////////// first iteration bl .Lab_approximation_31_256_loaded eor x0, x1, #256 // pointer to dst |a|b|u|v| #ifdef __CHERI_PURE_CAPABILITY__ scvalue c0,c1,x0 #endif bl __smul_256_n_shift_by_31 str x12,[c0,#8*8] // initialize |u| with |f0| mov x12, x14 // |f1| mov x13, x15 // |g1| add c0,c0,#8*4 bl __smul_256_n_shift_by_31 str x12, [c0,#8*10] // initialize |v| with |f1| ////////////////////////////////////////// second iteration eor x1, x1, #256 // flip-flop src |a|b|u|v| #ifdef __CHERI_PURE_CAPABILITY__ scvalue c1,c0,x1 #endif bl __ab_approximation_31_256 eor x0, x1, #256 // pointer to dst |a|b|u|v| #ifdef __CHERI_PURE_CAPABILITY__ scvalue c0,c1,x0 #endif bl __smul_256_n_shift_by_31 mov x16, x12 // corrected |f0| mov x17, x13 // corrected |g0| mov x12, x14 // |f1| mov x13, x15 // |g1| add c0,c0,#8*4 bl __smul_256_n_shift_by_31 ldr x8, [c1,#8*8] // |u| ldr x9, [c1,#8*14] // |v| madd x4, x16, x8, xzr // |u|*|f0| madd x4, x17, x9, x4 // |v|*|g0| asr x5, x4, #63 // sign extension stp x4, x5, [c0,#8*4] stp x5, x5, [c0,#8*6] madd x4, x12, x8, xzr // |u|*|f1| madd x4, x13, x9, x4 // |v|*|g1| asr x5, x4, #63 // sign extension stp x4, x5, [c0,#8*10] stp x5, x5, [c0,#8*12] eor x1, x1, #256 // flip-flop src |a|b|u|v| #ifdef __CHERI_PURE_CAPABILITY__ scvalue c1,c0,x1 #endif bl __ab_approximation_31_256 eor x0, x1, #256 // pointer to dst |a|b|u|v| #ifdef __CHERI_PURE_CAPABILITY__ scvalue c0,c1,x0 #endif bl __smul_256_n_shift_by_31 mov x16, x12 // corrected |f0| mov x17, x13 // corrected |g0| mov x12, x14 // |f1| mov x13, x15 // |g1| add c0,c0,#8*4 bl __smul_256_n_shift_by_31 add c0,c0,#8*4 bl __smul_256x63 mov x16, x12 // corrected |f1| mov x17, x13 // corrected |g1| add c0,c0,#8*6 bl __smul_256x63 eor x1, x1, #256 // flip-flop src |a|b|u|v| #ifdef __CHERI_PURE_CAPABILITY__ scvalue c1,c0,x1 #endif bl __ab_approximation_31_256 eor x0, x1, #256 // pointer to dst |a|b|u|v| #ifdef __CHERI_PURE_CAPABILITY__ scvalue c0,c1,x0 #endif bl __smul_256_n_shift_by_31 mov x16, x12 // corrected |f0| mov x17, x13 // corrected |g0| mov x12, x14 // |f1| mov x13, x15 // |g1| add c0,c0,#8*4 bl __smul_256_n_shift_by_31 add c0,c0,#8*4 bl __smul_256x63 mov x16, x12 // corrected |f1| mov x17, x13 // corrected |g1| add c0,c0,#8*6 bl __smul_256x63 eor x1, x1, #256 // flip-flop src |a|b|u|v| #ifdef __CHERI_PURE_CAPABILITY__ scvalue c1,c0,x1 #endif bl __ab_approximation_31_256 eor x0, x1, #256 // pointer to dst |a|b|u|v| #ifdef __CHERI_PURE_CAPABILITY__ scvalue c0,c1,x0 #endif bl __smul_256_n_shift_by_31 mov x16, x12 // corrected |f0| mov x17, x13 // corrected |g0| mov x12, x14 // |f1| mov x13, x15 // |g1| add c0,c0,#8*4 bl __smul_256_n_shift_by_31 add c0,c0,#8*4 bl __smul_256x63 mov x16, x12 // corrected |f1| mov x17, x13 // corrected |g1| add c0,c0,#8*6 bl __smul_256x63 eor x1, x1, #256 // flip-flop src |a|b|u|v| #ifdef __CHERI_PURE_CAPABILITY__ scvalue 
c1,c0,x1 #endif bl __ab_approximation_31_256 eor x0, x1, #256 // pointer to dst |a|b|u|v| #ifdef __CHERI_PURE_CAPABILITY__ scvalue c0,c1,x0 #endif bl __smul_256_n_shift_by_31 mov x16, x12 // corrected |f0| mov x17, x13 // corrected |g0| mov x12, x14 // |f1| mov x13, x15 // |g1| add c0,c0,#8*4 bl __smul_256_n_shift_by_31 add c0,c0,#8*4 bl __smul_256x63 mov x16, x12 // corrected |f1| mov x17, x13 // corrected |g1| add c0,c0,#8*6 bl __smul_256x63 eor x1, x1, #256 // flip-flop src |a|b|u|v| #ifdef __CHERI_PURE_CAPABILITY__ scvalue c1,c0,x1 #endif bl __ab_approximation_31_256 eor x0, x1, #256 // pointer to dst |a|b|u|v| #ifdef __CHERI_PURE_CAPABILITY__ scvalue c0,c1,x0 #endif bl __smul_256_n_shift_by_31 mov x16, x12 // corrected |f0| mov x17, x13 // corrected |g0| mov x12, x14 // |f1| mov x13, x15 // |g1| add c0,c0,#8*4 bl __smul_256_n_shift_by_31 add c0,c0,#8*4 bl __smul_256x63 mov x16, x12 // corrected |f1| mov x17, x13 // corrected |g1| add c0,c0,#8*6 bl __smul_256x63 eor x1, x1, #256 // flip-flop src |a|b|u|v| #ifdef __CHERI_PURE_CAPABILITY__ scvalue c1,c0,x1 #endif bl __ab_approximation_31_256 eor x0, x1, #256 // pointer to dst |a|b|u|v| #ifdef __CHERI_PURE_CAPABILITY__ scvalue c0,c1,x0 #endif bl __smul_256_n_shift_by_31 mov x16, x12 // corrected |f0| mov x17, x13 // corrected |g0| mov x12, x14 // |f1| mov x13, x15 // |g1| add c0,c0,#8*4 bl __smul_256_n_shift_by_31 add c0,c0,#8*4 bl __smul_256x63 asr x24, x24, #63 str x24, [c0,#8*4] mov x16, x12 // corrected |f1| mov x17, x13 // corrected |g1| add c0,c0,#8*6 bl __smul_256x63 asr x24, x24, #63 // sign extension stp x24, x24, [c0,#8*4] stp x24, x24, [c0,#8*6] eor x1, x1, #256 // flip-flop src |a|b|u|v| #ifdef __CHERI_PURE_CAPABILITY__ scvalue c1,c0,x1 #endif bl __ab_approximation_31_256 eor x0, x1, #256 // pointer to dst |a|b|u|v| #ifdef __CHERI_PURE_CAPABILITY__ scvalue c0,c1,x0 #endif bl __smul_256_n_shift_by_31 mov x16, x12 // corrected |f0| mov x17, x13 // corrected |g0| mov x12, x14 // |f1| mov x13, x15 // |g1| add c0,c0,#8*4 bl __smul_256_n_shift_by_31 add c0,c0,#8*4 bl __smul_256x63 adc x22, x22, x23 str x22, [c0,#8*4] mov x16, x12 // corrected |f1| mov x17, x13 // corrected |g1| add c0,c0,#8*6 bl __smul_256x63 bl __smul_512x63_tail eor x1, x1, #256 // flip-flop src |a|b|u|v| #ifdef __CHERI_PURE_CAPABILITY__ scvalue c1,c0,x1 #endif bl __ab_approximation_31_256 eor x0, x1, #256 // pointer to dst |a|b|u|v| #ifdef __CHERI_PURE_CAPABILITY__ scvalue c0,c1,x0 #endif bl __smul_256_n_shift_by_31 mov x16, x12 // corrected |f0| mov x17, x13 // corrected |g0| mov x12, x14 // |f1| mov x13, x15 // |g1| add c0,c0,#8*4 bl __smul_256_n_shift_by_31 add c0,c0,#8*4 bl __smul_256x63 adc x22, x22, x23 str x22, [c0,#8*4] mov x16, x12 // corrected |f1| mov x17, x13 // corrected |g1| add c0,c0,#8*6 bl __smul_256x63 bl __smul_512x63_tail eor x1, x1, #256 // flip-flop src |a|b|u|v| #ifdef __CHERI_PURE_CAPABILITY__ scvalue c1,c0,x1 #endif bl __ab_approximation_31_256 eor x0, x1, #256 // pointer to dst |a|b|u|v| #ifdef __CHERI_PURE_CAPABILITY__ scvalue c0,c1,x0 #endif bl __smul_256_n_shift_by_31 mov x16, x12 // corrected |f0| mov x17, x13 // corrected |g0| mov x12, x14 // |f1| mov x13, x15 // |g1| add c0,c0,#8*4 bl __smul_256_n_shift_by_31 add c0,c0,#8*4 bl __smul_256x63 adc x22, x22, x23 str x22, [c0,#8*4] mov x16, x12 // corrected |f1| mov x17, x13 // corrected |g1| add c0,c0,#8*6 bl __smul_256x63 bl __smul_512x63_tail eor x1, x1, #256 // flip-flop src |a|b|u|v| #ifdef __CHERI_PURE_CAPABILITY__ scvalue c1,c0,x1 #endif bl __ab_approximation_31_256 eor x0, x1, 
#256 // pointer to dst |a|b|u|v| #ifdef __CHERI_PURE_CAPABILITY__ scvalue c0,c1,x0 #endif bl __smul_256_n_shift_by_31 mov x16, x12 // corrected |f0| mov x17, x13 // corrected |g0| mov x12, x14 // |f1| mov x13, x15 // |g1| add c0,c0,#8*4 bl __smul_256_n_shift_by_31 add c0,c0,#8*4 bl __smul_256x63 adc x22, x22, x23 str x22, [c0,#8*4] mov x16, x12 // corrected |f1| mov x17, x13 // corrected |g1| add c0,c0,#8*6 bl __smul_256x63 bl __smul_512x63_tail eor x1, x1, #256 // flip-flop src |a|b|u|v| #ifdef __CHERI_PURE_CAPABILITY__ scvalue c1,c0,x1 #endif bl __ab_approximation_31_256 eor x0, x1, #256 // pointer to dst |a|b|u|v| #ifdef __CHERI_PURE_CAPABILITY__ scvalue c0,c1,x0 #endif bl __smul_256_n_shift_by_31 mov x16, x12 // corrected |f0| mov x17, x13 // corrected |g0| mov x12, x14 // |f1| mov x13, x15 // |g1| add c0,c0,#8*4 bl __smul_256_n_shift_by_31 add c0,c0,#8*4 bl __smul_256x63 adc x22, x22, x23 str x22, [c0,#8*4] mov x16, x12 // corrected |f1| mov x17, x13 // corrected |g1| add c0,c0,#8*6 bl __smul_256x63 bl __smul_512x63_tail eor x1, x1, #256 // flip-flop src |a|b|u|v| #ifdef __CHERI_PURE_CAPABILITY__ scvalue c1,c0,x1 #endif bl __ab_approximation_31_256 eor x0, x1, #256 // pointer to dst |a|b|u|v| #ifdef __CHERI_PURE_CAPABILITY__ scvalue c0,c1,x0 #endif bl __smul_256_n_shift_by_31 mov x16, x12 // corrected |f0| mov x17, x13 // corrected |g0| mov x12, x14 // |f1| mov x13, x15 // |g1| add c0,c0,#8*4 bl __smul_256_n_shift_by_31 add c0,c0,#8*4 bl __smul_256x63 adc x22, x22, x23 str x22, [c0,#8*4] mov x16, x12 // corrected |f1| mov x17, x13 // corrected |g1| add c0,c0,#8*6 bl __smul_256x63 bl __smul_512x63_tail eor x1, x1, #256 // flip-flop src |a|b|u|v| #ifdef __CHERI_PURE_CAPABILITY__ scvalue c1,c0,x1 #endif bl __ab_approximation_31_256 eor x0, x1, #256 // pointer to dst |a|b|u|v| #ifdef __CHERI_PURE_CAPABILITY__ scvalue c0,c1,x0 #endif bl __smul_256_n_shift_by_31 mov x16, x12 // corrected |f0| mov x17, x13 // corrected |g0| mov x12, x14 // |f1| mov x13, x15 // |g1| add c0,c0,#8*4 bl __smul_256_n_shift_by_31 add c0,c0,#8*4 bl __smul_256x63 adc x22, x22, x23 str x22, [c0,#8*4] mov x16, x12 // corrected |f1| mov x17, x13 // corrected |g1| add c0,c0,#8*6 bl __smul_256x63 bl __smul_512x63_tail ////////////////////////////////////////// two[!] 
last iterations eor x1, x1, #256 // flip-flop src |a|b|u|v| #ifdef __CHERI_PURE_CAPABILITY__ scvalue c1,c0,x1 #endif mov x2, #47 // 31 + 512 % 31 //bl __ab_approximation_62_256 // |a| and |b| are exact, ldr x7, [c1,#8*0] // just load ldr x11, [c1,#8*4] bl __inner_loop_62_256 mov x16, x14 mov x17, x15 ldr c0, [csp] // original out_ptr bl __smul_256x63 bl __smul_512x63_tail ldr c30, [c29,#__SIZEOF_POINTER__] smulh x20, x7, x17 // figure out top-most limb ldp x8, x9, [c3,#8*0] adc x23, x23, x25 ldp x10, x11, [c3,#8*2] add x20, x20, x23 // x20 is 1, 0 or -1 asr x19, x20, #63 // sign as mask and x23, x8, x19 // add mod<<256 conditionally and x24, x9, x19 adds x4, x4, x23 and x25, x10, x19 adcs x5, x5, x24 and x26, x11, x19 adcs x6, x6, x25 adcs x7, x22, x26 adc x20, x20, xzr // x20 is 1, 0 or -1 neg x19, x20 orr x20, x20, x19 // excess bit or sign as mask asr x19, x19, #63 // excess bit as mask and x8, x8, x20 // mask |mod| and x9, x9, x20 and x10, x10, x20 and x11, x11, x20 eor x8, x8, x19 // conditionally negate |mod| eor x9, x9, x19 adds x8, x8, x19, lsr#63 eor x10, x10, x19 adcs x9, x9, xzr eor x11, x11, x19 adcs x10, x10, xzr adc x11, x11, xzr adds x4, x4, x8 // final adjustment for |mod|<<256 adcs x5, x5, x9 adcs x6, x6, x10 stp x4, x5, [c0,#8*4] adc x7, x7, x11 stp x6, x7, [c0,#8*6] add csp, csp, #1040 ldp c19, c20, [c29,#2*__SIZEOF_POINTER__] ldp c21, c22, [c29,#4*__SIZEOF_POINTER__] ldp c23, c24, [c29,#6*__SIZEOF_POINTER__] ldp c25, c26, [c29,#8*__SIZEOF_POINTER__] ldr c29, [csp],#10*__SIZEOF_POINTER__ hint #AUTI_HINT ret .size ct_inverse_mod_256,.-ct_inverse_mod_256 //////////////////////////////////////////////////////////////////////// .type __smul_256x63, %function .align 5 __smul_256x63: ldp x4, x5, [c1,#8*0+64] // load |u| (or |v|) asr x14, x16, #63 // |f_|'s sign as mask (or |g_|'s) ldp x6, x7, [c1,#8*2+64] eor x16, x16, x14 // conditionally negate |f_| (or |g_|) ldr x22, [c1,#8*4+64] eor x4, x4, x14 // conditionally negate |u| (or |v|) sub x16, x16, x14 eor x5, x5, x14 adds x4, x4, x14, lsr#63 eor x6, x6, x14 adcs x5, x5, xzr eor x7, x7, x14 adcs x6, x6, xzr eor x22, x22, x14 umulh x19, x4, x16 adcs x7, x7, xzr umulh x20, x5, x16 adcs x22, x22, xzr umulh x21, x6, x16 mul x4, x4, x16 cmp x16, #0 mul x5, x5, x16 csel x22, x22, xzr, ne mul x6, x6, x16 adds x5, x5, x19 mul x24, x7, x16 adcs x6, x6, x20 adcs x24, x24, x21 adc x26, xzr, xzr ldp x8, x9, [c1,#8*0+112] // load |u| (or |v|) asr x14, x17, #63 // |f_|'s sign as mask (or |g_|'s) ldp x10, x11, [c1,#8*2+112] eor x17, x17, x14 // conditionally negate |f_| (or |g_|) ldr x23, [c1,#8*4+112] eor x8, x8, x14 // conditionally negate |u| (or |v|) sub x17, x17, x14 eor x9, x9, x14 adds x8, x8, x14, lsr#63 eor x10, x10, x14 adcs x9, x9, xzr eor x11, x11, x14 adcs x10, x10, xzr eor x23, x23, x14 umulh x19, x8, x17 adcs x11, x11, xzr umulh x20, x9, x17 adcs x23, x23, xzr umulh x21, x10, x17 adc x15, xzr, xzr // used in __smul_512x63_tail mul x8, x8, x17 cmp x17, #0 mul x9, x9, x17 csel x23, x23, xzr, ne mul x10, x10, x17 adds x9, x9, x19 mul x25, x11, x17 adcs x10, x10, x20 adcs x25, x25, x21 adc x26, x26, xzr adds x4, x4, x8 adcs x5, x5, x9 adcs x6, x6, x10 stp x4, x5, [c0,#8*0] adcs x24, x24, x25 stp x6, x24, [c0,#8*2] ret .size __smul_256x63,.-__smul_256x63 .type __smul_512x63_tail, %function .align 5 __smul_512x63_tail: umulh x24, x7, x16 ldr x5, [c1,#8*19] // load rest of |v| adc x26, x26, xzr ldp x6, x7, [c1,#8*20] and x22, x22, x16 umulh x11, x11, x17 // resume |v|*|g1| chain sub x24, x24, x22 // tie up |u|*|f1| chain asr x25, 
x24, #63 eor x5, x5, x14 // conditionally negate rest of |v| eor x6, x6, x14 adds x5, x5, x15 eor x7, x7, x14 adcs x6, x6, xzr umulh x19, x23, x17 adc x7, x7, xzr umulh x20, x5, x17 add x11, x11, x26 umulh x21, x6, x17 mul x4, x23, x17 mul x5, x5, x17 adds x4, x4, x11 mul x6, x6, x17 adcs x5, x5, x19 mul x22, x7, x17 adcs x6, x6, x20 adcs x22, x22, x21 adc x23, xzr, xzr // used in the final step adds x4, x4, x24 adcs x5, x5, x25 adcs x6, x6, x25 stp x4, x5, [c0,#8*4] adcs x22, x22, x25 // carry is used in the final step stp x6, x22, [c0,#8*6] ret .size __smul_512x63_tail,.-__smul_512x63_tail .type __smul_256_n_shift_by_31, %function .align 5 __smul_256_n_shift_by_31: ldp x4, x5, [c1,#8*0+0] // load |a| (or |b|) asr x24, x12, #63 // |f0|'s sign as mask (or |g0|'s) ldp x6, x7, [c1,#8*2+0] eor x25, x12, x24 // conditionally negate |f0| (or |g0|) eor x4, x4, x24 // conditionally negate |a| (or |b|) sub x25, x25, x24 eor x5, x5, x24 adds x4, x4, x24, lsr#63 eor x6, x6, x24 adcs x5, x5, xzr eor x7, x7, x24 umulh x19, x4, x25 adcs x6, x6, xzr umulh x20, x5, x25 adc x7, x7, xzr umulh x21, x6, x25 and x24, x24, x25 umulh x22, x7, x25 neg x24, x24 mul x4, x4, x25 mul x5, x5, x25 mul x6, x6, x25 adds x5, x5, x19 mul x7, x7, x25 adcs x6, x6, x20 adcs x7, x7, x21 adc x22, x22, x24 ldp x8, x9, [c1,#8*0+32] // load |a| (or |b|) asr x24, x13, #63 // |f0|'s sign as mask (or |g0|'s) ldp x10, x11, [c1,#8*2+32] eor x25, x13, x24 // conditionally negate |f0| (or |g0|) eor x8, x8, x24 // conditionally negate |a| (or |b|) sub x25, x25, x24 eor x9, x9, x24 adds x8, x8, x24, lsr#63 eor x10, x10, x24 adcs x9, x9, xzr eor x11, x11, x24 umulh x19, x8, x25 adcs x10, x10, xzr umulh x20, x9, x25 adc x11, x11, xzr umulh x21, x10, x25 and x24, x24, x25 umulh x23, x11, x25 neg x24, x24 mul x8, x8, x25 mul x9, x9, x25 mul x10, x10, x25 adds x9, x9, x19 mul x11, x11, x25 adcs x10, x10, x20 adcs x11, x11, x21 adc x23, x23, x24 adds x4, x4, x8 adcs x5, x5, x9 adcs x6, x6, x10 adcs x7, x7, x11 adc x8, x22, x23 extr x4, x5, x4, #31 extr x5, x6, x5, #31 extr x6, x7, x6, #31 asr x23, x8, #63 // result's sign as mask extr x7, x8, x7, #31 eor x4, x4, x23 // ensure the result is positive eor x5, x5, x23 adds x4, x4, x23, lsr#63 eor x6, x6, x23 adcs x5, x5, xzr eor x7, x7, x23 adcs x6, x6, xzr stp x4, x5, [c0,#8*0] adc x7, x7, xzr stp x6, x7, [c0,#8*2] eor x12, x12, x23 // adjust |f/g| accordingly eor x13, x13, x23 sub x12, x12, x23 sub x13, x13, x23 ret .size __smul_256_n_shift_by_31,.-__smul_256_n_shift_by_31 .type __ab_approximation_31_256, %function .align 4 __ab_approximation_31_256: ldp x6, x7, [c1,#8*2] ldp x10, x11, [c1,#8*6] ldp x4, x5, [c1,#8*0] ldp x8, x9, [c1,#8*4] .Lab_approximation_31_256_loaded: orr x19, x7, x11 // check top-most limbs, ... cmp x19, #0 csel x7, x7, x6, ne csel x11, x11, x10, ne csel x6, x6, x5, ne orr x19, x7, x11 // and ones before top-most, ... csel x10, x10, x9, ne cmp x19, #0 csel x7, x7, x6, ne csel x11, x11, x10, ne csel x6, x6, x4, ne orr x19, x7, x11 // and one more, ... 
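// (added note: the csel cascade around this point walks down from the
// top-most limbs to the highest non-zero pair; the lslv/lsrv/bfxil
// sequence below then left-aligns that pair and splices in the low 31
// bits, producing the one-register approximations of |a| and |b| that
// __inner_loop_31_256 consumes)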
csel x10, x10, x8, ne clz x19, x19 cmp x19, #64 csel x19, x19, xzr, ne csel x7, x7, x6, ne csel x11, x11, x10, ne neg x20, x19 lslv x7, x7, x19 // align high limbs to the left lslv x11, x11, x19 lsrv x6, x6, x20 lsrv x10, x10, x20 and x6, x6, x20, asr#6 and x10, x10, x20, asr#6 orr x7, x7, x6 orr x11, x11, x10 bfxil x7, x4, #0, #31 bfxil x11, x8, #0, #31 b __inner_loop_31_256 ret .size __ab_approximation_31_256,.-__ab_approximation_31_256 .type __inner_loop_31_256, %function .align 4 __inner_loop_31_256: mov x2, #31 mov x13, #0x7FFFFFFF80000000 // |f0|=1, |g0|=0 mov x15, #0x800000007FFFFFFF // |f1|=0, |g1|=1 mov x23,#0x7FFFFFFF7FFFFFFF .Loop_31_256: sbfx x22, x7, #0, #1 // if |a_| is odd, then we'll be subtracting sub x2, x2, #1 and x19, x11, x22 sub x20, x11, x7 // |b_|-|a_| subs x21, x7, x19 // |a_|-|b_| (or |a_|-0 if |a_| was even) mov x19, x15 csel x11, x11, x7, hs // |b_| = |a_| csel x7, x21, x20, hs // borrow means |a_|<|b_|, replace with |b_|-|a_| csel x15, x15, x13, hs // exchange |fg0| and |fg1| csel x13, x13, x19, hs lsr x7, x7, #1 and x19, x15, x22 and x20, x23, x22 sub x13, x13, x19 // |f0|-=|f1| (or |f0-=0| if |a_| was even) add x15, x15, x15 // |f1|<<=1 add x13, x13, x20 sub x15, x15, x23 cbnz x2, .Loop_31_256 mov x23, #0x7FFFFFFF ubfx x12, x13, #0, #32 ubfx x13, x13, #32, #32 ubfx x14, x15, #0, #32 ubfx x15, x15, #32, #32 sub x12, x12, x23 // remove bias sub x13, x13, x23 sub x14, x14, x23 sub x15, x15, x23 ret .size __inner_loop_31_256,.-__inner_loop_31_256 .type __inner_loop_62_256, %function .align 4 __inner_loop_62_256: mov x12, #1 // |f0|=1 mov x13, #0 // |g0|=0 mov x14, #0 // |f1|=0 mov x15, #1 // |g1|=1 .Loop_62_256: sbfx x22, x7, #0, #1 // if |a_| is odd, then we'll be subtracting sub x2, x2, #1 and x19, x11, x22 sub x20, x11, x7 // |b_|-|a_| subs x21, x7, x19 // |a_|-|b_| (or |a_|-0 if |a_| was even) mov x19, x12 csel x11, x11, x7, hs // |b_| = |a_| csel x7, x21, x20, hs // borrow means |a_|<|b_|, replace with |b_|-|a_| mov x20, x13 csel x12, x12, x14, hs // exchange |f0| and |f1| csel x14, x14, x19, hs csel x13, x13, x15, hs // exchange |g0| and |g1| csel x15, x15, x20, hs lsr x7, x7, #1 and x19, x14, x22 and x20, x15, x22 add x14, x14, x14 // |f1|<<=1 add x15, x15, x15 // |g1|<<=1 sub x12, x12, x19 // |f0|-=|f1| (or |f0-=0| if |a_| was even) sub x13, x13, x20 // |g0|-=|g1| (or |g0-=0| ...) cbnz x2, .Loop_62_256 ret .size __inner_loop_62_256,.-__inner_loop_62_256 #if defined(__ARM_FEATURE_BTI_DEFAULT) || defined(__ARM_FEATURE_PAC_DEFAULT) .section .note.GNU-stack,"",@progbits .section .note.gnu.property,"a",@note .long 4,2f-1f,5 .byte 0x47,0x4E,0x55,0 1: .long 0xc0000000,4,3 .align 3 2: #endif ================================================ FILE: build/cheri/ct_inverse_mod_384-armv8.S ================================================ #if defined(__ARM_FEATURE_PAC_DEFAULT) && __ARM_FEATURE_PAC_DEFAULT==2 # define PACI_HINT 27 # define AUTI_HINT 31 #else # define PACI_HINT 25 # define AUTI_HINT 29 #endif .text .globl ct_inverse_mod_384 .hidden ct_inverse_mod_384 .type ct_inverse_mod_384, %function .align 5 ct_inverse_mod_384: hint #PACI_HINT stp c29, c30, [csp,#-16*__SIZEOF_POINTER__]! 
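// Annotation added for orientation; not part of the generated code.
// ct_inverse_mod_384 follows the same divsteps structure as
// ct_inverse_mod_256 above, but on 384-bit operands: batches of 62
// divsteps (__ab_approximation_62/__inner_loop_62) with full-width
// updates via __smul_384_n_shift_by_62 and __smul_384x63, run in a
// 512-byte-aligned scratch frame whose |a|b|u|v| banks are
// flip-flopped by XORing the pointer offset with #256.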
add c29, csp, #0 stp c19, c20, [csp,#2*__SIZEOF_POINTER__] stp c21, c22, [csp,#4*__SIZEOF_POINTER__] stp c23, c24, [csp,#6*__SIZEOF_POINTER__] stp c25, c26, [csp,#8*__SIZEOF_POINTER__] stp c27, c28, [csp,#10*__SIZEOF_POINTER__] sub csp, csp, #1056 ldp x22, x4, [c1,#8*0] ldp x5, x6, [c1,#8*2] ldp x7, x8, [c1,#8*4] #ifdef __CHERI_PURE_CAPABILITY__ add c1,csp,#32+511 alignd c1,c1,#9 scbnds c1,c1,#512 #else add x1, sp, #32+511 // find closest 512-byte-aligned spot and x1, x1, #-512 // in the frame... #endif stp c0, c3, [csp] // offload out_ptr, nx_ptr ldp x9, x10, [c2,#8*0] ldp x11, x12, [c2,#8*2] ldp x13, x14, [c2,#8*4] stp x22, x4, [c1,#8*0] // copy input to |a| stp x5, x6, [c1,#8*2] stp x7, x8, [c1,#8*4] stp x9, x10, [c1,#8*6] // copy modulus to |b| stp x11, x12, [c1,#8*8] stp x13, x14, [c1,#8*10] ////////////////////////////////////////// first iteration mov x2, #62 bl .Lab_approximation_62_loaded eor x0, x1, #256 // pointer to dst |a|b|u|v| #ifdef __CHERI_PURE_CAPABILITY__ scvalue c0,c1,x0 #endif bl __smul_384_n_shift_by_62 str x15,[c0,#8*12] // initialize |u| with |f0| mov x15, x17 // |f1| mov x16, x19 // |g1| add c0,c0,#8*6 bl __smul_384_n_shift_by_62 str x15, [c0,#8*14] // initialize |v| with |f1| ////////////////////////////////////////// second iteration eor x1, x1, #256 // flip-flop src |a|b|u|v| #ifdef __CHERI_PURE_CAPABILITY__ scvalue c1,c0,x1 #endif mov x2, #62 bl __ab_approximation_62 eor x0, x1, #256 // pointer to dst |a|b|u|v| #ifdef __CHERI_PURE_CAPABILITY__ scvalue c0,c1,x0 #endif bl __smul_384_n_shift_by_62 mov x20, x15 // corrected |f0| mov x21, x16 // corrected |g0| mov x15, x17 // |f1| mov x16, x19 // |g1| add c0,c0,#8*6 bl __smul_384_n_shift_by_62 ldr x7, [c1,#8*12] // |u| ldr x8, [c1,#8*20] // |v| mul x3, x20, x7 // |u|*|f0| smulh x4, x20, x7 mul x5, x21, x8 // |v|*|g0| smulh x6, x21, x8 adds x3, x3, x5 adc x4, x4, x6 stp x3, x4, [c0,#8*6] asr x5, x4, #63 // sign extension stp x5, x5, [c0,#8*8] stp x5, x5, [c0,#8*10] mul x3, x15, x7 // |u|*|f1| smulh x4, x15, x7 mul x5, x16, x8 // |v|*|g1| smulh x6, x16, x8 adds x3, x3, x5 adc x4, x4, x6 stp x3, x4, [c0,#8*14] asr x5, x4, #63 // sign extension stp x5, x5, [c0,#8*16] stp x5, x5, [c0,#8*18] eor x1, x1, #256 // flip-flop src |a|b|u|v| #ifdef __CHERI_PURE_CAPABILITY__ scvalue c1,c0,x1 #endif mov x2, #62 bl __ab_approximation_62 eor x0, x1, #256 // pointer to dst |a|b|u|v| #ifdef __CHERI_PURE_CAPABILITY__ scvalue c0,c1,x0 #endif bl __smul_384_n_shift_by_62 mov x20, x15 // corrected |f0| mov x21, x16 // corrected |g0| mov x15, x17 // |f1| mov x16, x19 // |g1| add c0,c0,#8*6 bl __smul_384_n_shift_by_62 add c0,c0,#8*6 bl __smul_384x63 mov x20, x15 // corrected |f1| mov x21, x16 // corrected |g1| add c0,c0,#8*8 bl __smul_384x63 eor x1, x1, #256 // flip-flop src |a|b|u|v| #ifdef __CHERI_PURE_CAPABILITY__ scvalue c1,c0,x1 #endif mov x2, #62 bl __ab_approximation_62 eor x0, x1, #256 // pointer to dst |a|b|u|v| #ifdef __CHERI_PURE_CAPABILITY__ scvalue c0,c1,x0 #endif bl __smul_384_n_shift_by_62 mov x20, x15 // corrected |f0| mov x21, x16 // corrected |g0| mov x15, x17 // |f1| mov x16, x19 // |g1| add c0,c0,#8*6 bl __smul_384_n_shift_by_62 add c0,c0,#8*6 bl __smul_384x63 mov x20, x15 // corrected |f1| mov x21, x16 // corrected |g1| add c0,c0,#8*8 bl __smul_384x63 eor x1, x1, #256 // flip-flop src |a|b|u|v| #ifdef __CHERI_PURE_CAPABILITY__ scvalue c1,c0,x1 #endif mov x2, #62 bl __ab_approximation_62 eor x0, x1, #256 // pointer to dst |a|b|u|v| #ifdef __CHERI_PURE_CAPABILITY__ scvalue c0,c1,x0 #endif bl __smul_384_n_shift_by_62 mov 
x20, x15 // corrected |f0| mov x21, x16 // corrected |g0| mov x15, x17 // |f1| mov x16, x19 // |g1| add c0,c0,#8*6 bl __smul_384_n_shift_by_62 add c0,c0,#8*6 bl __smul_384x63 mov x20, x15 // corrected |f1| mov x21, x16 // corrected |g1| add c0,c0,#8*8 bl __smul_384x63 eor x1, x1, #256 // flip-flop src |a|b|u|v| #ifdef __CHERI_PURE_CAPABILITY__ scvalue c1,c0,x1 #endif mov x2, #62 bl __ab_approximation_62 eor x0, x1, #256 // pointer to dst |a|b|u|v| #ifdef __CHERI_PURE_CAPABILITY__ scvalue c0,c1,x0 #endif bl __smul_384_n_shift_by_62 mov x20, x15 // corrected |f0| mov x21, x16 // corrected |g0| mov x15, x17 // |f1| mov x16, x19 // |g1| add c0,c0,#8*6 bl __smul_384_n_shift_by_62 add c0,c0,#8*6 bl __smul_384x63 asr x27, x27, #63 str x27, [c0,#8*6] mov x20, x15 // corrected |f1| mov x21, x16 // corrected |g1| add c0,c0,#8*8 bl __smul_384x63 asr x27, x27, #63 // sign extension stp x27, x27, [c0,#8*6] stp x27, x27, [c0,#8*8] stp x27, x27, [c0,#8*10] eor x1, x1, #256 // flip-flop src |a|b|u|v| #ifdef __CHERI_PURE_CAPABILITY__ scvalue c1,c0,x1 #endif mov x2, #62 bl __ab_approximation_62 eor x0, x1, #256 // pointer to dst |a|b|u|v| #ifdef __CHERI_PURE_CAPABILITY__ scvalue c0,c1,x0 #endif bl __smul_384_n_shift_by_62 mov x20, x15 // corrected |f0| mov x21, x16 // corrected |g0| mov x15, x17 // |f1| mov x16, x19 // |g1| add c0,c0,#8*6 bl __smul_384_n_shift_by_62 add c0,c0,#8*6 bl __smul_384x63 adc x25, x25, x26 str x25, [c0,#8*6] mov x20, x15 // corrected |f1| mov x21, x16 // corrected |g1| add c0,c0,#8*8 bl __smul_384x63 bl __smul_768x63_tail eor x1, x1, #256 // flip-flop src |a|b|u|v| #ifdef __CHERI_PURE_CAPABILITY__ scvalue c1,c0,x1 #endif mov x2, #62 bl __ab_approximation_62 eor x0, x1, #256 // pointer to dst |a|b|u|v| #ifdef __CHERI_PURE_CAPABILITY__ scvalue c0,c1,x0 #endif bl __smul_384_n_shift_by_62 mov x20, x15 // corrected |f0| mov x21, x16 // corrected |g0| mov x15, x17 // |f1| mov x16, x19 // |g1| add c0,c0,#8*6 bl __smul_384_n_shift_by_62 add c0,c0,#8*6 bl __smul_384x63 adc x25, x25, x26 str x25, [c0,#8*6] mov x20, x15 // corrected |f1| mov x21, x16 // corrected |g1| add c0,c0,#8*8 bl __smul_384x63 bl __smul_768x63_tail eor x1, x1, #256 // flip-flop src |a|b|u|v| #ifdef __CHERI_PURE_CAPABILITY__ scvalue c1,c0,x1 #endif mov x2, #62 bl __ab_approximation_62 eor x0, x1, #256 // pointer to dst |a|b|u|v| #ifdef __CHERI_PURE_CAPABILITY__ scvalue c0,c1,x0 #endif bl __smul_384_n_shift_by_62 mov x20, x15 // corrected |f0| mov x21, x16 // corrected |g0| mov x15, x17 // |f1| mov x16, x19 // |g1| add c0,c0,#8*6 bl __smul_384_n_shift_by_62 add c0,c0,#8*6 bl __smul_384x63 adc x25, x25, x26 str x25, [c0,#8*6] mov x20, x15 // corrected |f1| mov x21, x16 // corrected |g1| add c0,c0,#8*8 bl __smul_384x63 bl __smul_768x63_tail eor x1, x1, #256 // flip-flop src |a|b|u|v| #ifdef __CHERI_PURE_CAPABILITY__ scvalue c1,c0,x1 #endif mov x2, #62 bl __ab_approximation_62 eor x0, x1, #256 // pointer to dst |a|b|u|v| #ifdef __CHERI_PURE_CAPABILITY__ scvalue c0,c1,x0 #endif bl __smul_384_n_shift_by_62 mov x20, x15 // corrected |f0| mov x21, x16 // corrected |g0| mov x15, x17 // |f1| mov x16, x19 // |g1| add c0,c0,#8*6 bl __smul_384_n_shift_by_62 add c0,c0,#8*6 bl __smul_384x63 adc x25, x25, x26 str x25, [c0,#8*6] mov x20, x15 // corrected |f1| mov x21, x16 // corrected |g1| add c0,c0,#8*8 bl __smul_384x63 bl __smul_768x63_tail eor x1, x1, #256 // flip-flop src |a|b|u|v| #ifdef __CHERI_PURE_CAPABILITY__ scvalue c1,c0,x1 #endif mov x2, #62 bl __ab_approximation_62 eor x0, x1, #256 // pointer to dst |a|b|u|v| #ifdef 
__CHERI_PURE_CAPABILITY__ scvalue c0,c1,x0 #endif bl __smul_384_n_shift_by_62 mov x20, x15 // corrected |f0| mov x21, x16 // corrected |g0| mov x15, x17 // |f1| mov x16, x19 // |g1| add c0,c0,#8*6 bl __smul_384_n_shift_by_62 add c0,c0,#8*6 bl __smul_384x63 adc x25, x25, x26 str x25, [c0,#8*6] mov x20, x15 // corrected |f1| mov x21, x16 // corrected |g1| add c0,c0,#8*8 bl __smul_384x63 bl __smul_768x63_tail ////////////////////////////////////////// iteration before last eor x1, x1, #256 // flip-flop src |a|b|u|v| #ifdef __CHERI_PURE_CAPABILITY__ scvalue c1,c0,x1 #endif mov x2, #62 //bl __ab_approximation_62 // |a| and |b| are exact, ldp x3, x8, [c1,#8*0] // just load ldp x9, x14, [c1,#8*6] bl __inner_loop_62 eor x0, x1, #256 // pointer to dst |a|b|u|v| #ifdef __CHERI_PURE_CAPABILITY__ scvalue c0,c1,x0 #endif str x3, [c0,#8*0] str x9, [c0,#8*6] mov x20, x15 // exact |f0| mov x21, x16 // exact |g0| mov x15, x17 mov x16, x19 add c0,c0,#8*12 bl __smul_384x63 adc x25, x25, x26 str x25, [c0,#8*6] mov x20, x15 // exact |f1| mov x21, x16 // exact |g1| add c0,c0,#8*8 bl __smul_384x63 bl __smul_768x63_tail ////////////////////////////////////////// last iteration eor x1, x1, #256 // flip-flop src |a|b|u|v| #ifdef __CHERI_PURE_CAPABILITY__ scvalue c1,c0,x1 #endif mov x2, #24 // 768 % 62 //bl __ab_approximation_62 // |a| and |b| are exact, ldr x3, [c1,#8*0] // just load eor x8, x8, x8 ldr x9, [c1,#8*6] eor x14, x14, x14 bl __inner_loop_62 mov x20, x17 mov x21, x19 ldp c0, c15, [csp] // original out_ptr and n_ptr bl __smul_384x63 bl __smul_768x63_tail ldr c30, [c29,#__SIZEOF_POINTER__] smulh x23, x8, x21 // figure out top-most limb adc x26, x26, x28 ldp x9, x10, [c15,#8*0] // load |mod| add x23, x23, x26 // x23 is 1, 0 or -1 ldp x11, x12, [c15,#8*2] asr x22, x23, #63 // sign as mask ldp x13, x14, [c15,#8*4] and x26, x9, x22 // add mod<<384 conditionally and x27, x10, x22 adds x3, x3, x26 and x28, x11, x22 adcs x4, x4, x27 and x2, x12, x22 adcs x5, x5, x28 and x26, x13, x22 adcs x6, x6, x2 and x27, x14, x22 adcs x7, x7, x26 adcs x8, x25, x27 adc x23, x23, xzr // x23 is 1, 0 or -1 neg x22, x23 orr x23, x23, x22 // excess bit or sign as mask asr x22, x22, #63 // excess bit as mask and x9, x9, x23 // mask |mod| and x10, x10, x23 and x11, x11, x23 and x12, x12, x23 and x13, x13, x23 and x14, x14, x23 eor x9, x9, x22 // conditionally negate |mod| eor x10, x10, x22 adds x9, x9, x22, lsr#63 eor x11, x11, x22 adcs x10, x10, xzr eor x12, x12, x22 adcs x11, x11, xzr eor x13, x13, x22 adcs x12, x12, xzr eor x14, x14, x22 adcs x13, x13, xzr adc x14, x14, xzr adds x3, x3, x9 // final adjustment for |mod|<<384 adcs x4, x4, x10 adcs x5, x5, x11 adcs x6, x6, x12 stp x3, x4, [c0,#8*6] adcs x7, x7, x13 stp x5, x6, [c0,#8*8] adc x8, x8, x14 stp x7, x8, [c0,#8*10] add csp, csp, #1056 ldp c19, c20, [c29,#2*__SIZEOF_POINTER__] ldp c21, c22, [c29,#4*__SIZEOF_POINTER__] ldp c23, c24, [c29,#6*__SIZEOF_POINTER__] ldp c25, c26, [c29,#8*__SIZEOF_POINTER__] ldp c27, c28, [c29,#10*__SIZEOF_POINTER__] ldr c29, [csp],#16*__SIZEOF_POINTER__ hint #AUTI_HINT ret .size ct_inverse_mod_384,.-ct_inverse_mod_384 //////////////////////////////////////////////////////////////////////// // see corresponding commentary in ctx_inverse_mod_384-x86_64... 
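// In outline (cf. that commentary): ct_inverse_mod_384 is a constant-time
// extended-GCD style inversion that retires 62 bits per outer iteration.
// __ab_approximation_62 condenses |a| and |b| to two-limb approximations,
// __inner_loop_62 derives the 2x2 transition factors |f0|,|g0|,|f1|,|g1|,
// and the helpers below apply those factors to the full-width vectors in
// the flip-flopped |a|b|u|v| frame.
//
// __smul_384x63 computes |u|*|f_| + |v|*|g_| with 63-bit signed factors;
// each leg is sign-mask conditional negation followed by a 384x64-bit
// schoolbook product. Roughly, in C-like pseudocode (a sketch, not the
// generated source):
//
//	sign = f >> 63;		// all-ones iff f < 0
//	f = (f ^ sign) - sign;	// |f|
//	u = (u ^ sign) - sign;	// negate u in tandem, preserving u*f
//	acc = u * f;		// then the (v, g_) leg is added on top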
.type __smul_384x63, %function .align 5 __smul_384x63: ldp x3, x4, [c1,#8*0+96] // load |u| (or |v|) asr x17, x20, #63 // |f_|'s sign as mask (or |g_|'s) ldp x5, x6, [c1,#8*2+96] eor x20, x20, x17 // conditionally negate |f_| (or |g_|) ldp x7, x8, [c1,#8*4+96] eor x3, x3, x17 // conditionally negate |u| (or |v|) ldr x25, [c1,#8*6+96] sub x20, x20, x17 eor x4, x4, x17 adds x3, x3, x17, lsr#63 eor x5, x5, x17 adcs x4, x4, xzr eor x6, x6, x17 adcs x5, x5, xzr eor x7, x7, x17 adcs x6, x6, xzr umulh x22, x3, x20 eor x8, x8, x17 umulh x23, x4, x20 adcs x7, x7, xzr umulh x24, x5, x20 eor x25, x25, x17 mul x3, x3, x20 adcs x8, x8, xzr mul x4, x4, x20 adcs x25, x25, xzr cmp x20, #0 mul x5, x5, x20 csel x25, x25, xzr, ne adds x4, x4, x22 umulh x22, x6, x20 adcs x5, x5, x23 umulh x23, x7, x20 mul x6, x6, x20 mul x7, x7, x20 adcs x6, x6, x24 mul x27,x8, x20 adcs x7, x7, x22 adcs x27,x27,x23 adc x2, xzr, xzr ldp x9, x10, [c1,#8*0+160] // load |u| (or |v|) asr x17, x21, #63 // |f_|'s sign as mask (or |g_|'s) ldp x11, x12, [c1,#8*2+160] eor x21, x21, x17 // conditionally negate |f_| (or |g_|) ldp x13, x14, [c1,#8*4+160] eor x9, x9, x17 // conditionally negate |u| (or |v|) ldr x26, [c1,#8*6+160] sub x21, x21, x17 eor x10, x10, x17 adds x9, x9, x17, lsr#63 eor x11, x11, x17 adcs x10, x10, xzr eor x12, x12, x17 adcs x11, x11, xzr eor x13, x13, x17 adcs x12, x12, xzr umulh x22, x9, x21 eor x14, x14, x17 umulh x23, x10, x21 adcs x13, x13, xzr umulh x24, x11, x21 eor x26, x26, x17 mul x9, x9, x21 adcs x14, x14, xzr mul x10, x10, x21 adcs x26, x26, xzr adc x19, xzr, xzr // used in __smul_768x63_tail cmp x21, #0 mul x11, x11, x21 csel x26, x26, xzr, ne adds x10, x10, x22 umulh x22, x12, x21 adcs x11, x11, x23 umulh x23, x13, x21 mul x12, x12, x21 mul x13, x13, x21 adcs x12, x12, x24 mul x28,x14, x21 adcs x13, x13, x22 adcs x28,x28,x23 adc x2, x2, xzr adds x3, x3, x9 adcs x4, x4, x10 adcs x5, x5, x11 adcs x6, x6, x12 stp x3, x4, [c0,#8*0] adcs x7, x7, x13 stp x5, x6, [c0,#8*2] adcs x27, x27, x28 stp x7, x27, [c0,#8*4] ret .size __smul_384x63,.-__smul_384x63 .type __smul_768x63_tail, %function .align 5 __smul_768x63_tail: umulh x27, x8, x20 ldr x4, [c1,#8*27]// load rest of |v| adc x2, x2, xzr ldp x5, x6, [c1,#8*28] and x25, x25, x20 ldp x7, x8, [c1,#8*30] sub x27, x27, x25 // tie up |u|*|f1| chain umulh x14, x14, x21 // resume |v|*|g1| chain eor x4, x4, x17 // conditionally negate rest of |v| eor x5, x5, x17 eor x6, x6, x17 adds x4, x4, x19 eor x7, x7, x17 adcs x5, x5, xzr eor x8, x8, x17 adcs x6, x6, xzr umulh x22, x26, x21 adcs x7, x7, xzr umulh x23, x4, x21 adc x8, x8, xzr umulh x24, x5, x21 add x14, x14, x2 umulh x25, x6, x21 asr x28, x27, #63 umulh x2, x7, x21 mul x3, x26, x21 mul x4, x4, x21 mul x5, x5, x21 adds x3, x3, x14 mul x6, x6, x21 adcs x4, x4, x22 mul x7, x7, x21 adcs x5, x5, x23 mul x22, x8, x21 adcs x6, x6, x24 adcs x7, x7, x25 adcs x25, x22, x2 adc x26, xzr, xzr // used in the final step adds x3, x3, x27 adcs x4, x4, x28 adcs x5, x5, x28 adcs x6, x6, x28 stp x3, x4, [c0,#8*6] adcs x7, x7, x28 stp x5, x6, [c0,#8*8] adcs x25, x25, x28 // carry is used in the final step stp x7, x25, [c0,#8*10] ret .size __smul_768x63_tail,.-__smul_768x63_tail .type __smul_384_n_shift_by_62, %function .align 5 __smul_384_n_shift_by_62: ldp x3, x4, [c1,#8*0+0] // load |a| (or |b|) asr x28, x15, #63 // |f0|'s sign as mask (or |g0|'s) ldp x5, x6, [c1,#8*2+0] eor x2, x15, x28 // conditionally negate |f0| (or |g0|) ldp x7, x8, [c1,#8*4+0] eor x3, x3, x28 // conditionally negate |a| (or |b|) sub x2, x2, x28 eor x4, x4, 
x28 adds x3, x3, x28, lsr#63 eor x5, x5, x28 adcs x4, x4, xzr eor x6, x6, x28 adcs x5, x5, xzr eor x7, x7, x28 umulh x22, x3, x2 adcs x6, x6, xzr umulh x23, x4, x2 eor x8, x8, x28 mul x3, x3, x2 adcs x7, x7, xzr mul x4, x4, x2 adc x8, x8, xzr umulh x24, x5, x2 and x28, x28, x2 umulh x25, x6, x2 adds x4, x4, x22 mul x5, x5, x2 umulh x22, x7, x2 neg x28, x28 mul x6, x6, x2 adcs x5, x5, x23 umulh x23, x8, x2 mul x7, x7, x2 adcs x6, x6, x24 mul x8, x8, x2 adcs x7, x7, x25 adcs x8, x8, x22 adc x27, x23, x28 ldp x9, x10, [c1,#8*0+48] // load |a| (or |b|) asr x28, x16, #63 // |f0|'s sign as mask (or |g0|'s) ldp x11, x12, [c1,#8*2+48] eor x2, x16, x28 // conditionally negate |f0| (or |g0|) ldp x13, x14, [c1,#8*4+48] eor x9, x9, x28 // conditionally negate |a| (or |b|) sub x2, x2, x28 eor x10, x10, x28 adds x9, x9, x28, lsr#63 eor x11, x11, x28 adcs x10, x10, xzr eor x12, x12, x28 adcs x11, x11, xzr eor x13, x13, x28 umulh x22, x9, x2 adcs x12, x12, xzr umulh x23, x10, x2 eor x14, x14, x28 mul x9, x9, x2 adcs x13, x13, xzr mul x10, x10, x2 adc x14, x14, xzr umulh x24, x11, x2 and x28, x28, x2 umulh x25, x12, x2 adds x10, x10, x22 mul x11, x11, x2 umulh x22, x13, x2 neg x28, x28 mul x12, x12, x2 adcs x11, x11, x23 umulh x23, x14, x2 mul x13, x13, x2 adcs x12, x12, x24 mul x14, x14, x2 adcs x13, x13, x25 adcs x14, x14, x22 adc x28, x23, x28 adds x3, x3, x9 adcs x4, x4, x10 adcs x5, x5, x11 adcs x6, x6, x12 adcs x7, x7, x13 adcs x8, x8, x14 adc x9, x27, x28 extr x3, x4, x3, #62 extr x4, x5, x4, #62 extr x5, x6, x5, #62 asr x28, x9, #63 extr x6, x7, x6, #62 extr x7, x8, x7, #62 extr x8, x9, x8, #62 eor x3, x3, x28 eor x4, x4, x28 adds x3, x3, x28, lsr#63 eor x5, x5, x28 adcs x4, x4, xzr eor x6, x6, x28 adcs x5, x5, xzr eor x7, x7, x28 adcs x6, x6, xzr eor x8, x8, x28 stp x3, x4, [c0,#8*0] adcs x7, x7, xzr stp x5, x6, [c0,#8*2] adc x8, x8, xzr stp x7, x8, [c0,#8*4] eor x15, x15, x28 eor x16, x16, x28 sub x15, x15, x28 sub x16, x16, x28 ret .size __smul_384_n_shift_by_62,.-__smul_384_n_shift_by_62 .type __ab_approximation_62, %function .align 4 __ab_approximation_62: ldp x7, x8, [c1,#8*4] ldp x13, x14, [c1,#8*10] ldp x5, x6, [c1,#8*2] ldp x11, x12, [c1,#8*8] .Lab_approximation_62_loaded: orr x22, x8, x14 // check top-most limbs, ... cmp x22, #0 csel x8, x8, x7, ne csel x14, x14, x13, ne csel x7, x7, x6, ne orr x22, x8, x14 // ... ones before top-most, ... csel x13, x13, x12, ne ldp x3, x4, [c1,#8*0] ldp x9, x10, [c1,#8*6] cmp x22, #0 csel x8, x8, x7, ne csel x14, x14, x13, ne csel x7, x7, x5, ne orr x22, x8, x14 // ... and ones before that ... 
csel x13, x13, x11, ne cmp x22, #0 csel x8, x8, x7, ne csel x14, x14, x13, ne csel x7, x7, x4, ne orr x22, x8, x14 csel x13, x13, x10, ne clz x22, x22 cmp x22, #64 csel x22, x22, xzr, ne csel x8, x8, x7, ne csel x14, x14, x13, ne neg x23, x22 lslv x8, x8, x22 // align high limbs to the left lslv x14, x14, x22 lsrv x7, x7, x23 lsrv x13, x13, x23 and x7, x7, x23, asr#6 and x13, x13, x23, asr#6 orr x8, x8, x7 orr x14, x14, x13 b __inner_loop_62 ret .size __ab_approximation_62,.-__ab_approximation_62 .type __inner_loop_62, %function .align 4 __inner_loop_62: mov x15, #1 // |f0|=1 mov x16, #0 // |g0|=0 mov x17, #0 // |f1|=0 mov x19, #1 // |g1|=1 .Loop_62: sbfx x28, x3, #0, #1 // if |a_| is odd, then we'll be subtracting sub x2, x2, #1 subs x24, x9, x3 // |b_|-|a_| and x22, x9, x28 sbc x25, x14, x8 and x23, x14, x28 subs x26, x3, x22 // |a_|-|b_| (or |a_|-0 if |a_| was even) mov x22, x15 sbcs x27, x8, x23 mov x23, x16 csel x9, x9, x3, hs // |b_| = |a_| csel x14, x14, x8, hs csel x3, x26, x24, hs // borrow means |a_|<|b_|, replace with |b_|-|a_| csel x8, x27, x25, hs csel x15, x15, x17, hs // exchange |f0| and |f1| csel x17, x17, x22, hs csel x16, x16, x19, hs // exchange |g0| and |g1| csel x19, x19, x23, hs extr x3, x8, x3, #1 lsr x8, x8, #1 and x22, x17, x28 and x23, x19, x28 add x17, x17, x17 // |f1|<<=1 add x19, x19, x19 // |g1|<<=1 sub x15, x15, x22 // |f0|-=|f1| (or |f0-=0| if |a_| was even) sub x16, x16, x23 // |g0|-=|g1| (or |g0-=0| ...) cbnz x2, .Loop_62 ret .size __inner_loop_62,.-__inner_loop_62 #if defined(__ARM_FEATURE_BTI_DEFAULT) || defined(__ARM_FEATURE_PAC_DEFAULT) .section .note.GNU-stack,"",@progbits .section .note.gnu.property,"a",@note .long 4,2f-1f,5 .byte 0x47,0x4E,0x55,0 1: .long 0xc0000000,4,3 .align 3 2: #endif ================================================ FILE: build/cheri/ct_is_square_mod_384-armv8.S ================================================ #if defined(__ARM_FEATURE_PAC_DEFAULT) && __ARM_FEATURE_PAC_DEFAULT==2 # define PACI_HINT 27 # define AUTI_HINT 31 #else # define PACI_HINT 25 # define AUTI_HINT 29 #endif .text .globl ct_is_square_mod_384 .hidden ct_is_square_mod_384 .type ct_is_square_mod_384, %function .align 5 ct_is_square_mod_384: hint #PACI_HINT stp c29, c30, [csp,#-16*__SIZEOF_POINTER__]! add c29, csp, #0 stp c19, c20, [csp,#2*__SIZEOF_POINTER__] stp c21, c22, [csp,#4*__SIZEOF_POINTER__] stp c23, c24, [csp,#6*__SIZEOF_POINTER__] stp c25, c26, [csp,#8*__SIZEOF_POINTER__] stp c27, c28, [csp,#10*__SIZEOF_POINTER__] sub csp, csp, #512 ldp x3, x4, [c0,#8*0] // load input ldp x5, x6, [c0,#8*2] ldp x7, x8, [c0,#8*4] add x0, sp, #255 // find closest 256-byte-aligned spot and x0, x0, #-256 // in the frame... 
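// The 256-byte alignment established above is what makes the buffer
// flip-flops below cheap: the source and destination |a|b| pairs live in
// the two 128-byte halves of the aligned scratch, so toggling address
// bit 7 with "eor ..., #128" swaps them. Under __CHERI_PURE_CAPABILITY__
// the toggled integer address is re-attached to the frame capability
// with scvalue.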
#ifdef __CHERI_PURE_CAPABILITY__ scvalue c0,csp,x0 #endif ldp x9, x10, [c1,#8*0] // load modulus ldp x11, x12, [c1,#8*2] ldp x13, x14, [c1,#8*4] stp x3, x4, [c0,#8*6] // copy input to |a| stp x5, x6, [c0,#8*8] stp x7, x8, [c0,#8*10] stp x9, x10, [c0,#8*0] // copy modulus to |b| stp x11, x12, [c0,#8*2] stp x13, x14, [c0,#8*4] eor x2, x2, x2 // init the .Legendre symbol mov x15, #24 // 24 is 768/30-1 b .Loop_is_square .align 4 .Loop_is_square: bl __ab_approximation_30 sub x15, x15, #1 eor x1, x0, #128 // pointer to dst |b| #ifdef __CHERI_PURE_CAPABILITY__ scvalue c1,csp,x1 #endif bl __smul_384_n_shift_by_30 mov x19, x16 // |f0| mov x20, x17 // |g0| add c1,c1,#8*6 bl __smul_384_n_shift_by_30 ldp x9, x10, [c1,#-8*6] eor x0, x0, #128 // flip-flop src |a|b| #ifdef __CHERI_PURE_CAPABILITY__ scvalue c0,csp,x0 #endif and x27, x27, x9 // if |a| was negative, add x2, x2, x27, lsr#1 // adjust |L| cbnz x15, .Loop_is_square ////////////////////////////////////////// last iteration //bl __ab_approximation_30 // |a| and |b| are exact, //ldr x8, [x0,#8*6] // and loaded //ldr x14, [x0,#8*0] mov x15, #48 // 48 is 768%30 + 30 bl __inner_loop_48 ldr c30, [c29,#__SIZEOF_POINTER__] and x0, x2, #1 eor x0, x0, #1 add csp, csp, #512 ldp c19, c20, [c29,#2*__SIZEOF_POINTER__] ldp c21, c22, [c29,#4*__SIZEOF_POINTER__] ldp c23, c24, [c29,#6*__SIZEOF_POINTER__] ldp c25, c26, [c29,#8*__SIZEOF_POINTER__] ldp c27, c28, [c29,#10*__SIZEOF_POINTER__] ldr c29, [csp],#16*__SIZEOF_POINTER__ hint #AUTI_HINT ret .size ct_is_square_mod_384,.-ct_is_square_mod_384 .type __smul_384_n_shift_by_30, %function .align 5 __smul_384_n_shift_by_30: ldp x3, x4, [c0,#8*0+0] // load |b| (or |a|) asr x27, x20, #63 // |g1|'s sign as mask (or |f1|'s) ldp x5, x6, [c0,#8*2+0] eor x20, x20, x27 // conditionally negate |g1| (or |f1|) ldp x7, x8, [c0,#8*4+0] eor x3, x3, x27 // conditionally negate |b| (or |a|) sub x20, x20, x27 eor x4, x4, x27 adds x3, x3, x27, lsr#63 eor x5, x5, x27 adcs x4, x4, xzr eor x6, x6, x27 adcs x5, x5, xzr eor x7, x7, x27 umulh x21, x3, x20 adcs x6, x6, xzr umulh x22, x4, x20 eor x8, x8, x27 umulh x23, x5, x20 adcs x7, x7, xzr umulh x24, x6, x20 adc x8, x8, xzr umulh x25, x7, x20 and x28, x20, x27 umulh x26, x8, x20 neg x28, x28 mul x3, x3, x20 mul x4, x4, x20 mul x5, x5, x20 adds x4, x4, x21 mul x6, x6, x20 adcs x5, x5, x22 mul x7, x7, x20 adcs x6, x6, x23 mul x8, x8, x20 adcs x7, x7, x24 adcs x8, x8 ,x25 adc x26, x26, x28 ldp x9, x10, [c0,#8*0+48] // load |b| (or |a|) asr x27, x19, #63 // |g1|'s sign as mask (or |f1|'s) ldp x11, x12, [c0,#8*2+48] eor x19, x19, x27 // conditionally negate |g1| (or |f1|) ldp x13, x14, [c0,#8*4+48] eor x9, x9, x27 // conditionally negate |b| (or |a|) sub x19, x19, x27 eor x10, x10, x27 adds x9, x9, x27, lsr#63 eor x11, x11, x27 adcs x10, x10, xzr eor x12, x12, x27 adcs x11, x11, xzr eor x13, x13, x27 umulh x21, x9, x19 adcs x12, x12, xzr umulh x22, x10, x19 eor x14, x14, x27 umulh x23, x11, x19 adcs x13, x13, xzr umulh x24, x12, x19 adc x14, x14, xzr umulh x25, x13, x19 and x28, x19, x27 umulh x27, x14, x19 neg x28, x28 mul x9, x9, x19 mul x10, x10, x19 mul x11, x11, x19 adds x10, x10, x21 mul x12, x12, x19 adcs x11, x11, x22 mul x13, x13, x19 adcs x12, x12, x23 mul x14, x14, x19 adcs x13, x13, x24 adcs x14, x14 ,x25 adc x27, x27, x28 adds x3, x3, x9 adcs x4, x4, x10 adcs x5, x5, x11 adcs x6, x6, x12 adcs x7, x7, x13 adcs x8, x8, x14 adc x9, x26, x27 extr x3, x4, x3, #30 extr x4, x5, x4, #30 extr x5, x6, x5, #30 asr x27, x9, #63 extr x6, x7, x6, #30 extr x7, x8, x7, #30 extr x8, x9, x8, #30 eor 
x3, x3, x27 eor x4, x4, x27 adds x3, x3, x27, lsr#63 eor x5, x5, x27 adcs x4, x4, xzr eor x6, x6, x27 adcs x5, x5, xzr eor x7, x7, x27 adcs x6, x6, xzr eor x8, x8, x27 stp x3, x4, [c1,#8*0] adcs x7, x7, xzr stp x5, x6, [c1,#8*2] adc x8, x8, xzr stp x7, x8, [c1,#8*4] ret .size __smul_384_n_shift_by_30,.-__smul_384_n_shift_by_30 .type __ab_approximation_30, %function .align 4 __ab_approximation_30: ldp x13, x14, [c0,#8*4] // |a| is still in registers ldp x11, x12, [c0,#8*2] orr x21, x8, x14 // check top-most limbs, ... cmp x21, #0 csel x8, x8, x7, ne csel x14, x14, x13, ne csel x7, x7, x6, ne orr x21, x8, x14 // ... ones before top-most, ... csel x13, x13, x12, ne cmp x21, #0 csel x8, x8, x7, ne csel x14, x14, x13, ne csel x7, x7, x5, ne orr x21, x8, x14 // ... and ones before that ... csel x13, x13, x11, ne cmp x21, #0 csel x8, x8, x7, ne csel x14, x14, x13, ne csel x7, x7, x4, ne orr x21, x8, x14 // and one more, ... csel x13, x13, x10, ne cmp x21, #0 csel x8, x8, x7, ne csel x14, x14, x13, ne csel x7, x7, x3, ne orr x21, x8, x14 csel x13, x13, x9, ne clz x21, x21 cmp x21, #64 csel x21, x21, xzr, ne csel x8, x8, x7, ne csel x14, x14, x13, ne neg x22, x21 lslv x8, x8, x21 // align high limbs to the left lslv x14, x14, x21 lsrv x7, x7, x22 lsrv x13, x13, x22 and x7, x7, x22, asr#6 and x13, x13, x22, asr#6 orr x8, x8, x7 orr x14, x14, x13 bfxil x8, x3, #0, #32 bfxil x14, x9, #0, #32 b __inner_loop_30 ret .size __ab_approximation_30,.-__ab_approximation_30 .type __inner_loop_30, %function .align 4 __inner_loop_30: mov x28, #30 mov x17, #0x7FFFFFFF80000000 // |f0|=1, |g0|=0 mov x20, #0x800000007FFFFFFF // |f1|=0, |g1|=1 mov x27,#0x7FFFFFFF7FFFFFFF .Loop_30: sbfx x24, x8, #0, #1 // if |a_| is odd, then we'll be subtracting and x25, x8, x14 sub x28, x28, #1 and x21, x14, x24 sub x22, x14, x8 // |b_|-|a_| subs x23, x8, x21 // |a_|-|b_| (or |a_|-0 if |a_| was even) add x25, x2, x25, lsr#1 // L + (a_ & b_) >> 1 mov x21, x20 csel x14, x14, x8, hs // |b_| = |a_| csel x8, x23, x22, hs // borrow means |a_|<|b_|, replace with |b_|-|a_| csel x20, x20, x17, hs // exchange |fg0| and |fg1| csel x17, x17, x21, hs csel x2, x2, x25, hs lsr x8, x8, #1 and x21, x20, x24 and x22, x27, x24 add x23, x14, #2 sub x17, x17, x21 // |f0|-=|f1| (or |f0-=0| if |a_| was even) add x20, x20, x20 // |f1|<<=1 add x2, x2, x23, lsr#2 // "negate" |L| if |b|%8 is 3 or 5 add x17, x17, x22 sub x20, x20, x27 cbnz x28, .Loop_30 mov x27, #0x7FFFFFFF ubfx x16, x17, #0, #32 ubfx x17, x17, #32, #32 ubfx x19, x20, #0, #32 ubfx x20, x20, #32, #32 sub x16, x16, x27 // remove the bias sub x17, x17, x27 sub x19, x19, x27 sub x20, x20, x27 ret .size __inner_loop_30,.-__inner_loop_30 .type __inner_loop_48, %function .align 4 __inner_loop_48: .Loop_48: sbfx x24, x3, #0, #1 // if |a_| is odd, then we'll be subtracting and x25, x3, x9 sub x15, x15, #1 and x21, x9, x24 sub x22, x9, x3 // |b_|-|a_| subs x23, x3, x21 // |a_|-|b_| (or |a_|-0 if |a_| was even) add x25, x2, x25, lsr#1 csel x9, x9, x3, hs // |b_| = |a_| csel x3, x23, x22, hs // borrow means |a_|<|b_|, replace with |b_|-|a_| csel x2, x2, x25, hs add x23, x9, #2 lsr x3, x3, #1 add x2, x2, x23, lsr#2 // "negate" |L| if |b|%8 is 3 or 5 cbnz x15, .Loop_48 ret .size __inner_loop_48,.-__inner_loop_48 #if defined(__ARM_FEATURE_BTI_DEFAULT) || defined(__ARM_FEATURE_PAC_DEFAULT) .section .note.GNU-stack,"",@progbits .section .note.gnu.property,"a",@note .long 4,2f-1f,5 .byte 0x47,0x4E,0x55,0 1: .long 0xc0000000,4,3 .align 3 2: #endif ================================================ FILE: 
build/cheri/div3w-armv8.S
================================================
#if defined(__ARM_FEATURE_PAC_DEFAULT) && __ARM_FEATURE_PAC_DEFAULT==2
# define PACI_HINT 27
# define AUTI_HINT 31
#else
# define PACI_HINT 25
# define AUTI_HINT 29
#endif
.text

.globl div_3_limbs
.hidden div_3_limbs
.type div_3_limbs,%function
.align 5
div_3_limbs:
	hint	#34
	ldp	x4,x5,[c0]	// load R
	eor	x0,x0,x0	// Q = 0
	mov	x3,#64		// loop counter
	nop
.Loop:
	subs	x6,x4,x1	// R - D
	add	x0,x0,x0	// Q <<= 1
	sbcs	x7,x5,x2
	add	x0,x0,#1	// Q + speculative bit
	csel	x4,x4,x6,lo	// select between R and R - D
	extr	x1,x2,x1,#1	// D >>= 1
	csel	x5,x5,x7,lo
	lsr	x2,x2,#1
	sbc	x0,x0,xzr	// subtract speculative bit
	sub	x3,x3,#1
	cbnz	x3,.Loop

	asr	x3,x0,#63	// top bit -> mask
	add	x0,x0,x0	// Q <<= 1
	subs	x6,x4,x1	// R - D
	add	x0,x0,#1	// Q + speculative bit
	sbcs	x7,x5,x2
	sbc	x0,x0,xzr	// subtract speculative bit
	orr	x0,x0,x3	// all ones if overflow

	ret
.size div_3_limbs,.-div_3_limbs

.globl quot_rem_128
.hidden quot_rem_128
.type quot_rem_128,%function
.align 5
quot_rem_128:
	hint	#34
	ldp	x3,x4,[c1]

	mul	x5,x3,x2	// divisor[0:1] * quotient
	umulh	x6,x3,x2
	mul	x11, x4,x2
	umulh	x7,x4,x2

	ldp	x8,x9,[c0]	// load 3 limbs of the dividend
	ldr	x10,[c0,#16]

	adds	x6,x6,x11
	adc	x7,x7,xzr

	subs	x8,x8,x5	// dividend - divisor * quotient
	sbcs	x9,x9,x6
	sbcs	x10,x10,x7
	sbc	x5,xzr,xzr	// borrow -> mask

	add	x2,x2,x5	// if borrowed, adjust the quotient ...
	and	x3,x3,x5
	and	x4,x4,x5
	adds	x8,x8,x3	// ... and add divisor
	adc	x9,x9,x4

	stp	x8,x9,[c0]	// save 2 limbs of the remainder
	str	x2,[c0,#16]	// and one limb of the quotient

	mov	x0,x2		// return adjusted quotient
	ret
.size quot_rem_128,.-quot_rem_128

.globl quot_rem_64
.hidden quot_rem_64
.type quot_rem_64,%function
.align 5
quot_rem_64:
	hint	#34
	ldr	x3,[c1]
	ldr	x8,[c0]		// load 1 limb of the dividend

	mul	x5,x3,x2	// divisor * quotient

	sub	x8,x8,x5	// dividend - divisor * quotient

	stp	x8,x2,[c0]	// save remainder and quotient

	mov	x0,x2		// return quotient
	ret
.size quot_rem_64,.-quot_rem_64
#if defined(__ARM_FEATURE_BTI_DEFAULT) || defined(__ARM_FEATURE_PAC_DEFAULT)
.section .note.GNU-stack,"",@progbits
.section .note.gnu.property,"a",@note
.long 4,2f-1f,5
.byte 0x47,0x4E,0x55,0
1:
.long 0xc0000000,4,3
.align 3
2:
#endif

================================================
FILE: build/cheri/mul_mont_256-armv8.S
================================================
#if defined(__ARM_FEATURE_PAC_DEFAULT) && __ARM_FEATURE_PAC_DEFAULT==2
# define PACI_HINT 27
# define AUTI_HINT 31
#else
# define PACI_HINT 25
# define AUTI_HINT 29
#endif
.text

.globl mul_mont_sparse_256
.hidden mul_mont_sparse_256
.type mul_mont_sparse_256,%function
.align 5
mul_mont_sparse_256:
	hint	#34
	stp	c29,c30,[csp,#-8*__SIZEOF_POINTER__]!
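// mul_mont_sparse_256(ret, a, b, p, n0) is word-by-word (CIOS-style)
// Montgomery multiplication; "sparse" presumably refers to the modulus
// having headroom in its top limb (as the 255-bit BLS12-381 group order
// does), which spares some carry handling. x4 carries the Montgomery
// factor n0, chosen so that m = acc[0]*n0 makes acc + m*p divisible by
// 2^64. One round, roughly (an illustrative sketch only):
//
//	acc += b[i] * a;	// widening multiply-accumulate
//	m    = acc[0] * n0;	// reduction factor ("mul x3,x4,x19")
//	acc += m * p;		// forces acc[0] to zero
//	acc >>= 64;		// retire the zero limb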
add c29,csp,#0 stp c19,c20,[csp,#2*__SIZEOF_POINTER__] stp c21,c22,[csp,#4*__SIZEOF_POINTER__] stp c23,c24,[csp,#6*__SIZEOF_POINTER__] ldp x10,x11,[c1] ldr x9, [c2] ldp x12,x13,[c1,#16] mul x19,x10,x9 ldp x5,x6,[c3] mul x20,x11,x9 ldp x7,x8,[c3,#16] mul x21,x12,x9 mul x22,x13,x9 umulh x14,x10,x9 umulh x15,x11,x9 mul x3,x4,x19 umulh x16,x12,x9 umulh x17,x13,x9 adds x20,x20,x14 //mul x14,x5,x3 adcs x21,x21,x15 mul x15,x6,x3 adcs x22,x22,x16 mul x16,x7,x3 adc x23,xzr, x17 mul x17,x8,x3 ldr x9,[c2,8*1] subs xzr,x19,#1 //adds x19,x19,x14 umulh x14,x5,x3 adcs x20,x20,x15 umulh x15,x6,x3 adcs x21,x21,x16 umulh x16,x7,x3 adcs x22,x22,x17 umulh x17,x8,x3 adc x23,x23,xzr adds x19,x20,x14 mul x14,x10,x9 adcs x20,x21,x15 mul x15,x11,x9 adcs x21,x22,x16 mul x16,x12,x9 adcs x22,x23,x17 mul x17,x13,x9 adc x23,xzr,xzr adds x19,x19,x14 umulh x14,x10,x9 adcs x20,x20,x15 umulh x15,x11,x9 adcs x21,x21,x16 mul x3,x4,x19 umulh x16,x12,x9 adcs x22,x22,x17 umulh x17,x13,x9 adc x23,x23,xzr adds x20,x20,x14 //mul x14,x5,x3 adcs x21,x21,x15 mul x15,x6,x3 adcs x22,x22,x16 mul x16,x7,x3 adc x23,x23,x17 mul x17,x8,x3 ldr x9,[c2,8*2] subs xzr,x19,#1 //adds x19,x19,x14 umulh x14,x5,x3 adcs x20,x20,x15 umulh x15,x6,x3 adcs x21,x21,x16 umulh x16,x7,x3 adcs x22,x22,x17 umulh x17,x8,x3 adc x23,x23,xzr adds x19,x20,x14 mul x14,x10,x9 adcs x20,x21,x15 mul x15,x11,x9 adcs x21,x22,x16 mul x16,x12,x9 adcs x22,x23,x17 mul x17,x13,x9 adc x23,xzr,xzr adds x19,x19,x14 umulh x14,x10,x9 adcs x20,x20,x15 umulh x15,x11,x9 adcs x21,x21,x16 mul x3,x4,x19 umulh x16,x12,x9 adcs x22,x22,x17 umulh x17,x13,x9 adc x23,x23,xzr adds x20,x20,x14 //mul x14,x5,x3 adcs x21,x21,x15 mul x15,x6,x3 adcs x22,x22,x16 mul x16,x7,x3 adc x23,x23,x17 mul x17,x8,x3 ldr x9,[c2,8*3] subs xzr,x19,#1 //adds x19,x19,x14 umulh x14,x5,x3 adcs x20,x20,x15 umulh x15,x6,x3 adcs x21,x21,x16 umulh x16,x7,x3 adcs x22,x22,x17 umulh x17,x8,x3 adc x23,x23,xzr adds x19,x20,x14 mul x14,x10,x9 adcs x20,x21,x15 mul x15,x11,x9 adcs x21,x22,x16 mul x16,x12,x9 adcs x22,x23,x17 mul x17,x13,x9 adc x23,xzr,xzr adds x19,x19,x14 umulh x14,x10,x9 adcs x20,x20,x15 umulh x15,x11,x9 adcs x21,x21,x16 mul x3,x4,x19 umulh x16,x12,x9 adcs x22,x22,x17 umulh x17,x13,x9 adc x23,x23,xzr adds x20,x20,x14 //mul x14,x5,x3 adcs x21,x21,x15 mul x15,x6,x3 adcs x22,x22,x16 mul x16,x7,x3 adc x23,x23,x17 mul x17,x8,x3 subs xzr,x19,#1 //adds x19,x19,x14 umulh x14,x5,x3 adcs x20,x20,x15 umulh x15,x6,x3 adcs x21,x21,x16 umulh x16,x7,x3 adcs x22,x22,x17 umulh x17,x8,x3 adc x23,x23,xzr adds x19,x20,x14 adcs x20,x21,x15 adcs x21,x22,x16 adcs x22,x23,x17 adc x23,xzr,xzr subs x14,x19,x5 sbcs x15,x20,x6 sbcs x16,x21,x7 sbcs x17,x22,x8 sbcs xzr, x23,xzr csel x19,x19,x14,lo csel x20,x20,x15,lo csel x21,x21,x16,lo csel x22,x22,x17,lo stp x19,x20,[c0] stp x21,x22,[c0,#16] ldp c19,c20,[c29,#2*__SIZEOF_POINTER__] ldp c21,c22,[c29,#4*__SIZEOF_POINTER__] ldp c23,c24,[c29,#6*__SIZEOF_POINTER__] ldr c29,[csp],#8*__SIZEOF_POINTER__ ret .size mul_mont_sparse_256,.-mul_mont_sparse_256 .globl sqr_mont_sparse_256 .hidden sqr_mont_sparse_256 .type sqr_mont_sparse_256,%function .align 5 sqr_mont_sparse_256: hint #PACI_HINT stp c29,c30,[csp,#-6*__SIZEOF_POINTER__]! 
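// sqr_mont_sparse_256 squares first and Montgomery-reduces afterwards via
// __mul_by_1_mont_256. The squaring computes each cross product a[i]*a[j]
// (i < j) once, doubles the accumulated total, then adds the diagonal
// a[i]*a[i] terms; the diagram below depicts exactly this shape.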
add c29,csp,#0 stp c19,c20,[csp,#2*__SIZEOF_POINTER__] stp c21,c22,[csp,#4*__SIZEOF_POINTER__] ldp x5,x6,[c1] ldp x7,x8,[c1,#16] mov x4,x3 //////////////////////////////////////////////////////////////// // | | | | | |a1*a0| | // | | | | |a2*a0| | | // | |a3*a2|a3*a0| | | | // | | | |a2*a1| | | | // | | |a3*a1| | | | | // *| | | | | | | | 2| // +|a3*a3|a2*a2|a1*a1|a0*a0| // |--+--+--+--+--+--+--+--| // |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax is x10 // // "can't overflow" below mark carrying into high part of // multiplication result, which can't overflow, because it // can never be all ones. mul x11,x6,x5 // a[1]*a[0] umulh x15,x6,x5 mul x12,x7,x5 // a[2]*a[0] umulh x16,x7,x5 mul x13,x8,x5 // a[3]*a[0] umulh x19,x8,x5 adds x12,x12,x15 // accumulate high parts of multiplication mul x14,x7,x6 // a[2]*a[1] umulh x15,x7,x6 adcs x13,x13,x16 mul x16,x8,x6 // a[3]*a[1] umulh x17,x8,x6 adc x19,x19,xzr // can't overflow mul x20,x8,x7 // a[3]*a[2] umulh x21,x8,x7 adds x15,x15,x16 // accumulate high parts of multiplication mul x10,x5,x5 // a[0]*a[0] adc x16,x17,xzr // can't overflow adds x13,x13,x14 // accumulate low parts of multiplication umulh x5,x5,x5 adcs x19,x19,x15 mul x15,x6,x6 // a[1]*a[1] adcs x20,x20,x16 umulh x6,x6,x6 adc x21,x21,xzr // can't overflow adds x11,x11,x11 // acc[1-6]*=2 mul x16,x7,x7 // a[2]*a[2] adcs x12,x12,x12 umulh x7,x7,x7 adcs x13,x13,x13 mul x17,x8,x8 // a[3]*a[3] adcs x19,x19,x19 umulh x8,x8,x8 adcs x20,x20,x20 adcs x21,x21,x21 adc x22,xzr,xzr adds x11,x11,x5 // +a[i]*a[i] adcs x12,x12,x15 adcs x13,x13,x6 adcs x19,x19,x16 adcs x20,x20,x7 adcs x21,x21,x17 adc x22,x22,x8 bl __mul_by_1_mont_256 ldr c30,[c29,#__SIZEOF_POINTER__] adds x10,x10,x19 // accumulate upper half adcs x11,x11,x20 adcs x12,x12,x21 adcs x13,x13,x22 adc x19,xzr,xzr subs x14,x10,x5 sbcs x15,x11,x6 sbcs x16,x12,x7 sbcs x17,x13,x8 sbcs xzr, x19,xzr csel x10,x10,x14,lo csel x11,x11,x15,lo csel x12,x12,x16,lo csel x13,x13,x17,lo stp x10,x11,[c0] stp x12,x13,[c0,#16] ldp c19,c20,[c29,#2*__SIZEOF_POINTER__] ldp c21,c22,[c29,#4*__SIZEOF_POINTER__] ldr c29,[csp],#6*__SIZEOF_POINTER__ hint #AUTI_HINT ret .size sqr_mont_sparse_256,.-sqr_mont_sparse_256 .globl from_mont_256 .hidden from_mont_256 .type from_mont_256,%function .align 5 from_mont_256: hint #PACI_HINT stp c29,c30,[csp,#-2*__SIZEOF_POINTER__]! add c29,csp,#0 mov x4,x3 ldp x10,x11,[c1] ldp x12,x13,[c1,#16] bl __mul_by_1_mont_256 ldr c30,[c29,#__SIZEOF_POINTER__] subs x14,x10,x5 sbcs x15,x11,x6 sbcs x16,x12,x7 sbcs x17,x13,x8 csel x10,x10,x14,lo csel x11,x11,x15,lo csel x12,x12,x16,lo csel x13,x13,x17,lo stp x10,x11,[c0] stp x12,x13,[c0,#16] ldr c29,[csp],#2*__SIZEOF_POINTER__ hint #AUTI_HINT ret .size from_mont_256,.-from_mont_256 .globl redc_mont_256 .hidden redc_mont_256 .type redc_mont_256,%function .align 5 redc_mont_256: hint #PACI_HINT stp c29,c30,[csp,#-2*__SIZEOF_POINTER__]! 
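// redc_mont_256(ret, a, p, n0) reduces a 512-bit input: __mul_by_1_mont_256
// multiplies the low half by R^-1 (R = 2^256), the high half is then
// accumulated from [c1,#32] onward, and a final borrow-checked subtraction
// of p normalizes the result into [0, p).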
add c29,csp,#0 mov x4,x3 ldp x10,x11,[c1] ldp x12,x13,[c1,#16] bl __mul_by_1_mont_256 ldr c30,[c29,#__SIZEOF_POINTER__] ldp x14,x15,[c1,#32] ldp x16,x17,[c1,#48] adds x10,x10,x14 adcs x11,x11,x15 adcs x12,x12,x16 adcs x13,x13,x17 adc x9,xzr,xzr subs x14,x10,x5 sbcs x15,x11,x6 sbcs x16,x12,x7 sbcs x17,x13,x8 sbcs xzr, x9,xzr csel x10,x10,x14,lo csel x11,x11,x15,lo csel x12,x12,x16,lo csel x13,x13,x17,lo stp x10,x11,[c0] stp x12,x13,[c0,#16] ldr c29,[csp],#2*__SIZEOF_POINTER__ hint #AUTI_HINT ret .size redc_mont_256,.-redc_mont_256 .type __mul_by_1_mont_256,%function .align 5 __mul_by_1_mont_256: mul x3,x4,x10 ldp x5,x6,[c2] ldp x7,x8,[c2,#16] //mul x14,x5,x3 mul x15,x6,x3 mul x16,x7,x3 mul x17,x8,x3 subs xzr,x10,#1 //adds x10,x10,x14 umulh x14,x5,x3 adcs x11,x11,x15 umulh x15,x6,x3 adcs x12,x12,x16 umulh x16,x7,x3 adcs x13,x13,x17 umulh x17,x8,x3 adc x9,xzr,xzr adds x10,x11,x14 adcs x11,x12,x15 adcs x12,x13,x16 mul x3,x4,x10 adc x13,x9,x17 //mul x14,x5,x3 mul x15,x6,x3 mul x16,x7,x3 mul x17,x8,x3 subs xzr,x10,#1 //adds x10,x10,x14 umulh x14,x5,x3 adcs x11,x11,x15 umulh x15,x6,x3 adcs x12,x12,x16 umulh x16,x7,x3 adcs x13,x13,x17 umulh x17,x8,x3 adc x9,xzr,xzr adds x10,x11,x14 adcs x11,x12,x15 adcs x12,x13,x16 mul x3,x4,x10 adc x13,x9,x17 //mul x14,x5,x3 mul x15,x6,x3 mul x16,x7,x3 mul x17,x8,x3 subs xzr,x10,#1 //adds x10,x10,x14 umulh x14,x5,x3 adcs x11,x11,x15 umulh x15,x6,x3 adcs x12,x12,x16 umulh x16,x7,x3 adcs x13,x13,x17 umulh x17,x8,x3 adc x9,xzr,xzr adds x10,x11,x14 adcs x11,x12,x15 adcs x12,x13,x16 mul x3,x4,x10 adc x13,x9,x17 //mul x14,x5,x3 mul x15,x6,x3 mul x16,x7,x3 mul x17,x8,x3 subs xzr,x10,#1 //adds x10,x10,x14 umulh x14,x5,x3 adcs x11,x11,x15 umulh x15,x6,x3 adcs x12,x12,x16 umulh x16,x7,x3 adcs x13,x13,x17 umulh x17,x8,x3 adc x9,xzr,xzr adds x10,x11,x14 adcs x11,x12,x15 adcs x12,x13,x16 adc x13,x9,x17 ret .size __mul_by_1_mont_256,.-__mul_by_1_mont_256 #if defined(__ARM_FEATURE_BTI_DEFAULT) || defined(__ARM_FEATURE_PAC_DEFAULT) .section .note.GNU-stack,"",@progbits .section .note.gnu.property,"a",@note .long 4,2f-1f,5 .byte 0x47,0x4E,0x55,0 1: .long 0xc0000000,4,3 .align 3 2: #endif ================================================ FILE: build/cheri/mul_mont_384-armv8.S ================================================ #if defined(__ARM_FEATURE_PAC_DEFAULT) && __ARM_FEATURE_PAC_DEFAULT==2 # define PACI_HINT 27 # define AUTI_HINT 31 #else # define PACI_HINT 25 # define AUTI_HINT 29 #endif .text .globl add_mod_384x384 .hidden add_mod_384x384 .type add_mod_384x384,%function .align 5 add_mod_384x384: hint #PACI_HINT stp c29,c30,[csp,#-8*__SIZEOF_POINTER__]! 
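// add_mod_384x384 adds two unreduced 768-bit quantities limb by limb and
// conditionally subtracts the modulus from the upper half only, i.e. the
// sum is reduced modulo p*2^384, the invariant that double-width
// Montgomery intermediates are kept under.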
add c29,csp,#0 stp c19,c20,[csp,#2*__SIZEOF_POINTER__] stp c21,c22,[csp,#4*__SIZEOF_POINTER__] stp c23,c24,[csp,#6*__SIZEOF_POINTER__] ldp x5,x6,[c3] ldp x7,x8,[c3,#16] ldp x9,x10,[c3,#32] bl __add_mod_384x384 ldr c30,[c29,#__SIZEOF_POINTER__] ldp c19,c20,[c29,#2*__SIZEOF_POINTER__] ldp c21,c22,[c29,#4*__SIZEOF_POINTER__] ldp c23,c24,[c29,#6*__SIZEOF_POINTER__] ldr c29,[csp],#8*__SIZEOF_POINTER__ hint #AUTI_HINT ret .size add_mod_384x384,.-add_mod_384x384 .type __add_mod_384x384,%function .align 5 __add_mod_384x384: ldp x11, x12, [c1] ldp x19,x20,[c2] ldp x13, x14, [c1,#16] adds x11,x11,x19 ldp x21,x22,[c2,#16] adcs x12,x12,x20 ldp x15, x16, [c1,#32] adcs x13,x13,x21 ldp x23,x24,[c2,#32] adcs x14,x14,x22 stp x11, x12, [c0] adcs x15,x15,x23 ldp x11, x12, [c1,#48] adcs x16,x16,x24 ldp x19,x20,[c2,#48] stp x13, x14, [c0,#16] ldp x13, x14, [c1,#64] ldp x21,x22,[c2,#64] adcs x11,x11,x19 stp x15, x16, [c0,#32] adcs x12,x12,x20 ldp x15, x16, [c1,#80] adcs x13,x13,x21 ldp x23,x24,[c2,#80] adcs x14,x14,x22 adcs x15,x15,x23 adcs x16,x16,x24 adc x17,xzr,xzr subs x19,x11,x5 sbcs x20,x12,x6 sbcs x21,x13,x7 sbcs x22,x14,x8 sbcs x23,x15,x9 sbcs x24,x16,x10 sbcs xzr,x17,xzr csel x11,x11,x19,lo csel x12,x12,x20,lo csel x13,x13,x21,lo csel x14,x14,x22,lo stp x11,x12,[c0,#48] csel x15,x15,x23,lo stp x13,x14,[c0,#64] csel x16,x16,x24,lo stp x15,x16,[c0,#80] ret .size __add_mod_384x384,.-__add_mod_384x384 .globl sub_mod_384x384 .hidden sub_mod_384x384 .type sub_mod_384x384,%function .align 5 sub_mod_384x384: hint #PACI_HINT stp c29,c30,[csp,#-8*__SIZEOF_POINTER__]! add c29,csp,#0 stp c19,c20,[csp,#2*__SIZEOF_POINTER__] stp c21,c22,[csp,#4*__SIZEOF_POINTER__] stp c23,c24,[csp,#6*__SIZEOF_POINTER__] ldp x5,x6,[c3] ldp x7,x8,[c3,#16] ldp x9,x10,[c3,#32] bl __sub_mod_384x384 ldr c30,[c29,#__SIZEOF_POINTER__] ldp c19,c20,[c29,#2*__SIZEOF_POINTER__] ldp c21,c22,[c29,#4*__SIZEOF_POINTER__] ldp c23,c24,[c29,#6*__SIZEOF_POINTER__] ldr c29,[csp],#8*__SIZEOF_POINTER__ hint #AUTI_HINT ret .size sub_mod_384x384,.-sub_mod_384x384 .type __sub_mod_384x384,%function .align 5 __sub_mod_384x384: ldp x11, x12, [c1] ldp x19,x20,[c2] ldp x13, x14, [c1,#16] subs x11,x11,x19 ldp x21,x22,[c2,#16] sbcs x12,x12,x20 ldp x15, x16, [c1,#32] sbcs x13,x13,x21 ldp x23,x24,[c2,#32] sbcs x14,x14,x22 stp x11, x12, [c0] sbcs x15,x15,x23 ldp x11, x12, [c1,#48] sbcs x16,x16,x24 ldp x19,x20,[c2,#48] stp x13, x14, [c0,#16] ldp x13, x14, [c1,#64] ldp x21,x22,[c2,#64] sbcs x11,x11,x19 stp x15, x16, [c0,#32] sbcs x12,x12,x20 ldp x15, x16, [c1,#80] sbcs x13,x13,x21 ldp x23,x24,[c2,#80] sbcs x14,x14,x22 sbcs x15,x15,x23 sbcs x16,x16,x24 sbc x17,xzr,xzr and x19,x5,x17 and x20,x6,x17 adds x11,x11,x19 and x21,x7,x17 adcs x12,x12,x20 and x22,x8,x17 adcs x13,x13,x21 and x23,x9,x17 adcs x14,x14,x22 and x24,x10,x17 adcs x15,x15,x23 stp x11,x12,[c0,#48] adc x16,x16,x24 stp x13,x14,[c0,#64] stp x15,x16,[c0,#80] ret .size __sub_mod_384x384,.-__sub_mod_384x384 .type __add_mod_384,%function .align 5 __add_mod_384: ldp x11, x12, [c1] ldp x19,x20,[c2] ldp x13, x14, [c1,#16] adds x11,x11,x19 ldp x21,x22,[c2,#16] adcs x12,x12,x20 ldp x15, x16, [c1,#32] adcs x13,x13,x21 ldp x23,x24,[c2,#32] adcs x14,x14,x22 adcs x15,x15,x23 adcs x16,x16,x24 adc x17,xzr,xzr subs x19,x11,x5 sbcs x20,x12,x6 sbcs x21,x13,x7 sbcs x22,x14,x8 sbcs x23,x15,x9 sbcs x24,x16,x10 sbcs xzr,x17,xzr csel x11,x11,x19,lo csel x12,x12,x20,lo csel x13,x13,x21,lo csel x14,x14,x22,lo csel x15,x15,x23,lo stp x11,x12,[c0] csel x16,x16,x24,lo stp x13,x14,[c0,#16] stp x15,x16,[c0,#32] ret .size 
__add_mod_384,.-__add_mod_384

.type __sub_mod_384,%function
.align 5
__sub_mod_384:
	ldp	x11, x12, [c1]
	ldp	x19,x20,[c2]
	ldp	x13, x14, [c1,#16]
	subs	x11,x11,x19
	ldp	x21,x22,[c2,#16]
	sbcs	x12,x12,x20
	ldp	x15, x16, [c1,#32]
	sbcs	x13,x13,x21
	ldp	x23,x24,[c2,#32]
	sbcs	x14,x14,x22
	sbcs	x15,x15,x23
	sbcs	x16,x16,x24
	sbc	x17,xzr,xzr

	and	x19,x5,x17
	and	x20,x6,x17
	adds	x11,x11,x19
	and	x21,x7,x17
	adcs	x12,x12,x20
	and	x22,x8,x17
	adcs	x13,x13,x21
	and	x23,x9,x17
	adcs	x14,x14,x22
	and	x24,x10,x17
	adcs	x15,x15,x23
	stp	x11,x12,[c0]
	adc	x16,x16,x24
	stp	x13,x14,[c0,#16]
	stp	x15,x16,[c0,#32]

	ret
.size __sub_mod_384,.-__sub_mod_384

.globl mul_mont_384x
.hidden mul_mont_384x
.type mul_mont_384x,%function
.align 5
mul_mont_384x:
	hint	#PACI_HINT
	stp	c29,c30,[csp,#-16*__SIZEOF_POINTER__]!
	add	c29,csp,#0
	stp	c19,c20,[csp,#2*__SIZEOF_POINTER__]
	stp	c21,c22,[csp,#4*__SIZEOF_POINTER__]
	stp	c23,c24,[csp,#6*__SIZEOF_POINTER__]
	stp	c25,c26,[csp,#8*__SIZEOF_POINTER__]
	stp	c27,c28,[csp,#10*__SIZEOF_POINTER__]
	sub	csp,csp,#288		// space for 3 768-bit vectors

	mov	c26,c0			// save r_ptr
	mov	c27,c1			// save a_ptr
	mov	c28,c2			// save b_ptr

	add	c0,csp,#0
	bl	__mul_384

	add	c1,c1,#48
	add	c2,c2,#48
	add	c0,csp,#96
	bl	__mul_384

	ldp	x5,x6,[c3]
	ldp	x7,x8,[c3,#16]
	ldp	x9,x10,[c3,#32]

	sub	c2,c1,#48
	add	c0,csp,#240
	bl	__add_mod_384

	add	c1,c28,#0
	add	c2,c28,#48
	add	c0,csp,#192
	bl	__add_mod_384

	add	c1,c0,#0
	add	c2,c0,#48
	bl	__mul_384		// mul_384(t2, a->re+a->im, b->re+b->im)

	ldp	x5,x6,[c3]
	ldp	x7,x8,[c3,#16]
	ldp	x9,x10,[c3,#32]

	mov	c1,c0
	add	c2,csp,#0
	bl	__sub_mod_384x384

	add	c2,csp,#96
	bl	__sub_mod_384x384	// t2 = t2-t0-t1

	add	c1,csp,#0
	add	c2,csp,#96
	add	c0,csp,#0
	bl	__sub_mod_384x384	// t0 = t0-t1

	add	c1,csp,#0
	add	c0,c26,#0
	bl	__mul_by_1_mont_384
	bl	__redc_tail_mont_384

	add	c1,csp,#192
	add	c0,c0,#48
	bl	__mul_by_1_mont_384
	bl	__redc_tail_mont_384

	ldr	c30,[c29,#__SIZEOF_POINTER__]
	add	csp,csp,#288
	ldp	c19,c20,[c29,#2*__SIZEOF_POINTER__]
	ldp	c21,c22,[c29,#4*__SIZEOF_POINTER__]
	ldp	c23,c24,[c29,#6*__SIZEOF_POINTER__]
	ldp	c25,c26,[c29,#8*__SIZEOF_POINTER__]
	ldp	c27,c28,[c29,#10*__SIZEOF_POINTER__]
	ldr	c29,[csp],#16*__SIZEOF_POINTER__
	hint	#AUTI_HINT
	ret
.size mul_mont_384x,.-mul_mont_384x

.globl sqr_mont_384x
.hidden sqr_mont_384x
.type sqr_mont_384x,%function
.align 5
sqr_mont_384x:
	hint	#PACI_HINT
	stp	c29,c30,[csp,#-16*__SIZEOF_POINTER__]!
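// sqr_mont_384x squares an Fp2 element with the complex-squaring
// identity: for a = re + im*i and i^2 = -1,
//	ret->re = (re + im)*(re - im),	ret->im = 2*re*im,
// i.e. two Montgomery multiplications (t0*t1 and re*im) plus the
// add-with-itself doubling sequence seen below.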
add c29,csp,#0 stp c19,c20,[csp,#2*__SIZEOF_POINTER__] stp c21,c22,[csp,#4*__SIZEOF_POINTER__] stp c23,c24,[csp,#6*__SIZEOF_POINTER__] stp c25,c26,[csp,#8*__SIZEOF_POINTER__] stp c27,c28,[csp,#10*__SIZEOF_POINTER__] stp c3,c0,[csp,#12*__SIZEOF_POINTER__] // __mul_mont_384 wants them there sub csp,csp,#96 // space for 2 384-bit vectors mov x4,x3 // adjust for missing b_ptr ldp x5,x6,[c2] ldp x7,x8,[c2,#16] ldp x9,x10,[c2,#32] add c2,c1,#48 add c0,csp,#0 bl __add_mod_384 // t0 = a->re + a->im add c0,csp,#48 bl __sub_mod_384 // t1 = a->re - a->im ldp x11,x12,[c1] ldr x17, [c2] ldp x13,x14,[c1,#16] ldp x15,x16,[c1,#32] bl __mul_mont_384 // mul_mont_384(ret->im, a->re, a->im) adds x11,x11,x11 // add with itself adcs x12,x12,x12 adcs x13,x13,x13 adcs x14,x14,x14 adcs x15,x15,x15 adcs x16,x16,x16 adc x25,xzr,xzr subs x19,x11,x5 sbcs x20,x12,x6 sbcs x21,x13,x7 sbcs x22,x14,x8 sbcs x23,x15,x9 sbcs x24,x16,x10 sbcs xzr,x25,xzr csel x19,x11,x19,lo csel x20,x12,x20,lo csel x21,x13,x21,lo ldp x11,x12,[csp] csel x22,x14,x22,lo ldr x17, [csp,#48] csel x23,x15,x23,lo ldp x13,x14,[csp,#16] csel x24,x16,x24,lo ldp x15,x16,[csp,#32] stp x19,x20,[c2,#48] stp x21,x22,[c2,#64] stp x23,x24,[c2,#80] add c2,csp,#48 bl __mul_mont_384 // mul_mont_384(ret->re, t0, t1) ldr c30,[c29,#__SIZEOF_POINTER__] stp x11,x12,[c2] stp x13,x14,[c2,#16] stp x15,x16,[c2,#32] add csp,csp,#96 ldp c19,c20,[c29,#2*__SIZEOF_POINTER__] ldp c21,c22,[c29,#4*__SIZEOF_POINTER__] ldp c23,c24,[c29,#6*__SIZEOF_POINTER__] ldp c25,c26,[c29,#8*__SIZEOF_POINTER__] ldp c27,c28,[c29,#10*__SIZEOF_POINTER__] ldr c29,[csp],#16*__SIZEOF_POINTER__ hint #AUTI_HINT ret .size sqr_mont_384x,.-sqr_mont_384x .globl mul_mont_384 .hidden mul_mont_384 .type mul_mont_384,%function .align 5 mul_mont_384: hint #PACI_HINT stp c29,c30,[csp,#-16*__SIZEOF_POINTER__]! 
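// mul_mont_384 stashes n0 (c4) and the result pointer (c0) at
// [csp,#12*__SIZEOF_POINTER__] because __mul_mont_384 occupies nearly
// every general-purpose register: the subroutine reloads n0 from that
// slot on every round ("ldr x4,[c29,#12*__SIZEOF_POINTER__]") and pulls
// r_ptr back out just before its final conditional subtraction.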
add c29,csp,#0 stp c19,c20,[csp,#2*__SIZEOF_POINTER__] stp c21,c22,[csp,#4*__SIZEOF_POINTER__] stp c23,c24,[csp,#6*__SIZEOF_POINTER__] stp c25,c26,[csp,#8*__SIZEOF_POINTER__] stp c27,c28,[csp,#10*__SIZEOF_POINTER__] stp c4,c0,[csp,#12*__SIZEOF_POINTER__] // __mul_mont_384 wants them there ldp x11,x12,[c1] ldr x17, [c2] ldp x13,x14,[c1,#16] ldp x15,x16,[c1,#32] ldp x5,x6,[c3] ldp x7,x8,[c3,#16] ldp x9,x10,[c3,#32] bl __mul_mont_384 ldr c30,[c29,#__SIZEOF_POINTER__] stp x11,x12,[c2] stp x13,x14,[c2,#16] stp x15,x16,[c2,#32] ldp c19,c20,[c29,#2*__SIZEOF_POINTER__] ldp c21,c22,[c29,#4*__SIZEOF_POINTER__] ldp c23,c24,[c29,#6*__SIZEOF_POINTER__] ldp c25,c26,[c29,#8*__SIZEOF_POINTER__] ldp c27,c28,[c29,#10*__SIZEOF_POINTER__] ldr c29,[csp],#16*__SIZEOF_POINTER__ hint #AUTI_HINT ret .size mul_mont_384,.-mul_mont_384 .type __mul_mont_384,%function .align 5 __mul_mont_384: mul x19,x11,x17 mul x20,x12,x17 mul x21,x13,x17 mul x22,x14,x17 mul x23,x15,x17 mul x24,x16,x17 mul x4,x4,x19 umulh x26,x11,x17 umulh x27,x12,x17 umulh x28,x13,x17 umulh x0,x14,x17 umulh x1,x15,x17 umulh x3,x16,x17 adds x20,x20,x26 // mul x26,x5,x4 adcs x21,x21,x27 mul x27,x6,x4 adcs x22,x22,x28 mul x28,x7,x4 adcs x23,x23,x0 mul x0,x8,x4 adcs x24,x24,x1 mul x1,x9,x4 adc x25,xzr, x3 mul x3,x10,x4 mov x17,xzr subs xzr,x19,#1 // adds x19,x19,x26 umulh x26,x5,x4 adcs x20,x20,x27 umulh x27,x6,x4 adcs x21,x21,x28 umulh x28,x7,x4 adcs x22,x22,x0 umulh x0,x8,x4 adcs x23,x23,x1 umulh x1,x9,x4 adcs x24,x24,x3 umulh x3,x10,x4 adcs x25,x25,xzr adc x4,x17,xzr ldr x17,[c2,8*1] adds x19,x20,x26 mul x26,x11,x17 adcs x20,x21,x27 mul x27,x12,x17 adcs x21,x22,x28 mul x28,x13,x17 adcs x22,x23,x0 mul x0,x14,x17 adcs x23,x24,x1 mul x1,x15,x17 adcs x24,x25,x3 mul x3,x16,x17 adc x25,x4,xzr ldr x4,[c29,#12*__SIZEOF_POINTER__] adds x19,x19,x26 umulh x26,x11,x17 adcs x20,x20,x27 umulh x27,x12,x17 adcs x21,x21,x28 mul x4,x4,x19 umulh x28,x13,x17 adcs x22,x22,x0 umulh x0,x14,x17 adcs x23,x23,x1 umulh x1,x15,x17 adcs x24,x24,x3 umulh x3,x16,x17 adcs x25,x25,xzr adc x17,xzr,xzr adds x20,x20,x26 // mul x26,x5,x4 adcs x21,x21,x27 mul x27,x6,x4 adcs x22,x22,x28 mul x28,x7,x4 adcs x23,x23,x0 mul x0,x8,x4 adcs x24,x24,x1 mul x1,x9,x4 adcs x25,x25,x3 mul x3,x10,x4 adc x17,x17,xzr subs xzr,x19,#1 // adds x19,x19,x26 umulh x26,x5,x4 adcs x20,x20,x27 umulh x27,x6,x4 adcs x21,x21,x28 umulh x28,x7,x4 adcs x22,x22,x0 umulh x0,x8,x4 adcs x23,x23,x1 umulh x1,x9,x4 adcs x24,x24,x3 umulh x3,x10,x4 adcs x25,x25,xzr adc x4,x17,xzr ldr x17,[c2,8*2] adds x19,x20,x26 mul x26,x11,x17 adcs x20,x21,x27 mul x27,x12,x17 adcs x21,x22,x28 mul x28,x13,x17 adcs x22,x23,x0 mul x0,x14,x17 adcs x23,x24,x1 mul x1,x15,x17 adcs x24,x25,x3 mul x3,x16,x17 adc x25,x4,xzr ldr x4,[c29,#12*__SIZEOF_POINTER__] adds x19,x19,x26 umulh x26,x11,x17 adcs x20,x20,x27 umulh x27,x12,x17 adcs x21,x21,x28 mul x4,x4,x19 umulh x28,x13,x17 adcs x22,x22,x0 umulh x0,x14,x17 adcs x23,x23,x1 umulh x1,x15,x17 adcs x24,x24,x3 umulh x3,x16,x17 adcs x25,x25,xzr adc x17,xzr,xzr adds x20,x20,x26 // mul x26,x5,x4 adcs x21,x21,x27 mul x27,x6,x4 adcs x22,x22,x28 mul x28,x7,x4 adcs x23,x23,x0 mul x0,x8,x4 adcs x24,x24,x1 mul x1,x9,x4 adcs x25,x25,x3 mul x3,x10,x4 adc x17,x17,xzr subs xzr,x19,#1 // adds x19,x19,x26 umulh x26,x5,x4 adcs x20,x20,x27 umulh x27,x6,x4 adcs x21,x21,x28 umulh x28,x7,x4 adcs x22,x22,x0 umulh x0,x8,x4 adcs x23,x23,x1 umulh x1,x9,x4 adcs x24,x24,x3 umulh x3,x10,x4 adcs x25,x25,xzr adc x4,x17,xzr ldr x17,[c2,8*3] adds x19,x20,x26 mul x26,x11,x17 adcs x20,x21,x27 mul x27,x12,x17 adcs x21,x22,x28 mul 
x28,x13,x17 adcs x22,x23,x0 mul x0,x14,x17 adcs x23,x24,x1 mul x1,x15,x17 adcs x24,x25,x3 mul x3,x16,x17 adc x25,x4,xzr ldr x4,[c29,#12*__SIZEOF_POINTER__] adds x19,x19,x26 umulh x26,x11,x17 adcs x20,x20,x27 umulh x27,x12,x17 adcs x21,x21,x28 mul x4,x4,x19 umulh x28,x13,x17 adcs x22,x22,x0 umulh x0,x14,x17 adcs x23,x23,x1 umulh x1,x15,x17 adcs x24,x24,x3 umulh x3,x16,x17 adcs x25,x25,xzr adc x17,xzr,xzr adds x20,x20,x26 // mul x26,x5,x4 adcs x21,x21,x27 mul x27,x6,x4 adcs x22,x22,x28 mul x28,x7,x4 adcs x23,x23,x0 mul x0,x8,x4 adcs x24,x24,x1 mul x1,x9,x4 adcs x25,x25,x3 mul x3,x10,x4 adc x17,x17,xzr subs xzr,x19,#1 // adds x19,x19,x26 umulh x26,x5,x4 adcs x20,x20,x27 umulh x27,x6,x4 adcs x21,x21,x28 umulh x28,x7,x4 adcs x22,x22,x0 umulh x0,x8,x4 adcs x23,x23,x1 umulh x1,x9,x4 adcs x24,x24,x3 umulh x3,x10,x4 adcs x25,x25,xzr adc x4,x17,xzr ldr x17,[c2,8*4] adds x19,x20,x26 mul x26,x11,x17 adcs x20,x21,x27 mul x27,x12,x17 adcs x21,x22,x28 mul x28,x13,x17 adcs x22,x23,x0 mul x0,x14,x17 adcs x23,x24,x1 mul x1,x15,x17 adcs x24,x25,x3 mul x3,x16,x17 adc x25,x4,xzr ldr x4,[c29,#12*__SIZEOF_POINTER__] adds x19,x19,x26 umulh x26,x11,x17 adcs x20,x20,x27 umulh x27,x12,x17 adcs x21,x21,x28 mul x4,x4,x19 umulh x28,x13,x17 adcs x22,x22,x0 umulh x0,x14,x17 adcs x23,x23,x1 umulh x1,x15,x17 adcs x24,x24,x3 umulh x3,x16,x17 adcs x25,x25,xzr adc x17,xzr,xzr adds x20,x20,x26 // mul x26,x5,x4 adcs x21,x21,x27 mul x27,x6,x4 adcs x22,x22,x28 mul x28,x7,x4 adcs x23,x23,x0 mul x0,x8,x4 adcs x24,x24,x1 mul x1,x9,x4 adcs x25,x25,x3 mul x3,x10,x4 adc x17,x17,xzr subs xzr,x19,#1 // adds x19,x19,x26 umulh x26,x5,x4 adcs x20,x20,x27 umulh x27,x6,x4 adcs x21,x21,x28 umulh x28,x7,x4 adcs x22,x22,x0 umulh x0,x8,x4 adcs x23,x23,x1 umulh x1,x9,x4 adcs x24,x24,x3 umulh x3,x10,x4 adcs x25,x25,xzr adc x4,x17,xzr ldr x17,[c2,8*5] adds x19,x20,x26 mul x26,x11,x17 adcs x20,x21,x27 mul x27,x12,x17 adcs x21,x22,x28 mul x28,x13,x17 adcs x22,x23,x0 mul x0,x14,x17 adcs x23,x24,x1 mul x1,x15,x17 adcs x24,x25,x3 mul x3,x16,x17 adc x25,x4,xzr ldr x4,[c29,#12*__SIZEOF_POINTER__] adds x19,x19,x26 umulh x26,x11,x17 adcs x20,x20,x27 umulh x27,x12,x17 adcs x21,x21,x28 mul x4,x4,x19 umulh x28,x13,x17 adcs x22,x22,x0 umulh x0,x14,x17 adcs x23,x23,x1 umulh x1,x15,x17 adcs x24,x24,x3 umulh x3,x16,x17 adcs x25,x25,xzr adc x17,xzr,xzr adds x20,x20,x26 // mul x26,x5,x4 adcs x21,x21,x27 mul x27,x6,x4 adcs x22,x22,x28 mul x28,x7,x4 adcs x23,x23,x0 mul x0,x8,x4 adcs x24,x24,x1 mul x1,x9,x4 adcs x25,x25,x3 mul x3,x10,x4 adc x17,x17,xzr subs xzr,x19,#1 // adds x19,x19,x26 umulh x26,x5,x4 adcs x20,x20,x27 umulh x27,x6,x4 adcs x21,x21,x28 umulh x28,x7,x4 adcs x22,x22,x0 umulh x0,x8,x4 adcs x23,x23,x1 umulh x1,x9,x4 adcs x24,x24,x3 umulh x3,x10,x4 adcs x25,x25,xzr ldp c4,c2,[c29,#12*__SIZEOF_POINTER__] // pull r_ptr adc x17,x17,xzr adds x19,x20,x26 adcs x20,x21,x27 adcs x21,x22,x28 adcs x22,x23,x0 adcs x23,x24,x1 adcs x24,x25,x3 adc x25,x17,xzr subs x26,x19,x5 sbcs x27,x20,x6 sbcs x28,x21,x7 sbcs x0,x22,x8 sbcs x1,x23,x9 sbcs x3,x24,x10 sbcs xzr, x25,xzr csel x11,x19,x26,lo csel x12,x20,x27,lo csel x13,x21,x28,lo csel x14,x22,x0,lo csel x15,x23,x1,lo csel x16,x24,x3,lo ret .size __mul_mont_384,.-__mul_mont_384 .globl sqr_mont_384 .hidden sqr_mont_384 .type sqr_mont_384,%function .align 5 sqr_mont_384: hint #PACI_HINT stp c29,c30,[csp,#-16*__SIZEOF_POINTER__]! 
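// sqr_mont_384 composes squaring and reduction from parts: __sqr_384
// writes the full 768-bit square into the 96-byte scratch frame reserved
// below, then __mul_by_1_mont_384 and __redc_tail_mont_384 fold it back
// down to a reduced 384-bit Montgomery residue.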
add c29,csp,#0 stp c19,c20,[csp,#2*__SIZEOF_POINTER__] stp c21,c22,[csp,#4*__SIZEOF_POINTER__] stp c23,c24,[csp,#6*__SIZEOF_POINTER__] stp c25,c26,[csp,#8*__SIZEOF_POINTER__] stp c27,c28,[csp,#10*__SIZEOF_POINTER__] sub csp,csp,#96 // space for 768-bit vector mov c4,c3 // adjust for missing b_ptr mov c3,c0 // save r_ptr mov c0,csp ldp x11,x12,[c1] ldp x13,x14,[c1,#16] ldp x15,x16,[c1,#32] bl __sqr_384 ldp x5,x6,[c2] ldp x7,x8,[c2,#16] ldp x9,x10,[c2,#32] mov c1,csp mov c0,c3 // restore r_ptr bl __mul_by_1_mont_384 bl __redc_tail_mont_384 ldr c30,[c29,#__SIZEOF_POINTER__] add csp,csp,#96 ldp c19,c20,[c29,#2*__SIZEOF_POINTER__] ldp c21,c22,[c29,#4*__SIZEOF_POINTER__] ldp c23,c24,[c29,#6*__SIZEOF_POINTER__] ldp c25,c26,[c29,#8*__SIZEOF_POINTER__] ldp c27,c28,[c29,#10*__SIZEOF_POINTER__] ldr c29,[csp],#16*__SIZEOF_POINTER__ hint #AUTI_HINT ret .size sqr_mont_384,.-sqr_mont_384 .globl sqr_n_mul_mont_383 .hidden sqr_n_mul_mont_383 .type sqr_n_mul_mont_383,%function .align 5 sqr_n_mul_mont_383: hint #PACI_HINT stp c29,c30,[csp,#-16*__SIZEOF_POINTER__]! add c29,csp,#0 stp c19,c20,[csp,#2*__SIZEOF_POINTER__] stp c21,c22,[csp,#4*__SIZEOF_POINTER__] stp c23,c24,[csp,#6*__SIZEOF_POINTER__] stp c25,c26,[csp,#8*__SIZEOF_POINTER__] stp c27,c28,[csp,#10*__SIZEOF_POINTER__] stp c4,c0,[csp,#12*__SIZEOF_POINTER__] // __mul_mont_384 wants them there sub csp,csp,#96 // space for 768-bit vector mov c17,c5 // save b_ptr ldp x11,x12,[c1] ldp x13,x14,[c1,#16] ldp x15,x16,[c1,#32] mov c0,csp .Loop_sqr_383: bl __sqr_384 sub x2,x2,#1 // counter ldp x5,x6,[c3] ldp x7,x8,[c3,#16] ldp x9,x10,[c3,#32] mov c1,csp bl __mul_by_1_mont_384 ldp x19,x20,[c1,#48] ldp x21,x22,[c1,#64] ldp x23,x24,[c1,#80] adds x11,x11,x19 // just accumulate upper half adcs x12,x12,x20 adcs x13,x13,x21 adcs x14,x14,x22 adcs x15,x15,x23 adc x16,x16,x24 cbnz x2,.Loop_sqr_383 mov c2,c17 ldr x17,[c17] bl __mul_mont_384 ldr c30,[c29,#__SIZEOF_POINTER__] stp x11,x12,[c2] stp x13,x14,[c2,#16] stp x15,x16,[c2,#32] add csp,csp,#96 ldp c19,c20,[c29,#2*__SIZEOF_POINTER__] ldp c21,c22,[c29,#4*__SIZEOF_POINTER__] ldp c23,c24,[c29,#6*__SIZEOF_POINTER__] ldp c25,c26,[c29,#8*__SIZEOF_POINTER__] ldp c27,c28,[c29,#10*__SIZEOF_POINTER__] ldr c29,[csp],#16*__SIZEOF_POINTER__ hint #AUTI_HINT ret .size sqr_n_mul_mont_383,.-sqr_n_mul_mont_383 .type __sqr_384,%function .align 5 __sqr_384: mul x19,x12,x11 mul x20,x13,x11 mul x21,x14,x11 mul x22,x15,x11 mul x23,x16,x11 umulh x6,x12,x11 umulh x7,x13,x11 umulh x8,x14,x11 umulh x9,x15,x11 adds x20,x20,x6 umulh x10,x16,x11 adcs x21,x21,x7 mul x7,x13,x12 adcs x22,x22,x8 mul x8,x14,x12 adcs x23,x23,x9 mul x9,x15,x12 adc x24,xzr, x10 mul x10,x16,x12 adds x21,x21,x7 umulh x7,x13,x12 adcs x22,x22,x8 umulh x8,x14,x12 adcs x23,x23,x9 umulh x9,x15,x12 adcs x24,x24,x10 umulh x10,x16,x12 adc x25,xzr,xzr mul x5,x11,x11 adds x22,x22,x7 umulh x11, x11,x11 adcs x23,x23,x8 mul x8,x14,x13 adcs x24,x24,x9 mul x9,x15,x13 adc x25,x25,x10 mul x10,x16,x13 adds x23,x23,x8 umulh x8,x14,x13 adcs x24,x24,x9 umulh x9,x15,x13 adcs x25,x25,x10 umulh x10,x16,x13 adc x26,xzr,xzr mul x6,x12,x12 adds x24,x24,x8 umulh x12, x12,x12 adcs x25,x25,x9 mul x9,x15,x14 adc x26,x26,x10 mul x10,x16,x14 adds x25,x25,x9 umulh x9,x15,x14 adcs x26,x26,x10 umulh x10,x16,x14 adc x27,xzr,xzr mul x7,x13,x13 adds x26,x26,x9 umulh x13, x13,x13 adc x27,x27,x10 mul x8,x14,x14 mul x10,x16,x15 umulh x14, x14,x14 adds x27,x27,x10 umulh x10,x16,x15 mul x9,x15,x15 adc x28,x10,xzr adds x19,x19,x19 adcs x20,x20,x20 adcs x21,x21,x21 adcs x22,x22,x22 adcs x23,x23,x23 adcs x24,x24,x24 adcs 
x25,x25,x25 adcs x26,x26,x26 umulh x15, x15,x15 adcs x27,x27,x27 mul x10,x16,x16 adcs x28,x28,x28 umulh x16, x16,x16 adc x1,xzr,xzr adds x19,x19,x11 adcs x20,x20,x6 adcs x21,x21,x12 adcs x22,x22,x7 adcs x23,x23,x13 adcs x24,x24,x8 adcs x25,x25,x14 stp x5,x19,[c0] adcs x26,x26,x9 stp x20,x21,[c0,#16] adcs x27,x27,x15 stp x22,x23,[c0,#32] adcs x28,x28,x10 stp x24,x25,[c0,#48] adc x16,x16,x1 stp x26,x27,[c0,#64] stp x28,x16,[c0,#80] ret .size __sqr_384,.-__sqr_384 .globl sqr_384 .hidden sqr_384 .type sqr_384,%function .align 5 sqr_384: hint #PACI_HINT stp c29,c30,[csp,#-16*__SIZEOF_POINTER__]! add c29,csp,#0 stp c19,c20,[csp,#2*__SIZEOF_POINTER__] stp c21,c22,[csp,#4*__SIZEOF_POINTER__] stp c23,c24,[csp,#6*__SIZEOF_POINTER__] stp c25,c26,[csp,#8*__SIZEOF_POINTER__] stp c27,c28,[csp,#10*__SIZEOF_POINTER__] ldp x11,x12,[c1] ldp x13,x14,[c1,#16] ldp x15,x16,[c1,#32] bl __sqr_384 ldr c30,[c29,#__SIZEOF_POINTER__] ldp c19,c20,[c29,#2*__SIZEOF_POINTER__] ldp c21,c22,[c29,#4*__SIZEOF_POINTER__] ldp c23,c24,[c29,#6*__SIZEOF_POINTER__] ldp c25,c26,[c29,#8*__SIZEOF_POINTER__] ldp c27,c28,[c29,#10*__SIZEOF_POINTER__] ldr c29,[csp],#16*__SIZEOF_POINTER__ hint #AUTI_HINT ret .size sqr_384,.-sqr_384 .globl redc_mont_384 .hidden redc_mont_384 .type redc_mont_384,%function .align 5 redc_mont_384: hint #PACI_HINT stp c29,c30,[csp,#-16*__SIZEOF_POINTER__]! add c29,csp,#0 stp c19,c20,[csp,#2*__SIZEOF_POINTER__] stp c21,c22,[csp,#4*__SIZEOF_POINTER__] stp c23,c24,[csp,#6*__SIZEOF_POINTER__] stp c25,c26,[csp,#8*__SIZEOF_POINTER__] stp c27,c28,[csp,#10*__SIZEOF_POINTER__] mov x4,x3 // adjust for missing b_ptr ldp x5,x6,[c2] ldp x7,x8,[c2,#16] ldp x9,x10,[c2,#32] bl __mul_by_1_mont_384 bl __redc_tail_mont_384 ldr c30,[c29,#__SIZEOF_POINTER__] ldp c19,c20,[c29,#2*__SIZEOF_POINTER__] ldp c21,c22,[c29,#4*__SIZEOF_POINTER__] ldp c23,c24,[c29,#6*__SIZEOF_POINTER__] ldp c25,c26,[c29,#8*__SIZEOF_POINTER__] ldp c27,c28,[c29,#10*__SIZEOF_POINTER__] ldr c29,[csp],#16*__SIZEOF_POINTER__ hint #AUTI_HINT ret .size redc_mont_384,.-redc_mont_384 .globl from_mont_384 .hidden from_mont_384 .type from_mont_384,%function .align 5 from_mont_384: hint #PACI_HINT stp c29,c30,[csp,#-16*__SIZEOF_POINTER__]! 
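// from_mont_384 converts a value out of the Montgomery domain: the
// multiply-by-1 pass through __mul_by_1_mont_384 strips the factor
// R = 2^384, and the trailing conditional subtraction brings the result
// into the canonical range [0, p).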
add c29,csp,#0 stp c19,c20,[csp,#2*__SIZEOF_POINTER__] stp c21,c22,[csp,#4*__SIZEOF_POINTER__] stp c23,c24,[csp,#6*__SIZEOF_POINTER__] stp c25,c26,[csp,#8*__SIZEOF_POINTER__] stp c27,c28,[csp,#10*__SIZEOF_POINTER__] mov x4,x3 // adjust for missing b_ptr ldp x5,x6,[c2] ldp x7,x8,[c2,#16] ldp x9,x10,[c2,#32] bl __mul_by_1_mont_384 ldr c30,[c29,#__SIZEOF_POINTER__] subs x19,x11,x5 sbcs x20,x12,x6 sbcs x21,x13,x7 sbcs x22,x14,x8 sbcs x23,x15,x9 sbcs x24,x16,x10 csel x11,x11,x19,lo csel x12,x12,x20,lo csel x13,x13,x21,lo csel x14,x14,x22,lo csel x15,x15,x23,lo csel x16,x16,x24,lo stp x11,x12,[c0] stp x13,x14,[c0,#16] stp x15,x16,[c0,#32] ldp c19,c20,[c29,#2*__SIZEOF_POINTER__] ldp c21,c22,[c29,#4*__SIZEOF_POINTER__] ldp c23,c24,[c29,#6*__SIZEOF_POINTER__] ldp c25,c26,[c29,#8*__SIZEOF_POINTER__] ldp c27,c28,[c29,#10*__SIZEOF_POINTER__] ldr c29,[csp],#16*__SIZEOF_POINTER__ hint #AUTI_HINT ret .size from_mont_384,.-from_mont_384 .type __mul_by_1_mont_384,%function .align 5 __mul_by_1_mont_384: ldp x11,x12,[c1] ldp x13,x14,[c1,#16] mul x26,x4,x11 ldp x15,x16,[c1,#32] // mul x19,x5,x26 mul x20,x6,x26 mul x21,x7,x26 mul x22,x8,x26 mul x23,x9,x26 mul x24,x10,x26 subs xzr,x11,#1 // adds x19,x19,x11 umulh x11,x5,x26 adcs x20,x20,x12 umulh x12,x6,x26 adcs x21,x21,x13 umulh x13,x7,x26 adcs x22,x22,x14 umulh x14,x8,x26 adcs x23,x23,x15 umulh x15,x9,x26 adcs x24,x24,x16 umulh x16,x10,x26 adc x25,xzr,xzr adds x11,x11,x20 adcs x12,x12,x21 adcs x13,x13,x22 mul x26,x4,x11 adcs x14,x14,x23 adcs x15,x15,x24 adc x16,x16,x25 // mul x19,x5,x26 mul x20,x6,x26 mul x21,x7,x26 mul x22,x8,x26 mul x23,x9,x26 mul x24,x10,x26 subs xzr,x11,#1 // adds x19,x19,x11 umulh x11,x5,x26 adcs x20,x20,x12 umulh x12,x6,x26 adcs x21,x21,x13 umulh x13,x7,x26 adcs x22,x22,x14 umulh x14,x8,x26 adcs x23,x23,x15 umulh x15,x9,x26 adcs x24,x24,x16 umulh x16,x10,x26 adc x25,xzr,xzr adds x11,x11,x20 adcs x12,x12,x21 adcs x13,x13,x22 mul x26,x4,x11 adcs x14,x14,x23 adcs x15,x15,x24 adc x16,x16,x25 // mul x19,x5,x26 mul x20,x6,x26 mul x21,x7,x26 mul x22,x8,x26 mul x23,x9,x26 mul x24,x10,x26 subs xzr,x11,#1 // adds x19,x19,x11 umulh x11,x5,x26 adcs x20,x20,x12 umulh x12,x6,x26 adcs x21,x21,x13 umulh x13,x7,x26 adcs x22,x22,x14 umulh x14,x8,x26 adcs x23,x23,x15 umulh x15,x9,x26 adcs x24,x24,x16 umulh x16,x10,x26 adc x25,xzr,xzr adds x11,x11,x20 adcs x12,x12,x21 adcs x13,x13,x22 mul x26,x4,x11 adcs x14,x14,x23 adcs x15,x15,x24 adc x16,x16,x25 // mul x19,x5,x26 mul x20,x6,x26 mul x21,x7,x26 mul x22,x8,x26 mul x23,x9,x26 mul x24,x10,x26 subs xzr,x11,#1 // adds x19,x19,x11 umulh x11,x5,x26 adcs x20,x20,x12 umulh x12,x6,x26 adcs x21,x21,x13 umulh x13,x7,x26 adcs x22,x22,x14 umulh x14,x8,x26 adcs x23,x23,x15 umulh x15,x9,x26 adcs x24,x24,x16 umulh x16,x10,x26 adc x25,xzr,xzr adds x11,x11,x20 adcs x12,x12,x21 adcs x13,x13,x22 mul x26,x4,x11 adcs x14,x14,x23 adcs x15,x15,x24 adc x16,x16,x25 // mul x19,x5,x26 mul x20,x6,x26 mul x21,x7,x26 mul x22,x8,x26 mul x23,x9,x26 mul x24,x10,x26 subs xzr,x11,#1 // adds x19,x19,x11 umulh x11,x5,x26 adcs x20,x20,x12 umulh x12,x6,x26 adcs x21,x21,x13 umulh x13,x7,x26 adcs x22,x22,x14 umulh x14,x8,x26 adcs x23,x23,x15 umulh x15,x9,x26 adcs x24,x24,x16 umulh x16,x10,x26 adc x25,xzr,xzr adds x11,x11,x20 adcs x12,x12,x21 adcs x13,x13,x22 mul x26,x4,x11 adcs x14,x14,x23 adcs x15,x15,x24 adc x16,x16,x25 // mul x19,x5,x26 mul x20,x6,x26 mul x21,x7,x26 mul x22,x8,x26 mul x23,x9,x26 mul x24,x10,x26 subs xzr,x11,#1 // adds x19,x19,x11 umulh x11,x5,x26 adcs x20,x20,x12 umulh x12,x6,x26 adcs x21,x21,x13 umulh x13,x7,x26 adcs 
x22,x22,x14 umulh x14,x8,x26 adcs x23,x23,x15 umulh x15,x9,x26 adcs x24,x24,x16 umulh x16,x10,x26 adc x25,xzr,xzr adds x11,x11,x20 adcs x12,x12,x21 adcs x13,x13,x22 adcs x14,x14,x23 adcs x15,x15,x24 adc x16,x16,x25 ret .size __mul_by_1_mont_384,.-__mul_by_1_mont_384 .type __redc_tail_mont_384,%function .align 5 __redc_tail_mont_384: ldp x19,x20,[c1,#48] ldp x21,x22,[c1,#64] ldp x23,x24,[c1,#80] adds x11,x11,x19 // accumulate upper half adcs x12,x12,x20 adcs x13,x13,x21 adcs x14,x14,x22 adcs x15,x15,x23 adcs x16,x16,x24 adc x25,xzr,xzr subs x19,x11,x5 sbcs x20,x12,x6 sbcs x21,x13,x7 sbcs x22,x14,x8 sbcs x23,x15,x9 sbcs x24,x16,x10 sbcs xzr,x25,xzr csel x11,x11,x19,lo csel x12,x12,x20,lo csel x13,x13,x21,lo csel x14,x14,x22,lo csel x15,x15,x23,lo csel x16,x16,x24,lo stp x11,x12,[c0] stp x13,x14,[c0,#16] stp x15,x16,[c0,#32] ret .size __redc_tail_mont_384,.-__redc_tail_mont_384 .globl mul_384 .hidden mul_384 .type mul_384,%function .align 5 mul_384: hint #PACI_HINT stp c29,c30,[csp,#-16*__SIZEOF_POINTER__]! add c29,csp,#0 stp c19,c20,[csp,#2*__SIZEOF_POINTER__] stp c21,c22,[csp,#4*__SIZEOF_POINTER__] stp c23,c24,[csp,#6*__SIZEOF_POINTER__] stp c25,c26,[csp,#8*__SIZEOF_POINTER__] stp c27,c28,[csp,#10*__SIZEOF_POINTER__] bl __mul_384 ldr c30,[c29,#__SIZEOF_POINTER__] ldp c19,c20,[c29,#2*__SIZEOF_POINTER__] ldp c21,c22,[c29,#4*__SIZEOF_POINTER__] ldp c23,c24,[c29,#6*__SIZEOF_POINTER__] ldp c25,c26,[c29,#8*__SIZEOF_POINTER__] ldp c27,c28,[c29,#10*__SIZEOF_POINTER__] ldr c29,[csp],#16*__SIZEOF_POINTER__ hint #AUTI_HINT ret .size mul_384,.-mul_384 .type __mul_384,%function .align 5 __mul_384: ldp x11,x12,[c1] ldr x17, [c2] ldp x13,x14,[c1,#16] ldp x15,x16,[c1,#32] mul x19,x11,x17 mul x20,x12,x17 mul x21,x13,x17 mul x22,x14,x17 mul x23,x15,x17 mul x24,x16,x17 umulh x5,x11,x17 umulh x6,x12,x17 umulh x7,x13,x17 umulh x8,x14,x17 umulh x9,x15,x17 umulh x10,x16,x17 ldr x17,[c2,8*1] str x19,[c0] adds x19,x20,x5 mul x5,x11,x17 adcs x20,x21,x6 mul x6,x12,x17 adcs x21,x22,x7 mul x7,x13,x17 adcs x22,x23,x8 mul x8,x14,x17 adcs x23,x24,x9 mul x9,x15,x17 adc x24,xzr, x10 mul x10,x16,x17 adds x19,x19,x5 umulh x5,x11,x17 adcs x20,x20,x6 umulh x6,x12,x17 adcs x21,x21,x7 umulh x7,x13,x17 adcs x22,x22,x8 umulh x8,x14,x17 adcs x23,x23,x9 umulh x9,x15,x17 adcs x24,x24,x10 umulh x10,x16,x17 ldr x17,[c2,#8*(1+1)] adc x25,xzr,xzr str x19,[c0,8*1] adds x19,x20,x5 mul x5,x11,x17 adcs x20,x21,x6 mul x6,x12,x17 adcs x21,x22,x7 mul x7,x13,x17 adcs x22,x23,x8 mul x8,x14,x17 adcs x23,x24,x9 mul x9,x15,x17 adc x24,x25,x10 mul x10,x16,x17 adds x19,x19,x5 umulh x5,x11,x17 adcs x20,x20,x6 umulh x6,x12,x17 adcs x21,x21,x7 umulh x7,x13,x17 adcs x22,x22,x8 umulh x8,x14,x17 adcs x23,x23,x9 umulh x9,x15,x17 adcs x24,x24,x10 umulh x10,x16,x17 ldr x17,[c2,#8*(2+1)] adc x25,xzr,xzr str x19,[c0,8*2] adds x19,x20,x5 mul x5,x11,x17 adcs x20,x21,x6 mul x6,x12,x17 adcs x21,x22,x7 mul x7,x13,x17 adcs x22,x23,x8 mul x8,x14,x17 adcs x23,x24,x9 mul x9,x15,x17 adc x24,x25,x10 mul x10,x16,x17 adds x19,x19,x5 umulh x5,x11,x17 adcs x20,x20,x6 umulh x6,x12,x17 adcs x21,x21,x7 umulh x7,x13,x17 adcs x22,x22,x8 umulh x8,x14,x17 adcs x23,x23,x9 umulh x9,x15,x17 adcs x24,x24,x10 umulh x10,x16,x17 ldr x17,[c2,#8*(3+1)] adc x25,xzr,xzr str x19,[c0,8*3] adds x19,x20,x5 mul x5,x11,x17 adcs x20,x21,x6 mul x6,x12,x17 adcs x21,x22,x7 mul x7,x13,x17 adcs x22,x23,x8 mul x8,x14,x17 adcs x23,x24,x9 mul x9,x15,x17 adc x24,x25,x10 mul x10,x16,x17 adds x19,x19,x5 umulh x5,x11,x17 adcs x20,x20,x6 umulh x6,x12,x17 adcs x21,x21,x7 umulh x7,x13,x17 adcs x22,x22,x8 umulh 
x8,x14,x17 adcs x23,x23,x9 umulh x9,x15,x17 adcs x24,x24,x10 umulh x10,x16,x17 ldr x17,[c2,#8*(4+1)] adc x25,xzr,xzr str x19,[c0,8*4] adds x19,x20,x5 mul x5,x11,x17 adcs x20,x21,x6 mul x6,x12,x17 adcs x21,x22,x7 mul x7,x13,x17 adcs x22,x23,x8 mul x8,x14,x17 adcs x23,x24,x9 mul x9,x15,x17 adc x24,x25,x10 mul x10,x16,x17 adds x19,x19,x5 umulh x5,x11,x17 adcs x20,x20,x6 umulh x6,x12,x17 adcs x21,x21,x7 umulh x7,x13,x17 adcs x22,x22,x8 umulh x8,x14,x17 adcs x23,x23,x9 umulh x9,x15,x17 adcs x24,x24,x10 umulh x10,x16,x17 adc x25,xzr,xzr str x19,[c0,8*5] adds x19,x20,x5 adcs x20,x21,x6 adcs x21,x22,x7 adcs x22,x23,x8 adcs x23,x24,x9 adc x24,x25,x10 stp x19,x20,[c0,#48] stp x21,x22,[c0,#64] stp x23,x24,[c0,#80] ret .size __mul_384,.-__mul_384 .globl mul_382x .hidden mul_382x .type mul_382x,%function .align 5 mul_382x: hint #PACI_HINT stp c29,c30,[csp,#-16*__SIZEOF_POINTER__]! add c29,csp,#0 stp c19,c20,[csp,#2*__SIZEOF_POINTER__] stp c21,c22,[csp,#4*__SIZEOF_POINTER__] stp c23,c24,[csp,#6*__SIZEOF_POINTER__] stp c25,c26,[csp,#8*__SIZEOF_POINTER__] stp c27,c28,[csp,#10*__SIZEOF_POINTER__] sub csp,csp,#96 // space for two 384-bit vectors ldp x11,x12,[c1] mov c26,c0 // save r_ptr ldp x19,x20,[c1,#48] mov c27,c1 // save a_ptr ldp x13,x14,[c1,#16] mov c28,c2 // save b_ptr ldp x21,x22,[c1,#64] ldp x15,x16,[c1,#32] adds x5,x11,x19 // t0 = a->re + a->im ldp x23,x24,[c1,#80] adcs x6,x12,x20 ldp x11,x12,[c2] adcs x7,x13,x21 ldp x19,x20,[c2,#48] adcs x8,x14,x22 ldp x13,x14,[c2,#16] adcs x9,x15,x23 ldp x21,x22,[c2,#64] adc x10,x16,x24 ldp x15,x16,[c2,#32] stp x5,x6,[csp] adds x5,x11,x19 // t1 = b->re + b->im ldp x23,x24,[c2,#80] adcs x6,x12,x20 stp x7,x8,[csp,#16] adcs x7,x13,x21 adcs x8,x14,x22 stp x9,x10,[csp,#32] adcs x9,x15,x23 stp x5,x6,[csp,#48] adc x10,x16,x24 stp x7,x8,[csp,#64] stp x9,x10,[csp,#80] bl __mul_384 // mul_384(ret->re, a->re, b->re) add c1,csp,#0 add c2,csp,#48 add c0,c26,#96 bl __mul_384 add c1,c27,#48 add c2,c28,#48 add c0,csp,#0 bl __mul_384 ldp x5,x6,[c3] ldp x7,x8,[c3,#16] ldp x9,x10,[c3,#32] add c1,c26,#96 add c2,csp,#0 add c0,c26,#96 bl __sub_mod_384x384 add c2,c26,#0 bl __sub_mod_384x384 add c1,c26,#0 add c2,csp,#0 add c0,c26,#0 bl __sub_mod_384x384 ldr c30,[c29,#__SIZEOF_POINTER__] add csp,csp,#96 ldp c19,c20,[c29,#2*__SIZEOF_POINTER__] ldp c21,c22,[c29,#4*__SIZEOF_POINTER__] ldp c23,c24,[c29,#6*__SIZEOF_POINTER__] ldp c25,c26,[c29,#8*__SIZEOF_POINTER__] ldp c27,c28,[c29,#10*__SIZEOF_POINTER__] ldr c29,[csp],#16*__SIZEOF_POINTER__ hint #AUTI_HINT ret .size mul_382x,.-mul_382x .globl sqr_382x .hidden sqr_382x .type sqr_382x,%function .align 5 sqr_382x: hint #PACI_HINT stp c29,c30,[csp,#-16*__SIZEOF_POINTER__]! 
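// sqr_382x: complex squaring of an Fp2 element, results left unreduced.
// Below, t0 = a->re + a->im is stored to ret->re and t1 = a->re - a->im
// (with p added back on borrow, via the x25 mask) to ret->im; two
// __mul_384 calls then produce ret->re = t0*t1 = re^2 - im^2 and
// ret->im = a->re * a->im as 768-bit products, the latter doubled in place.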
add c29,csp,#0 stp c19,c20,[csp,#2*__SIZEOF_POINTER__] stp c21,c22,[csp,#4*__SIZEOF_POINTER__] stp c23,c24,[csp,#6*__SIZEOF_POINTER__] stp c25,c26,[csp,#8*__SIZEOF_POINTER__] stp c27,c28,[csp,#10*__SIZEOF_POINTER__] ldp x11,x12,[c1] ldp x19,x20,[c1,#48] ldp x13,x14,[c1,#16] adds x5,x11,x19 // t0 = a->re + a->im ldp x21,x22,[c1,#64] adcs x6,x12,x20 ldp x15,x16,[c1,#32] adcs x7,x13,x21 ldp x23,x24,[c1,#80] adcs x8,x14,x22 stp x5,x6,[c0] adcs x9,x15,x23 ldp x5,x6,[c2] adc x10,x16,x24 stp x7,x8,[c0,#16] subs x11,x11,x19 // t1 = a->re - a->im ldp x7,x8,[c2,#16] sbcs x12,x12,x20 stp x9,x10,[c0,#32] sbcs x13,x13,x21 ldp x9,x10,[c2,#32] sbcs x14,x14,x22 sbcs x15,x15,x23 sbcs x16,x16,x24 sbc x25,xzr,xzr and x19,x5,x25 and x20,x6,x25 adds x11,x11,x19 and x21,x7,x25 adcs x12,x12,x20 and x22,x8,x25 adcs x13,x13,x21 and x23,x9,x25 adcs x14,x14,x22 and x24,x10,x25 adcs x15,x15,x23 stp x11,x12,[c0,#48] adc x16,x16,x24 stp x13,x14,[c0,#64] stp x15,x16,[c0,#80] mov c4,c1 // save a_ptr add c1,c0,#0 add c2,c0,#48 bl __mul_384 add c1,c4,#0 add c2,c4,#48 add c0,c0,#96 bl __mul_384 ldr c30,[c29,#__SIZEOF_POINTER__] ldp x11,x12,[c0] ldp x13,x14,[c0,#16] adds x11,x11,x11 // add with itself ldp x15,x16,[c0,#32] adcs x12,x12,x12 adcs x13,x13,x13 adcs x14,x14,x14 adcs x15,x15,x15 adcs x16,x16,x16 adcs x19,x19,x19 adcs x20,x20,x20 stp x11,x12,[c0] adcs x21,x21,x21 stp x13,x14,[c0,#16] adcs x22,x22,x22 stp x15,x16,[c0,#32] adcs x23,x23,x23 stp x19,x20,[c0,#48] adc x24,x24,x24 stp x21,x22,[c0,#64] stp x23,x24,[c0,#80] ldp c19,c20,[c29,#2*__SIZEOF_POINTER__] ldp c21,c22,[c29,#4*__SIZEOF_POINTER__] ldp c23,c24,[c29,#6*__SIZEOF_POINTER__] ldp c25,c26,[c29,#8*__SIZEOF_POINTER__] ldp c27,c28,[c29,#10*__SIZEOF_POINTER__] ldr c29,[csp],#16*__SIZEOF_POINTER__ hint #AUTI_HINT ret .size sqr_382x,.-sqr_382x .globl sqr_mont_382x .hidden sqr_mont_382x .type sqr_mont_382x,%function .align 5 sqr_mont_382x: hint #PACI_HINT stp c29,c30,[csp,#-16*__SIZEOF_POINTER__]! 
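// sqr_mont_382x: Montgomery complex squaring. t0 = a->re + a->im and
// t1 = a->re - a->im go on the stack, with the borrow from t1 kept at
// csp+96 as an all-ones mask; __mul_mont_383_nonred then yields
// ret->im = 2*mont_mul(a->re, a->im) and ret->re = mont_mul(t0, t1),
// and the masked subtract-t0/add-p tail corrects ret->re for the case
// a->re < a->im.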
add c29,csp,#0 stp c19,c20,[csp,#2*__SIZEOF_POINTER__] stp c21,c22,[csp,#4*__SIZEOF_POINTER__] stp c23,c24,[csp,#6*__SIZEOF_POINTER__] stp c25,c26,[csp,#8*__SIZEOF_POINTER__] stp c27,c28,[csp,#10*__SIZEOF_POINTER__] stp c3,c0,[csp,#12*__SIZEOF_POINTER__] // __mul_mont_384 wants them there sub csp,csp,#112 // space for two 384-bit vectors + word mov x4,x3 // adjust for missing b_ptr ldp x11,x12,[c1] ldp x13,x14,[c1,#16] ldp x15,x16,[c1,#32] ldp x17,x20,[c1,#48] ldp x21,x22,[c1,#64] ldp x23,x24,[c1,#80] adds x5,x11,x17 // t0 = a->re + a->im adcs x6,x12,x20 adcs x7,x13,x21 adcs x8,x14,x22 adcs x9,x15,x23 adc x10,x16,x24 subs x19,x11,x17 // t1 = a->re - a->im sbcs x20,x12,x20 sbcs x21,x13,x21 sbcs x22,x14,x22 sbcs x23,x15,x23 sbcs x24,x16,x24 sbc x25,xzr,xzr // borrow flag as mask stp x5,x6,[csp] stp x7,x8,[csp,#16] stp x9,x10,[csp,#32] stp x19,x20,[csp,#48] stp x21,x22,[csp,#64] stp x23,x24,[csp,#80] str x25,[csp,#96] ldp x5,x6,[c2] ldp x7,x8,[c2,#16] ldp x9,x10,[c2,#32] add c2,c1,#48 bl __mul_mont_383_nonred // mul_mont_384(ret->im, a->re, a->im) adds x19,x11,x11 // add with itself adcs x20,x12,x12 adcs x21,x13,x13 adcs x22,x14,x14 adcs x23,x15,x15 adc x24,x16,x16 stp x19,x20,[c2,#48] stp x21,x22,[c2,#64] stp x23,x24,[c2,#80] ldp x11,x12,[csp] ldr x17,[csp,#48] ldp x13,x14,[csp,#16] ldp x15,x16,[csp,#32] add c2,csp,#48 bl __mul_mont_383_nonred // mul_mont_384(ret->re, t0, t1) ldr c30,[c29,#__SIZEOF_POINTER__] ldr x25,[csp,#96] // account for sign from a->re - a->im ldp x19,x20,[csp] ldp x21,x22,[csp,#16] ldp x23,x24,[csp,#32] and x19,x19,x25 and x20,x20,x25 and x21,x21,x25 and x22,x22,x25 and x23,x23,x25 and x24,x24,x25 subs x11,x11,x19 sbcs x12,x12,x20 sbcs x13,x13,x21 sbcs x14,x14,x22 sbcs x15,x15,x23 sbcs x16,x16,x24 sbc x25,xzr,xzr and x19,x5,x25 and x20,x6,x25 and x21,x7,x25 and x22,x8,x25 and x23,x9,x25 and x24,x10,x25 adds x11,x11,x19 adcs x12,x12,x20 adcs x13,x13,x21 adcs x14,x14,x22 adcs x15,x15,x23 adc x16,x16,x24 stp x11,x12,[c2] stp x13,x14,[c2,#16] stp x15,x16,[c2,#32] add csp,csp,#112 ldp c19,c20,[c29,#2*__SIZEOF_POINTER__] ldp c21,c22,[c29,#4*__SIZEOF_POINTER__] ldp c23,c24,[c29,#6*__SIZEOF_POINTER__] ldp c25,c26,[c29,#8*__SIZEOF_POINTER__] ldp c27,c28,[c29,#10*__SIZEOF_POINTER__] ldr c29,[csp],#16*__SIZEOF_POINTER__ hint #AUTI_HINT ret .size sqr_mont_382x,.-sqr_mont_382x .type __mul_mont_383_nonred,%function .align 5 __mul_mont_383_nonred: mul x19,x11,x17 mul x20,x12,x17 mul x21,x13,x17 mul x22,x14,x17 mul x23,x15,x17 mul x24,x16,x17 mul x4,x4,x19 umulh x26,x11,x17 umulh x27,x12,x17 umulh x28,x13,x17 umulh x0,x14,x17 umulh x1,x15,x17 umulh x3,x16,x17 adds x20,x20,x26 mul x26,x5,x4 adcs x21,x21,x27 mul x27,x6,x4 adcs x22,x22,x28 mul x28,x7,x4 adcs x23,x23,x0 mul x0,x8,x4 adcs x24,x24,x1 mul x1,x9,x4 adc x25,xzr, x3 mul x3,x10,x4 ldr x17,[c2,8*1] adds x19,x19,x26 umulh x26,x5,x4 adcs x20,x20,x27 umulh x27,x6,x4 adcs x21,x21,x28 umulh x28,x7,x4 adcs x22,x22,x0 umulh x0,x8,x4 adcs x23,x23,x1 umulh x1,x9,x4 adcs x24,x24,x3 umulh x3,x10,x4 adc x25,x25,xzr ldr x4,[c29,#12*__SIZEOF_POINTER__] adds x19,x20,x26 mul x26,x11,x17 adcs x20,x21,x27 mul x27,x12,x17 adcs x21,x22,x28 mul x28,x13,x17 adcs x22,x23,x0 mul x0,x14,x17 adcs x23,x24,x1 mul x1,x15,x17 adcs x24,x25,x3 mul x3,x16,x17 adc x25,xzr,xzr adds x19,x19,x26 umulh x26,x11,x17 adcs x20,x20,x27 umulh x27,x12,x17 adcs x21,x21,x28 mul x4,x4,x19 umulh x28,x13,x17 adcs x22,x22,x0 umulh x0,x14,x17 adcs x23,x23,x1 umulh x1,x15,x17 adcs x24,x24,x3 umulh x3,x16,x17 adc x25,x25,xzr adds x20,x20,x26 mul x26,x5,x4 adcs x21,x21,x27 mul
x27,x6,x4 adcs x22,x22,x28 mul x28,x7,x4 adcs x23,x23,x0 mul x0,x8,x4 adcs x24,x24,x1 mul x1,x9,x4 adc x25,x25,x3 mul x3,x10,x4 ldr x17,[c2,8*2] adds x19,x19,x26 umulh x26,x5,x4 adcs x20,x20,x27 umulh x27,x6,x4 adcs x21,x21,x28 umulh x28,x7,x4 adcs x22,x22,x0 umulh x0,x8,x4 adcs x23,x23,x1 umulh x1,x9,x4 adcs x24,x24,x3 umulh x3,x10,x4 adc x25,x25,xzr ldr x4,[c29,#12*__SIZEOF_POINTER__] adds x19,x20,x26 mul x26,x11,x17 adcs x20,x21,x27 mul x27,x12,x17 adcs x21,x22,x28 mul x28,x13,x17 adcs x22,x23,x0 mul x0,x14,x17 adcs x23,x24,x1 mul x1,x15,x17 adcs x24,x25,x3 mul x3,x16,x17 adc x25,xzr,xzr adds x19,x19,x26 umulh x26,x11,x17 adcs x20,x20,x27 umulh x27,x12,x17 adcs x21,x21,x28 mul x4,x4,x19 umulh x28,x13,x17 adcs x22,x22,x0 umulh x0,x14,x17 adcs x23,x23,x1 umulh x1,x15,x17 adcs x24,x24,x3 umulh x3,x16,x17 adc x25,x25,xzr adds x20,x20,x26 mul x26,x5,x4 adcs x21,x21,x27 mul x27,x6,x4 adcs x22,x22,x28 mul x28,x7,x4 adcs x23,x23,x0 mul x0,x8,x4 adcs x24,x24,x1 mul x1,x9,x4 adc x25,x25,x3 mul x3,x10,x4 ldr x17,[c2,8*3] adds x19,x19,x26 umulh x26,x5,x4 adcs x20,x20,x27 umulh x27,x6,x4 adcs x21,x21,x28 umulh x28,x7,x4 adcs x22,x22,x0 umulh x0,x8,x4 adcs x23,x23,x1 umulh x1,x9,x4 adcs x24,x24,x3 umulh x3,x10,x4 adc x25,x25,xzr ldr x4,[c29,#12*__SIZEOF_POINTER__] adds x19,x20,x26 mul x26,x11,x17 adcs x20,x21,x27 mul x27,x12,x17 adcs x21,x22,x28 mul x28,x13,x17 adcs x22,x23,x0 mul x0,x14,x17 adcs x23,x24,x1 mul x1,x15,x17 adcs x24,x25,x3 mul x3,x16,x17 adc x25,xzr,xzr adds x19,x19,x26 umulh x26,x11,x17 adcs x20,x20,x27 umulh x27,x12,x17 adcs x21,x21,x28 mul x4,x4,x19 umulh x28,x13,x17 adcs x22,x22,x0 umulh x0,x14,x17 adcs x23,x23,x1 umulh x1,x15,x17 adcs x24,x24,x3 umulh x3,x16,x17 adc x25,x25,xzr adds x20,x20,x26 mul x26,x5,x4 adcs x21,x21,x27 mul x27,x6,x4 adcs x22,x22,x28 mul x28,x7,x4 adcs x23,x23,x0 mul x0,x8,x4 adcs x24,x24,x1 mul x1,x9,x4 adc x25,x25,x3 mul x3,x10,x4 ldr x17,[c2,8*4] adds x19,x19,x26 umulh x26,x5,x4 adcs x20,x20,x27 umulh x27,x6,x4 adcs x21,x21,x28 umulh x28,x7,x4 adcs x22,x22,x0 umulh x0,x8,x4 adcs x23,x23,x1 umulh x1,x9,x4 adcs x24,x24,x3 umulh x3,x10,x4 adc x25,x25,xzr ldr x4,[c29,#12*__SIZEOF_POINTER__] adds x19,x20,x26 mul x26,x11,x17 adcs x20,x21,x27 mul x27,x12,x17 adcs x21,x22,x28 mul x28,x13,x17 adcs x22,x23,x0 mul x0,x14,x17 adcs x23,x24,x1 mul x1,x15,x17 adcs x24,x25,x3 mul x3,x16,x17 adc x25,xzr,xzr adds x19,x19,x26 umulh x26,x11,x17 adcs x20,x20,x27 umulh x27,x12,x17 adcs x21,x21,x28 mul x4,x4,x19 umulh x28,x13,x17 adcs x22,x22,x0 umulh x0,x14,x17 adcs x23,x23,x1 umulh x1,x15,x17 adcs x24,x24,x3 umulh x3,x16,x17 adc x25,x25,xzr adds x20,x20,x26 mul x26,x5,x4 adcs x21,x21,x27 mul x27,x6,x4 adcs x22,x22,x28 mul x28,x7,x4 adcs x23,x23,x0 mul x0,x8,x4 adcs x24,x24,x1 mul x1,x9,x4 adc x25,x25,x3 mul x3,x10,x4 ldr x17,[c2,8*5] adds x19,x19,x26 umulh x26,x5,x4 adcs x20,x20,x27 umulh x27,x6,x4 adcs x21,x21,x28 umulh x28,x7,x4 adcs x22,x22,x0 umulh x0,x8,x4 adcs x23,x23,x1 umulh x1,x9,x4 adcs x24,x24,x3 umulh x3,x10,x4 adc x25,x25,xzr ldr x4,[c29,#12*__SIZEOF_POINTER__] adds x19,x20,x26 mul x26,x11,x17 adcs x20,x21,x27 mul x27,x12,x17 adcs x21,x22,x28 mul x28,x13,x17 adcs x22,x23,x0 mul x0,x14,x17 adcs x23,x24,x1 mul x1,x15,x17 adcs x24,x25,x3 mul x3,x16,x17 adc x25,xzr,xzr adds x19,x19,x26 umulh x26,x11,x17 adcs x20,x20,x27 umulh x27,x12,x17 adcs x21,x21,x28 mul x4,x4,x19 umulh x28,x13,x17 adcs x22,x22,x0 umulh x0,x14,x17 adcs x23,x23,x1 umulh x1,x15,x17 adcs x24,x24,x3 umulh x3,x16,x17 adc x25,x25,xzr adds x20,x20,x26 mul x26,x5,x4 adcs x21,x21,x27 mul x27,x6,x4 adcs 
x22,x22,x28 mul x28,x7,x4 adcs x23,x23,x0 mul x0,x8,x4 adcs x24,x24,x1 mul x1,x9,x4 adc x25,x25,x3 mul x3,x10,x4 adds x19,x19,x26 umulh x26,x5,x4 adcs x20,x20,x27 umulh x27,x6,x4 adcs x21,x21,x28 umulh x28,x7,x4 adcs x22,x22,x0 umulh x0,x8,x4 adcs x23,x23,x1 umulh x1,x9,x4 adcs x24,x24,x3 umulh x3,x10,x4 adc x25,x25,xzr ldp c4,c2,[c29,#12*__SIZEOF_POINTER__] // pull r_ptr adds x11,x20,x26 adcs x12,x21,x27 adcs x13,x22,x28 adcs x14,x23,x0 adcs x15,x24,x1 adcs x16,x25,x3 ret .size __mul_mont_383_nonred,.-__mul_mont_383_nonred .globl sgn0_pty_mont_384 .hidden sgn0_pty_mont_384 .type sgn0_pty_mont_384,%function .align 5 sgn0_pty_mont_384: hint #PACI_HINT stp c29,c30,[csp,#-16*__SIZEOF_POINTER__]! add c29,csp,#0 stp c19,c20,[csp,#2*__SIZEOF_POINTER__] stp c21,c22,[csp,#4*__SIZEOF_POINTER__] stp c23,c24,[csp,#6*__SIZEOF_POINTER__] stp c25,c26,[csp,#8*__SIZEOF_POINTER__] stp c27,c28,[csp,#10*__SIZEOF_POINTER__] mov x4,x2 ldp x5,x6,[c1] ldp x7,x8,[c1,#16] ldp x9,x10,[c1,#32] mov c1,c0 bl __mul_by_1_mont_384 ldr c30,[c29,#__SIZEOF_POINTER__] and x0,x11,#1 adds x11,x11,x11 adcs x12,x12,x12 adcs x13,x13,x13 adcs x14,x14,x14 adcs x15,x15,x15 adcs x16,x16,x16 adc x17,xzr,xzr subs x11,x11,x5 sbcs x12,x12,x6 sbcs x13,x13,x7 sbcs x14,x14,x8 sbcs x15,x15,x9 sbcs x16,x16,x10 sbc x17,x17,xzr mvn x17,x17 and x17,x17,#2 orr x0,x0,x17 ldp c19,c20,[c29,#2*__SIZEOF_POINTER__] ldp c21,c22,[c29,#4*__SIZEOF_POINTER__] ldp c23,c24,[c29,#6*__SIZEOF_POINTER__] ldp c25,c26,[c29,#8*__SIZEOF_POINTER__] ldp c27,c28,[c29,#10*__SIZEOF_POINTER__] ldr c29,[csp],#16*__SIZEOF_POINTER__ hint #AUTI_HINT ret .size sgn0_pty_mont_384,.-sgn0_pty_mont_384 .globl sgn0_pty_mont_384x .hidden sgn0_pty_mont_384x .type sgn0_pty_mont_384x,%function .align 5 sgn0_pty_mont_384x: hint #PACI_HINT stp c29,c30,[csp,#-16*__SIZEOF_POINTER__]! add c29,csp,#0 stp c19,c20,[csp,#2*__SIZEOF_POINTER__] stp c21,c22,[csp,#4*__SIZEOF_POINTER__] stp c23,c24,[csp,#6*__SIZEOF_POINTER__] stp c25,c26,[csp,#8*__SIZEOF_POINTER__] stp c27,c28,[csp,#10*__SIZEOF_POINTER__] mov x4,x2 ldp x5,x6,[c1] ldp x7,x8,[c1,#16] ldp x9,x10,[c1,#32] mov c1,c0 bl __mul_by_1_mont_384 add c1,c1,#48 and x2,x11,#1 orr x3,x11,x12 adds x11,x11,x11 orr x3,x3,x13 adcs x12,x12,x12 orr x3,x3,x14 adcs x13,x13,x13 orr x3,x3,x15 adcs x14,x14,x14 orr x3,x3,x16 adcs x15,x15,x15 adcs x16,x16,x16 adc x17,xzr,xzr subs x11,x11,x5 sbcs x12,x12,x6 sbcs x13,x13,x7 sbcs x14,x14,x8 sbcs x15,x15,x9 sbcs x16,x16,x10 sbc x17,x17,xzr mvn x17,x17 and x17,x17,#2 orr x2,x2,x17 bl __mul_by_1_mont_384 ldr c30,[c29,#__SIZEOF_POINTER__] and x0,x11,#1 orr x1,x11,x12 adds x11,x11,x11 orr x1,x1,x13 adcs x12,x12,x12 orr x1,x1,x14 adcs x13,x13,x13 orr x1,x1,x15 adcs x14,x14,x14 orr x1,x1,x16 adcs x15,x15,x15 adcs x16,x16,x16 adc x17,xzr,xzr subs x11,x11,x5 sbcs x12,x12,x6 sbcs x13,x13,x7 sbcs x14,x14,x8 sbcs x15,x15,x9 sbcs x16,x16,x10 sbc x17,x17,xzr mvn x17,x17 and x17,x17,#2 orr x0,x0,x17 cmp x3,#0 csel x3,x0,x2,eq // a->re==0? prty(a->im) : prty(a->re) cmp x1,#0 csel x1,x0,x2,ne // a->im!=0? 
sgn0(a->im) : sgn0(a->re) and x3,x3,#1 and x1,x1,#2 orr x0,x1,x3 // pack sign and parity ldp c19,c20,[c29,#2*__SIZEOF_POINTER__] ldp c21,c22,[c29,#4*__SIZEOF_POINTER__] ldp c23,c24,[c29,#6*__SIZEOF_POINTER__] ldp c25,c26,[c29,#8*__SIZEOF_POINTER__] ldp c27,c28,[c29,#10*__SIZEOF_POINTER__] ldr c29,[csp],#16*__SIZEOF_POINTER__ hint #AUTI_HINT ret .size sgn0_pty_mont_384x,.-sgn0_pty_mont_384x #if defined(__ARM_FEATURE_BTI_DEFAULT) || defined(__ARM_FEATURE_PAC_DEFAULT) .section .note.GNU-stack,"",@progbits .section .note.gnu.property,"a",@note .long 4,2f-1f,5 .byte 0x47,0x4E,0x55,0 1: .long 0xc0000000,4,3 .align 3 2: #endif ================================================ FILE: build/cheri/sha256-armv8.S ================================================ #if defined(__ARM_FEATURE_PAC_DEFAULT) && __ARM_FEATURE_PAC_DEFAULT==2 # define PACI_HINT 27 # define AUTI_HINT 31 #else # define PACI_HINT 25 # define AUTI_HINT 29 #endif // // Copyright Supranational LLC // Licensed under the Apache License, Version 2.0, see LICENSE for details. // SPDX-License-Identifier: Apache-2.0 // // ==================================================================== // Written by Andy Polyakov, @dot-asm, initially for the OpenSSL // project. // ==================================================================== // // sha256_block procedure for ARMv8. // // This module is stripped of scalar code paths, with rationale that all // known processors are NEON-capable. // // See original module at CRYPTOGAMS for further details. .comm __blst_platform_cap,4 .text .align 6 .type .LK256,%object .LK256: .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 .long 0 //terminator .size .LK256,.-.LK256 .byte 83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,64,100,111,116,45,97,115,109,0 .align 2 .align 2 .globl blst_sha256_block_armv8 .hidden blst_sha256_block_armv8 .type blst_sha256_block_armv8,%function .align 6 blst_sha256_block_armv8: hint #34 .Lv8_entry: stp c29,c30,[csp,#-2*__SIZEOF_POINTER__]! 
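// Hardware path: each .Loop_hw iteration consumes one 64-byte block
// (v4-v7), byte-swaps it with rev32, and runs the compression rounds
// with the ARMv8 Crypto instructions sha256h/sha256h2/sha256su0/sha256su1
// (emitted as raw .inst words), streaming round constants from .LK256
// via c3.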
add c29,csp,#0 ld1 {v0.4s,v1.4s},[c0] adr c3,.LK256 .Loop_hw: ld1 {v4.16b,v5.16b,v6.16b,v7.16b},[c1],#64 sub x2,x2,#1 ld1 {v16.4s},[c3],#16 rev32 v4.16b,v4.16b rev32 v5.16b,v5.16b rev32 v6.16b,v6.16b rev32 v7.16b,v7.16b orr v18.16b,v0.16b,v0.16b // offload orr v19.16b,v1.16b,v1.16b ld1 {v17.4s},[c3],#16 add v16.4s,v16.4s,v4.4s .inst 0x5e2828a4 //sha256su0 v4.16b,v5.16b orr v2.16b,v0.16b,v0.16b .inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s .inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s .inst 0x5e0760c4 //sha256su1 v4.16b,v6.16b,v7.16b ld1 {v16.4s},[c3],#16 add v17.4s,v17.4s,v5.4s .inst 0x5e2828c5 //sha256su0 v5.16b,v6.16b orr v2.16b,v0.16b,v0.16b .inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s .inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s .inst 0x5e0460e5 //sha256su1 v5.16b,v7.16b,v4.16b ld1 {v17.4s},[c3],#16 add v16.4s,v16.4s,v6.4s .inst 0x5e2828e6 //sha256su0 v6.16b,v7.16b orr v2.16b,v0.16b,v0.16b .inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s .inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s .inst 0x5e056086 //sha256su1 v6.16b,v4.16b,v5.16b ld1 {v16.4s},[c3],#16 add v17.4s,v17.4s,v7.4s .inst 0x5e282887 //sha256su0 v7.16b,v4.16b orr v2.16b,v0.16b,v0.16b .inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s .inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s .inst 0x5e0660a7 //sha256su1 v7.16b,v5.16b,v6.16b ld1 {v17.4s},[c3],#16 add v16.4s,v16.4s,v4.4s .inst 0x5e2828a4 //sha256su0 v4.16b,v5.16b orr v2.16b,v0.16b,v0.16b .inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s .inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s .inst 0x5e0760c4 //sha256su1 v4.16b,v6.16b,v7.16b ld1 {v16.4s},[c3],#16 add v17.4s,v17.4s,v5.4s .inst 0x5e2828c5 //sha256su0 v5.16b,v6.16b orr v2.16b,v0.16b,v0.16b .inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s .inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s .inst 0x5e0460e5 //sha256su1 v5.16b,v7.16b,v4.16b ld1 {v17.4s},[c3],#16 add v16.4s,v16.4s,v6.4s .inst 0x5e2828e6 //sha256su0 v6.16b,v7.16b orr v2.16b,v0.16b,v0.16b .inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s .inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s .inst 0x5e056086 //sha256su1 v6.16b,v4.16b,v5.16b ld1 {v16.4s},[c3],#16 add v17.4s,v17.4s,v7.4s .inst 0x5e282887 //sha256su0 v7.16b,v4.16b orr v2.16b,v0.16b,v0.16b .inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s .inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s .inst 0x5e0660a7 //sha256su1 v7.16b,v5.16b,v6.16b ld1 {v17.4s},[c3],#16 add v16.4s,v16.4s,v4.4s .inst 0x5e2828a4 //sha256su0 v4.16b,v5.16b orr v2.16b,v0.16b,v0.16b .inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s .inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s .inst 0x5e0760c4 //sha256su1 v4.16b,v6.16b,v7.16b ld1 {v16.4s},[c3],#16 add v17.4s,v17.4s,v5.4s .inst 0x5e2828c5 //sha256su0 v5.16b,v6.16b orr v2.16b,v0.16b,v0.16b .inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s .inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s .inst 0x5e0460e5 //sha256su1 v5.16b,v7.16b,v4.16b ld1 {v17.4s},[c3],#16 add v16.4s,v16.4s,v6.4s .inst 0x5e2828e6 //sha256su0 v6.16b,v7.16b orr v2.16b,v0.16b,v0.16b .inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s .inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s .inst 0x5e056086 //sha256su1 v6.16b,v4.16b,v5.16b ld1 {v16.4s},[c3],#16 add v17.4s,v17.4s,v7.4s .inst 0x5e282887 //sha256su0 v7.16b,v4.16b orr v2.16b,v0.16b,v0.16b .inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s .inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s .inst 0x5e0660a7 //sha256su1 v7.16b,v5.16b,v6.16b ld1 {v17.4s},[c3],#16 add v16.4s,v16.4s,v4.4s orr v2.16b,v0.16b,v0.16b .inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s .inst 0x5e105041 
//sha256h2 v1.16b,v2.16b,v16.4s ld1 {v16.4s},[c3],#16 add v17.4s,v17.4s,v5.4s orr v2.16b,v0.16b,v0.16b .inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s .inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s ld1 {v17.4s},[c3] add v16.4s,v16.4s,v6.4s sub x3,x3,#64*4-16 // rewind orr v2.16b,v0.16b,v0.16b .inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s .inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s add v17.4s,v17.4s,v7.4s orr v2.16b,v0.16b,v0.16b .inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s .inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s add v0.4s,v0.4s,v18.4s add v1.4s,v1.4s,v19.4s cbnz x2,.Loop_hw st1 {v0.4s,v1.4s},[c0] ldr c29,[csp],#2*__SIZEOF_POINTER__ ret .size blst_sha256_block_armv8,.-blst_sha256_block_armv8 .globl blst_sha256_block_data_order .hidden blst_sha256_block_data_order .type blst_sha256_block_data_order,%function .align 4 blst_sha256_block_data_order: hint #34 adrp c16,__blst_platform_cap ldr w16,[c16,#:lo12:__blst_platform_cap] tst w16,#1 b.ne .Lv8_entry stp c29, c30, [csp, #-2*__SIZEOF_POINTER__]! mov c29, csp sub csp,csp,#16*4 adr c16,.LK256 add x2,x1,x2,lsl#6 // len to point at the end of inp ld1 {v0.16b},[c1], #16 ld1 {v1.16b},[c1], #16 ld1 {v2.16b},[c1], #16 ld1 {v3.16b},[c1], #16 ld1 {v4.4s},[c16], #16 ld1 {v5.4s},[c16], #16 ld1 {v6.4s},[c16], #16 ld1 {v7.4s},[c16], #16 rev32 v0.16b,v0.16b // yes, even on rev32 v1.16b,v1.16b // big-endian rev32 v2.16b,v2.16b rev32 v3.16b,v3.16b mov c17,csp add v4.4s,v4.4s,v0.4s add v5.4s,v5.4s,v1.4s add v6.4s,v6.4s,v2.4s st1 {v4.4s,v5.4s},[c17], #32 add v7.4s,v7.4s,v3.4s st1 {v6.4s,v7.4s},[c17] sub c17,c17,#32 ldp w3,w4,[c0] ldp w5,w6,[c0,#8] ldp w7,w8,[c0,#16] ldp w9,w10,[c0,#24] ldr w12,[csp,#0] mov w13,wzr eor w14,w4,w5 mov w15,wzr b .L_00_48 .align 4 .L_00_48: ext v4.16b,v0.16b,v1.16b,#4 add w10,w10,w12 add w3,w3,w15 and w12,w8,w7 bic w15,w9,w7 ext v7.16b,v2.16b,v3.16b,#4 eor w11,w7,w7,ror#5 add w3,w3,w13 mov d19,v3.d[1] orr w12,w12,w15 eor w11,w11,w7,ror#19 ushr v6.4s,v4.4s,#7 eor w15,w3,w3,ror#11 ushr v5.4s,v4.4s,#3 add w10,w10,w12 add v0.4s,v0.4s,v7.4s ror w11,w11,#6 sli v6.4s,v4.4s,#25 eor w13,w3,w4 eor w15,w15,w3,ror#20 ushr v7.4s,v4.4s,#18 add w10,w10,w11 ldr w12,[csp,#4] and w14,w14,w13 eor v5.16b,v5.16b,v6.16b ror w15,w15,#2 add w6,w6,w10 sli v7.4s,v4.4s,#14 eor w14,w14,w4 ushr v16.4s,v19.4s,#17 add w9,w9,w12 add w10,w10,w15 and w12,w7,w6 eor v5.16b,v5.16b,v7.16b bic w15,w8,w6 eor w11,w6,w6,ror#5 sli v16.4s,v19.4s,#15 add w10,w10,w14 orr w12,w12,w15 ushr v17.4s,v19.4s,#10 eor w11,w11,w6,ror#19 eor w15,w10,w10,ror#11 ushr v7.4s,v19.4s,#19 add w9,w9,w12 ror w11,w11,#6 add v0.4s,v0.4s,v5.4s eor w14,w10,w3 eor w15,w15,w10,ror#20 sli v7.4s,v19.4s,#13 add w9,w9,w11 ldr w12,[csp,#8] and w13,w13,w14 eor v17.16b,v17.16b,v16.16b ror w15,w15,#2 add w5,w5,w9 eor w13,w13,w3 eor v17.16b,v17.16b,v7.16b add w8,w8,w12 add w9,w9,w15 and w12,w6,w5 add v0.4s,v0.4s,v17.4s bic w15,w7,w5 eor w11,w5,w5,ror#5 add w9,w9,w13 ushr v18.4s,v0.4s,#17 orr w12,w12,w15 ushr v19.4s,v0.4s,#10 eor w11,w11,w5,ror#19 eor w15,w9,w9,ror#11 sli v18.4s,v0.4s,#15 add w8,w8,w12 ushr v17.4s,v0.4s,#19 ror w11,w11,#6 eor w13,w9,w10 eor v19.16b,v19.16b,v18.16b eor w15,w15,w9,ror#20 add w8,w8,w11 sli v17.4s,v0.4s,#13 ldr w12,[csp,#12] and w14,w14,w13 ror w15,w15,#2 ld1 {v4.4s},[c16], #16 add w4,w4,w8 eor v19.16b,v19.16b,v17.16b eor w14,w14,w10 eor v17.16b,v17.16b,v17.16b add w7,w7,w12 add w8,w8,w15 and w12,w5,w4 mov v17.d[1],v19.d[0] bic w15,w6,w4 eor w11,w4,w4,ror#5 add w8,w8,w14 add v0.4s,v0.4s,v17.4s orr w12,w12,w15 eor w11,w11,w4,ror#19 eor w15,w8,w8,ror#11 add 
v4.4s,v4.4s,v0.4s add w7,w7,w12 ror w11,w11,#6 eor w14,w8,w9 eor w15,w15,w8,ror#20 add w7,w7,w11 ldr w12,[csp,#16] and w13,w13,w14 ror w15,w15,#2 add w3,w3,w7 eor w13,w13,w9 st1 {v4.4s},[c17], #16 ext v4.16b,v1.16b,v2.16b,#4 add w6,w6,w12 add w7,w7,w15 and w12,w4,w3 bic w15,w5,w3 ext v7.16b,v3.16b,v0.16b,#4 eor w11,w3,w3,ror#5 add w7,w7,w13 mov d19,v0.d[1] orr w12,w12,w15 eor w11,w11,w3,ror#19 ushr v6.4s,v4.4s,#7 eor w15,w7,w7,ror#11 ushr v5.4s,v4.4s,#3 add w6,w6,w12 add v1.4s,v1.4s,v7.4s ror w11,w11,#6 sli v6.4s,v4.4s,#25 eor w13,w7,w8 eor w15,w15,w7,ror#20 ushr v7.4s,v4.4s,#18 add w6,w6,w11 ldr w12,[csp,#20] and w14,w14,w13 eor v5.16b,v5.16b,v6.16b ror w15,w15,#2 add w10,w10,w6 sli v7.4s,v4.4s,#14 eor w14,w14,w8 ushr v16.4s,v19.4s,#17 add w5,w5,w12 add w6,w6,w15 and w12,w3,w10 eor v5.16b,v5.16b,v7.16b bic w15,w4,w10 eor w11,w10,w10,ror#5 sli v16.4s,v19.4s,#15 add w6,w6,w14 orr w12,w12,w15 ushr v17.4s,v19.4s,#10 eor w11,w11,w10,ror#19 eor w15,w6,w6,ror#11 ushr v7.4s,v19.4s,#19 add w5,w5,w12 ror w11,w11,#6 add v1.4s,v1.4s,v5.4s eor w14,w6,w7 eor w15,w15,w6,ror#20 sli v7.4s,v19.4s,#13 add w5,w5,w11 ldr w12,[csp,#24] and w13,w13,w14 eor v17.16b,v17.16b,v16.16b ror w15,w15,#2 add w9,w9,w5 eor w13,w13,w7 eor v17.16b,v17.16b,v7.16b add w4,w4,w12 add w5,w5,w15 and w12,w10,w9 add v1.4s,v1.4s,v17.4s bic w15,w3,w9 eor w11,w9,w9,ror#5 add w5,w5,w13 ushr v18.4s,v1.4s,#17 orr w12,w12,w15 ushr v19.4s,v1.4s,#10 eor w11,w11,w9,ror#19 eor w15,w5,w5,ror#11 sli v18.4s,v1.4s,#15 add w4,w4,w12 ushr v17.4s,v1.4s,#19 ror w11,w11,#6 eor w13,w5,w6 eor v19.16b,v19.16b,v18.16b eor w15,w15,w5,ror#20 add w4,w4,w11 sli v17.4s,v1.4s,#13 ldr w12,[csp,#28] and w14,w14,w13 ror w15,w15,#2 ld1 {v4.4s},[c16], #16 add w8,w8,w4 eor v19.16b,v19.16b,v17.16b eor w14,w14,w6 eor v17.16b,v17.16b,v17.16b add w3,w3,w12 add w4,w4,w15 and w12,w9,w8 mov v17.d[1],v19.d[0] bic w15,w10,w8 eor w11,w8,w8,ror#5 add w4,w4,w14 add v1.4s,v1.4s,v17.4s orr w12,w12,w15 eor w11,w11,w8,ror#19 eor w15,w4,w4,ror#11 add v4.4s,v4.4s,v1.4s add w3,w3,w12 ror w11,w11,#6 eor w14,w4,w5 eor w15,w15,w4,ror#20 add w3,w3,w11 ldr w12,[csp,#32] and w13,w13,w14 ror w15,w15,#2 add w7,w7,w3 eor w13,w13,w5 st1 {v4.4s},[c17], #16 ext v4.16b,v2.16b,v3.16b,#4 add w10,w10,w12 add w3,w3,w15 and w12,w8,w7 bic w15,w9,w7 ext v7.16b,v0.16b,v1.16b,#4 eor w11,w7,w7,ror#5 add w3,w3,w13 mov d19,v1.d[1] orr w12,w12,w15 eor w11,w11,w7,ror#19 ushr v6.4s,v4.4s,#7 eor w15,w3,w3,ror#11 ushr v5.4s,v4.4s,#3 add w10,w10,w12 add v2.4s,v2.4s,v7.4s ror w11,w11,#6 sli v6.4s,v4.4s,#25 eor w13,w3,w4 eor w15,w15,w3,ror#20 ushr v7.4s,v4.4s,#18 add w10,w10,w11 ldr w12,[csp,#36] and w14,w14,w13 eor v5.16b,v5.16b,v6.16b ror w15,w15,#2 add w6,w6,w10 sli v7.4s,v4.4s,#14 eor w14,w14,w4 ushr v16.4s,v19.4s,#17 add w9,w9,w12 add w10,w10,w15 and w12,w7,w6 eor v5.16b,v5.16b,v7.16b bic w15,w8,w6 eor w11,w6,w6,ror#5 sli v16.4s,v19.4s,#15 add w10,w10,w14 orr w12,w12,w15 ushr v17.4s,v19.4s,#10 eor w11,w11,w6,ror#19 eor w15,w10,w10,ror#11 ushr v7.4s,v19.4s,#19 add w9,w9,w12 ror w11,w11,#6 add v2.4s,v2.4s,v5.4s eor w14,w10,w3 eor w15,w15,w10,ror#20 sli v7.4s,v19.4s,#13 add w9,w9,w11 ldr w12,[csp,#40] and w13,w13,w14 eor v17.16b,v17.16b,v16.16b ror w15,w15,#2 add w5,w5,w9 eor w13,w13,w3 eor v17.16b,v17.16b,v7.16b add w8,w8,w12 add w9,w9,w15 and w12,w6,w5 add v2.4s,v2.4s,v17.4s bic w15,w7,w5 eor w11,w5,w5,ror#5 add w9,w9,w13 ushr v18.4s,v2.4s,#17 orr w12,w12,w15 ushr v19.4s,v2.4s,#10 eor w11,w11,w5,ror#19 eor w15,w9,w9,ror#11 sli v18.4s,v2.4s,#15 add w8,w8,w12 ushr v17.4s,v2.4s,#19 ror w11,w11,#6 eor w13,w9,w10 eor 
v19.16b,v19.16b,v18.16b eor w15,w15,w9,ror#20 add w8,w8,w11 sli v17.4s,v2.4s,#13 ldr w12,[csp,#44] and w14,w14,w13 ror w15,w15,#2 ld1 {v4.4s},[c16], #16 add w4,w4,w8 eor v19.16b,v19.16b,v17.16b eor w14,w14,w10 eor v17.16b,v17.16b,v17.16b add w7,w7,w12 add w8,w8,w15 and w12,w5,w4 mov v17.d[1],v19.d[0] bic w15,w6,w4 eor w11,w4,w4,ror#5 add w8,w8,w14 add v2.4s,v2.4s,v17.4s orr w12,w12,w15 eor w11,w11,w4,ror#19 eor w15,w8,w8,ror#11 add v4.4s,v4.4s,v2.4s add w7,w7,w12 ror w11,w11,#6 eor w14,w8,w9 eor w15,w15,w8,ror#20 add w7,w7,w11 ldr w12,[csp,#48] and w13,w13,w14 ror w15,w15,#2 add w3,w3,w7 eor w13,w13,w9 st1 {v4.4s},[c17], #16 ext v4.16b,v3.16b,v0.16b,#4 add w6,w6,w12 add w7,w7,w15 and w12,w4,w3 bic w15,w5,w3 ext v7.16b,v1.16b,v2.16b,#4 eor w11,w3,w3,ror#5 add w7,w7,w13 mov d19,v2.d[1] orr w12,w12,w15 eor w11,w11,w3,ror#19 ushr v6.4s,v4.4s,#7 eor w15,w7,w7,ror#11 ushr v5.4s,v4.4s,#3 add w6,w6,w12 add v3.4s,v3.4s,v7.4s ror w11,w11,#6 sli v6.4s,v4.4s,#25 eor w13,w7,w8 eor w15,w15,w7,ror#20 ushr v7.4s,v4.4s,#18 add w6,w6,w11 ldr w12,[csp,#52] and w14,w14,w13 eor v5.16b,v5.16b,v6.16b ror w15,w15,#2 add w10,w10,w6 sli v7.4s,v4.4s,#14 eor w14,w14,w8 ushr v16.4s,v19.4s,#17 add w5,w5,w12 add w6,w6,w15 and w12,w3,w10 eor v5.16b,v5.16b,v7.16b bic w15,w4,w10 eor w11,w10,w10,ror#5 sli v16.4s,v19.4s,#15 add w6,w6,w14 orr w12,w12,w15 ushr v17.4s,v19.4s,#10 eor w11,w11,w10,ror#19 eor w15,w6,w6,ror#11 ushr v7.4s,v19.4s,#19 add w5,w5,w12 ror w11,w11,#6 add v3.4s,v3.4s,v5.4s eor w14,w6,w7 eor w15,w15,w6,ror#20 sli v7.4s,v19.4s,#13 add w5,w5,w11 ldr w12,[csp,#56] and w13,w13,w14 eor v17.16b,v17.16b,v16.16b ror w15,w15,#2 add w9,w9,w5 eor w13,w13,w7 eor v17.16b,v17.16b,v7.16b add w4,w4,w12 add w5,w5,w15 and w12,w10,w9 add v3.4s,v3.4s,v17.4s bic w15,w3,w9 eor w11,w9,w9,ror#5 add w5,w5,w13 ushr v18.4s,v3.4s,#17 orr w12,w12,w15 ushr v19.4s,v3.4s,#10 eor w11,w11,w9,ror#19 eor w15,w5,w5,ror#11 sli v18.4s,v3.4s,#15 add w4,w4,w12 ushr v17.4s,v3.4s,#19 ror w11,w11,#6 eor w13,w5,w6 eor v19.16b,v19.16b,v18.16b eor w15,w15,w5,ror#20 add w4,w4,w11 sli v17.4s,v3.4s,#13 ldr w12,[csp,#60] and w14,w14,w13 ror w15,w15,#2 ld1 {v4.4s},[c16], #16 add w8,w8,w4 eor v19.16b,v19.16b,v17.16b eor w14,w14,w6 eor v17.16b,v17.16b,v17.16b add w3,w3,w12 add w4,w4,w15 and w12,w9,w8 mov v17.d[1],v19.d[0] bic w15,w10,w8 eor w11,w8,w8,ror#5 add w4,w4,w14 add v3.4s,v3.4s,v17.4s orr w12,w12,w15 eor w11,w11,w8,ror#19 eor w15,w4,w4,ror#11 add v4.4s,v4.4s,v3.4s add w3,w3,w12 ror w11,w11,#6 eor w14,w4,w5 eor w15,w15,w4,ror#20 add w3,w3,w11 ldr w12,[c16] and w13,w13,w14 ror w15,w15,#2 add w7,w7,w3 eor w13,w13,w5 st1 {v4.4s},[c17], #16 cmp w12,#0 // check for K256 terminator ldr w12,[csp,#0] sub c17,c17,#64 bne .L_00_48 sub c16,c16,#256 cmp x1,x2 mov x17, #-64 csel x17, x17, xzr, eq add c1,c1,x17 mov c17,csp add w10,w10,w12 add w3,w3,w15 and w12,w8,w7 ld1 {v0.16b},[c1],#16 bic w15,w9,w7 eor w11,w7,w7,ror#5 ld1 {v4.4s},[c16],#16 add w3,w3,w13 orr w12,w12,w15 eor w11,w11,w7,ror#19 eor w15,w3,w3,ror#11 rev32 v0.16b,v0.16b add w10,w10,w12 ror w11,w11,#6 eor w13,w3,w4 eor w15,w15,w3,ror#20 add v4.4s,v4.4s,v0.4s add w10,w10,w11 ldr w12,[csp,#4] and w14,w14,w13 ror w15,w15,#2 add w6,w6,w10 eor w14,w14,w4 add w9,w9,w12 add w10,w10,w15 and w12,w7,w6 bic w15,w8,w6 eor w11,w6,w6,ror#5 add w10,w10,w14 orr w12,w12,w15 eor w11,w11,w6,ror#19 eor w15,w10,w10,ror#11 add w9,w9,w12 ror w11,w11,#6 eor w14,w10,w3 eor w15,w15,w10,ror#20 add w9,w9,w11 ldr w12,[csp,#8] and w13,w13,w14 ror w15,w15,#2 add w5,w5,w9 eor w13,w13,w3 add w8,w8,w12 add w9,w9,w15 and w12,w6,w5 bic 
w15,w7,w5 eor w11,w5,w5,ror#5 add w9,w9,w13 orr w12,w12,w15 eor w11,w11,w5,ror#19 eor w15,w9,w9,ror#11 add w8,w8,w12 ror w11,w11,#6 eor w13,w9,w10 eor w15,w15,w9,ror#20 add w8,w8,w11 ldr w12,[csp,#12] and w14,w14,w13 ror w15,w15,#2 add w4,w4,w8 eor w14,w14,w10 add w7,w7,w12 add w8,w8,w15 and w12,w5,w4 bic w15,w6,w4 eor w11,w4,w4,ror#5 add w8,w8,w14 orr w12,w12,w15 eor w11,w11,w4,ror#19 eor w15,w8,w8,ror#11 add w7,w7,w12 ror w11,w11,#6 eor w14,w8,w9 eor w15,w15,w8,ror#20 add w7,w7,w11 ldr w12,[csp,#16] and w13,w13,w14 ror w15,w15,#2 add w3,w3,w7 eor w13,w13,w9 st1 {v4.4s},[c17], #16 add w6,w6,w12 add w7,w7,w15 and w12,w4,w3 ld1 {v1.16b},[c1],#16 bic w15,w5,w3 eor w11,w3,w3,ror#5 ld1 {v4.4s},[c16],#16 add w7,w7,w13 orr w12,w12,w15 eor w11,w11,w3,ror#19 eor w15,w7,w7,ror#11 rev32 v1.16b,v1.16b add w6,w6,w12 ror w11,w11,#6 eor w13,w7,w8 eor w15,w15,w7,ror#20 add v4.4s,v4.4s,v1.4s add w6,w6,w11 ldr w12,[csp,#20] and w14,w14,w13 ror w15,w15,#2 add w10,w10,w6 eor w14,w14,w8 add w5,w5,w12 add w6,w6,w15 and w12,w3,w10 bic w15,w4,w10 eor w11,w10,w10,ror#5 add w6,w6,w14 orr w12,w12,w15 eor w11,w11,w10,ror#19 eor w15,w6,w6,ror#11 add w5,w5,w12 ror w11,w11,#6 eor w14,w6,w7 eor w15,w15,w6,ror#20 add w5,w5,w11 ldr w12,[csp,#24] and w13,w13,w14 ror w15,w15,#2 add w9,w9,w5 eor w13,w13,w7 add w4,w4,w12 add w5,w5,w15 and w12,w10,w9 bic w15,w3,w9 eor w11,w9,w9,ror#5 add w5,w5,w13 orr w12,w12,w15 eor w11,w11,w9,ror#19 eor w15,w5,w5,ror#11 add w4,w4,w12 ror w11,w11,#6 eor w13,w5,w6 eor w15,w15,w5,ror#20 add w4,w4,w11 ldr w12,[csp,#28] and w14,w14,w13 ror w15,w15,#2 add w8,w8,w4 eor w14,w14,w6 add w3,w3,w12 add w4,w4,w15 and w12,w9,w8 bic w15,w10,w8 eor w11,w8,w8,ror#5 add w4,w4,w14 orr w12,w12,w15 eor w11,w11,w8,ror#19 eor w15,w4,w4,ror#11 add w3,w3,w12 ror w11,w11,#6 eor w14,w4,w5 eor w15,w15,w4,ror#20 add w3,w3,w11 ldr w12,[csp,#32] and w13,w13,w14 ror w15,w15,#2 add w7,w7,w3 eor w13,w13,w5 st1 {v4.4s},[c17], #16 add w10,w10,w12 add w3,w3,w15 and w12,w8,w7 ld1 {v2.16b},[c1],#16 bic w15,w9,w7 eor w11,w7,w7,ror#5 ld1 {v4.4s},[c16],#16 add w3,w3,w13 orr w12,w12,w15 eor w11,w11,w7,ror#19 eor w15,w3,w3,ror#11 rev32 v2.16b,v2.16b add w10,w10,w12 ror w11,w11,#6 eor w13,w3,w4 eor w15,w15,w3,ror#20 add v4.4s,v4.4s,v2.4s add w10,w10,w11 ldr w12,[csp,#36] and w14,w14,w13 ror w15,w15,#2 add w6,w6,w10 eor w14,w14,w4 add w9,w9,w12 add w10,w10,w15 and w12,w7,w6 bic w15,w8,w6 eor w11,w6,w6,ror#5 add w10,w10,w14 orr w12,w12,w15 eor w11,w11,w6,ror#19 eor w15,w10,w10,ror#11 add w9,w9,w12 ror w11,w11,#6 eor w14,w10,w3 eor w15,w15,w10,ror#20 add w9,w9,w11 ldr w12,[csp,#40] and w13,w13,w14 ror w15,w15,#2 add w5,w5,w9 eor w13,w13,w3 add w8,w8,w12 add w9,w9,w15 and w12,w6,w5 bic w15,w7,w5 eor w11,w5,w5,ror#5 add w9,w9,w13 orr w12,w12,w15 eor w11,w11,w5,ror#19 eor w15,w9,w9,ror#11 add w8,w8,w12 ror w11,w11,#6 eor w13,w9,w10 eor w15,w15,w9,ror#20 add w8,w8,w11 ldr w12,[csp,#44] and w14,w14,w13 ror w15,w15,#2 add w4,w4,w8 eor w14,w14,w10 add w7,w7,w12 add w8,w8,w15 and w12,w5,w4 bic w15,w6,w4 eor w11,w4,w4,ror#5 add w8,w8,w14 orr w12,w12,w15 eor w11,w11,w4,ror#19 eor w15,w8,w8,ror#11 add w7,w7,w12 ror w11,w11,#6 eor w14,w8,w9 eor w15,w15,w8,ror#20 add w7,w7,w11 ldr w12,[csp,#48] and w13,w13,w14 ror w15,w15,#2 add w3,w3,w7 eor w13,w13,w9 st1 {v4.4s},[c17], #16 add w6,w6,w12 add w7,w7,w15 and w12,w4,w3 ld1 {v3.16b},[c1],#16 bic w15,w5,w3 eor w11,w3,w3,ror#5 ld1 {v4.4s},[c16],#16 add w7,w7,w13 orr w12,w12,w15 eor w11,w11,w3,ror#19 eor w15,w7,w7,ror#11 rev32 v3.16b,v3.16b add w6,w6,w12 ror w11,w11,#6 eor w13,w7,w8 eor w15,w15,w7,ror#20 add 
v4.4s,v4.4s,v3.4s add w6,w6,w11 ldr w12,[csp,#52] and w14,w14,w13 ror w15,w15,#2 add w10,w10,w6 eor w14,w14,w8 add w5,w5,w12 add w6,w6,w15 and w12,w3,w10 bic w15,w4,w10 eor w11,w10,w10,ror#5 add w6,w6,w14 orr w12,w12,w15 eor w11,w11,w10,ror#19 eor w15,w6,w6,ror#11 add w5,w5,w12 ror w11,w11,#6 eor w14,w6,w7 eor w15,w15,w6,ror#20 add w5,w5,w11 ldr w12,[csp,#56] and w13,w13,w14 ror w15,w15,#2 add w9,w9,w5 eor w13,w13,w7 add w4,w4,w12 add w5,w5,w15 and w12,w10,w9 bic w15,w3,w9 eor w11,w9,w9,ror#5 add w5,w5,w13 orr w12,w12,w15 eor w11,w11,w9,ror#19 eor w15,w5,w5,ror#11 add w4,w4,w12 ror w11,w11,#6 eor w13,w5,w6 eor w15,w15,w5,ror#20 add w4,w4,w11 ldr w12,[csp,#60] and w14,w14,w13 ror w15,w15,#2 add w8,w8,w4 eor w14,w14,w6 add w3,w3,w12 add w4,w4,w15 and w12,w9,w8 bic w15,w10,w8 eor w11,w8,w8,ror#5 add w4,w4,w14 orr w12,w12,w15 eor w11,w11,w8,ror#19 eor w15,w4,w4,ror#11 add w3,w3,w12 ror w11,w11,#6 eor w14,w4,w5 eor w15,w15,w4,ror#20 add w3,w3,w11 and w13,w13,w14 ror w15,w15,#2 add w7,w7,w3 eor w13,w13,w5 st1 {v4.4s},[c17], #16 add w3,w3,w15 // h+=Sigma0(a) from the past ldp w11,w12,[c0,#0] add w3,w3,w13 // h+=Maj(a,b,c) from the past ldp w13,w14,[c0,#8] add w3,w3,w11 // accumulate add w4,w4,w12 ldp w11,w12,[c0,#16] add w5,w5,w13 add w6,w6,w14 ldp w13,w14,[c0,#24] add w7,w7,w11 add w8,w8,w12 ldr w12,[csp,#0] stp w3,w4,[c0,#0] add w9,w9,w13 mov w13,wzr stp w5,w6,[c0,#8] add w10,w10,w14 stp w7,w8,[c0,#16] eor w14,w4,w5 stp w9,w10,[c0,#24] mov w15,wzr mov c17,csp b.ne .L_00_48 ldr c29,[c29] add csp,csp,#16*4+2*__SIZEOF_POINTER__ ret .size blst_sha256_block_data_order,.-blst_sha256_block_data_order .globl blst_sha256_emit .hidden blst_sha256_emit .type blst_sha256_emit,%function .align 4 blst_sha256_emit: hint #34 ldp x4,x5,[c1] ldp x6,x7,[c1,#16] #ifndef __AARCH64EB__ rev x4,x4 rev x5,x5 rev x6,x6 rev x7,x7 #endif str w4,[c0,#4] lsr x4,x4,#32 str w5,[c0,#12] lsr x5,x5,#32 str w6,[c0,#20] lsr x6,x6,#32 str w7,[c0,#28] lsr x7,x7,#32 str w4,[c0,#0] str w5,[c0,#8] str w6,[c0,#16] str w7,[c0,#24] ret .size blst_sha256_emit,.-blst_sha256_emit .globl blst_sha256_bcopy .hidden blst_sha256_bcopy .type blst_sha256_bcopy,%function .align 4 blst_sha256_bcopy: hint #34 .Loop_bcopy: ldrb w3,[c1],#1 sub x2,x2,#1 strb w3,[c0],#1 cbnz x2,.Loop_bcopy ret .size blst_sha256_bcopy,.-blst_sha256_bcopy .globl blst_sha256_hcopy .hidden blst_sha256_hcopy .type blst_sha256_hcopy,%function .align 4 blst_sha256_hcopy: hint #34 ldp x4,x5,[c1] ldp x6,x7,[c1,#16] stp x4,x5,[c0] stp x6,x7,[c0,#16] ret .size blst_sha256_hcopy,.-blst_sha256_hcopy #if defined(__ARM_FEATURE_BTI_DEFAULT) || defined(__ARM_FEATURE_PAC_DEFAULT) .section .note.GNU-stack,"",@progbits .section .note.gnu.property,"a",@note .long 4,2f-1f,5 .byte 0x47,0x4E,0x55,0 1: .long 0xc0000000,4,3 .align 3 2: #endif ================================================ FILE: build/coff/add_mod_256-armv8.S ================================================ .text .globl add_mod_256 .def add_mod_256; .type 32; .endef .p2align 5 add_mod_256: hint #34 ldp x8,x9,[x1] ldp x12,x13,[x2] ldp x10,x11,[x1,#16] adds x8,x8,x12 ldp x14,x15,[x2,#16] adcs x9,x9,x13 ldp x4,x5,[x3] adcs x10,x10,x14 ldp x6,x7,[x3,#16] adcs x11,x11,x15 adc x3,xzr,xzr subs x16,x8,x4 sbcs x17,x9,x5 sbcs x1,x10,x6 sbcs x2,x11,x7 sbcs xzr,x3,xzr csel x8,x8,x16,lo csel x9,x9,x17,lo csel x10,x10,x1,lo stp x8,x9,[x0] csel x11,x11,x2,lo stp x10,x11,[x0,#16] ret .globl mul_by_3_mod_256 .def mul_by_3_mod_256; .type 32; .endef .p2align 5 mul_by_3_mod_256: hint #34 ldp x12,x13,[x1] ldp x14,x15,[x1,#16] adds x8,x12,x12 ldp 
x4,x5,[x2] adcs x9,x13,x13 ldp x6,x7,[x2,#16] adcs x10,x14,x14 adcs x11,x15,x15 adc x3,xzr,xzr subs x16,x8,x4 sbcs x17,x9,x5 sbcs x1,x10,x6 sbcs x2,x11,x7 sbcs xzr,x3,xzr csel x8,x8,x16,lo csel x9,x9,x17,lo csel x10,x10,x1,lo csel x11,x11,x2,lo adds x8,x8,x12 adcs x9,x9,x13 adcs x10,x10,x14 adcs x11,x11,x15 adc x3,xzr,xzr subs x16,x8,x4 sbcs x17,x9,x5 sbcs x1,x10,x6 sbcs x2,x11,x7 sbcs xzr,x3,xzr csel x8,x8,x16,lo csel x9,x9,x17,lo csel x10,x10,x1,lo stp x8,x9,[x0] csel x11,x11,x2,lo stp x10,x11,[x0,#16] ret .globl lshift_mod_256 .def lshift_mod_256; .type 32; .endef .p2align 5 lshift_mod_256: hint #34 ldp x8,x9,[x1] ldp x10,x11,[x1,#16] ldp x4,x5,[x3] ldp x6,x7,[x3,#16] .Loop_lshift_mod_256: adds x8,x8,x8 sub x2,x2,#1 adcs x9,x9,x9 adcs x10,x10,x10 adcs x11,x11,x11 adc x3,xzr,xzr subs x12,x8,x4 sbcs x13,x9,x5 sbcs x14,x10,x6 sbcs x15,x11,x7 sbcs xzr,x3,xzr csel x8,x8,x12,lo csel x9,x9,x13,lo csel x10,x10,x14,lo csel x11,x11,x15,lo cbnz x2,.Loop_lshift_mod_256 stp x8,x9,[x0] stp x10,x11,[x0,#16] ret .globl rshift_mod_256 .def rshift_mod_256; .type 32; .endef .p2align 5 rshift_mod_256: hint #34 ldp x8,x9,[x1] ldp x10,x11,[x1,#16] ldp x4,x5,[x3] ldp x6,x7,[x3,#16] .Loop_rshift: adds x12,x8,x4 sub x2,x2,#1 adcs x13,x9,x5 adcs x14,x10,x6 adcs x15,x11,x7 adc x3,xzr,xzr tst x8,#1 csel x12,x12,x8,ne csel x13,x13,x9,ne csel x14,x14,x10,ne csel x15,x15,x11,ne csel x3,x3,xzr,ne extr x8,x13,x12,#1 extr x9,x14,x13,#1 extr x10,x15,x14,#1 extr x11,x3,x15,#1 cbnz x2,.Loop_rshift stp x8,x9,[x0] stp x10,x11,[x0,#16] ret .globl cneg_mod_256 .def cneg_mod_256; .type 32; .endef .p2align 5 cneg_mod_256: ldp x8,x9,[x1] ldp x4,x5,[x3] ldp x10,x11,[x1,#16] subs x12,x4,x8 ldp x6,x7,[x3,#16] orr x4,x8,x9 sbcs x13,x5,x9 orr x5,x10,x11 sbcs x14,x6,x10 orr x3,x4,x5 sbc x15,x7,x11 cmp x3,#0 csetm x3,ne ands x2,x2,x3 csel x8,x8,x12,eq csel x9,x9,x13,eq csel x10,x10,x14,eq stp x8,x9,[x0] csel x11,x11,x15,eq stp x10,x11,[x0,#16] ret .globl sub_mod_256 .def sub_mod_256; .type 32; .endef .p2align 5 sub_mod_256: ldp x8,x9,[x1] ldp x12,x13,[x2] ldp x10,x11,[x1,#16] subs x8,x8,x12 ldp x14,x15,[x2,#16] sbcs x9,x9,x13 ldp x4,x5,[x3] sbcs x10,x10,x14 ldp x6,x7,[x3,#16] sbcs x11,x11,x15 sbc x3,xzr,xzr and x4,x4,x3 and x5,x5,x3 adds x8,x8,x4 and x6,x6,x3 adcs x9,x9,x5 and x7,x7,x3 adcs x10,x10,x6 stp x8,x9,[x0] adc x11,x11,x7 stp x10,x11,[x0,#16] ret .globl check_mod_256 .def check_mod_256; .type 32; .endef .p2align 5 check_mod_256: ldp x8,x9,[x0] ldp x10,x11,[x0,#16] ldp x4,x5,[x1] ldp x6,x7,[x1,#16] #ifdef __AARCH64EB__ rev x8,x8 rev x9,x9 rev x10,x10 rev x11,x11 #endif subs xzr,x8,x4 sbcs xzr,x9,x5 orr x8,x8,x9 sbcs xzr,x10,x6 orr x8,x8,x10 sbcs xzr,x11,x7 orr x8,x8,x11 sbc x1,xzr,xzr cmp x8,#0 mov x0,#1 csel x0,x0,xzr,ne and x0,x0,x1 ret .globl add_n_check_mod_256 .def add_n_check_mod_256; .type 32; .endef .p2align 5 add_n_check_mod_256: ldp x8,x9,[x1] ldp x12,x13,[x2] ldp x10,x11,[x1,#16] ldp x14,x15,[x2,#16] #ifdef __AARCH64EB__ rev x8,x8 rev x12,x12 rev x9,x9 rev x13,x13 rev x10,x10 rev x14,x14 rev x11,x11 rev x15,x15 #endif adds x8,x8,x12 ldp x4,x5,[x3] adcs x9,x9,x13 ldp x6,x7,[x3,#16] adcs x10,x10,x14 adcs x11,x11,x15 adc x3,xzr,xzr subs x16,x8,x4 sbcs x17,x9,x5 sbcs x1,x10,x6 sbcs x2,x11,x7 sbcs xzr,x3,xzr csel x8,x8,x16,lo csel x9,x9,x17,lo csel x10,x10,x1,lo csel x11,x11,x2,lo orr x16, x8, x9 orr x17, x10, x11 orr x16, x16, x17 #ifdef __AARCH64EB__ rev x8,x8 rev x9,x9 rev x10,x10 rev x11,x11 #endif stp x8,x9,[x0] stp x10,x11,[x0,#16] mov x17, #1 cmp x16, #0 csel x0, x17, xzr, ne ret .globl sub_n_check_mod_256 .def 
sub_n_check_mod_256; .type 32; .endef .p2align 5 sub_n_check_mod_256: ldp x8,x9,[x1] ldp x12,x13,[x2] ldp x10,x11,[x1,#16] ldp x14,x15,[x2,#16] #ifdef __AARCH64EB__ rev x8,x8 rev x12,x12 rev x9,x9 rev x13,x13 rev x10,x10 rev x14,x14 rev x11,x11 rev x15,x15 #endif subs x8,x8,x12 sbcs x9,x9,x13 ldp x4,x5,[x3] sbcs x10,x10,x14 ldp x6,x7,[x3,#16] sbcs x11,x11,x15 sbc x3,xzr,xzr and x4,x4,x3 and x5,x5,x3 adds x8,x8,x4 and x6,x6,x3 adcs x9,x9,x5 and x7,x7,x3 adcs x10,x10,x6 adc x11,x11,x7 orr x16, x8, x9 orr x17, x10, x11 orr x16, x16, x17 #ifdef __AARCH64EB__ rev x8,x8 rev x9,x9 rev x10,x10 rev x11,x11 #endif stp x8,x9,[x0] stp x10,x11,[x0,#16] mov x17, #1 cmp x16, #0 csel x0, x17, xzr, ne ret ================================================ FILE: build/coff/add_mod_256-x86_64.s ================================================ .text .globl add_mod_256 .def add_mod_256; .scl 2; .type 32; .endef .p2align 5 add_mod_256: .byte 0xf3,0x0f,0x1e,0xfa movq %rdi,8(%rsp) movq %rsi,16(%rsp) movq %rsp,%r11 .LSEH_begin_add_mod_256: pushq %rbp movq %rcx,%rdi movq %rdx,%rsi movq %r8,%rdx movq %r9,%rcx pushq %rbx subq $8,%rsp .LSEH_body_add_mod_256: #ifdef __SGX_LVI_HARDENING__ lfence #endif movq 0(%rsi),%r8 movq 8(%rsi),%r9 movq 16(%rsi),%r10 movq 24(%rsi),%r11 .Loaded_a_add_mod_256: addq 0(%rdx),%r8 adcq 8(%rdx),%r9 movq %r8,%rax adcq 16(%rdx),%r10 movq %r9,%rsi adcq 24(%rdx),%r11 sbbq %rdx,%rdx movq %r10,%rbx subq 0(%rcx),%r8 sbbq 8(%rcx),%r9 sbbq 16(%rcx),%r10 movq %r11,%rbp sbbq 24(%rcx),%r11 sbbq $0,%rdx cmovcq %rax,%r8 cmovcq %rsi,%r9 movq %r8,0(%rdi) cmovcq %rbx,%r10 movq %r9,8(%rdi) cmovcq %rbp,%r11 movq %r10,16(%rdi) movq %r11,24(%rdi) movq 8(%rsp),%rbx movq 16(%rsp),%rbp leaq 24(%rsp),%rsp .LSEH_epilogue_add_mod_256: mov 8(%rsp),%rdi mov 16(%rsp),%rsi #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .LSEH_end_add_mod_256: .globl mul_by_3_mod_256 .def mul_by_3_mod_256; .scl 2; .type 32; .endef .p2align 5 mul_by_3_mod_256: .byte 0xf3,0x0f,0x1e,0xfa movq %rdi,8(%rsp) movq %rsi,16(%rsp) movq %rsp,%r11 .LSEH_begin_mul_by_3_mod_256: pushq %rbp movq %rcx,%rdi movq %rdx,%rsi movq %r8,%rdx pushq %rbx pushq %r12 .LSEH_body_mul_by_3_mod_256: movq %rdx,%rcx #ifdef __SGX_LVI_HARDENING__ lfence #endif movq 0(%rsi),%r8 movq 8(%rsi),%r9 movq 16(%rsi),%r10 movq %rsi,%rdx movq 24(%rsi),%r11 call __lshift_mod_256 movq 0(%rsp),%r12 jmp .Loaded_a_add_mod_256 movq 8(%rsp),%rbx movq 16(%rsp),%rbp leaq 24(%rsp),%rsp .LSEH_epilogue_mul_by_3_mod_256: mov 8(%rsp),%rdi mov 16(%rsp),%rsi #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .LSEH_end_mul_by_3_mod_256: .def __lshift_mod_256; .scl 3; .type 32; .endef .p2align 5 __lshift_mod_256: .byte 0xf3,0x0f,0x1e,0xfa addq %r8,%r8 adcq %r9,%r9 movq %r8,%rax adcq %r10,%r10 movq %r9,%rsi adcq %r11,%r11 sbbq %r12,%r12 movq %r10,%rbx subq 0(%rcx),%r8 sbbq 8(%rcx),%r9 sbbq 16(%rcx),%r10 movq %r11,%rbp sbbq 24(%rcx),%r11 sbbq $0,%r12 cmovcq %rax,%r8 cmovcq %rsi,%r9 cmovcq %rbx,%r10 cmovcq %rbp,%r11 #ifdef __SGX_LVI_HARDENING__ popq %rax lfence jmpq *%rax ud2 #else .byte 0xf3,0xc3 #endif .globl lshift_mod_256 .def lshift_mod_256; .scl 2; .type 32; .endef .p2align 5 lshift_mod_256: .byte 0xf3,0x0f,0x1e,0xfa movq %rdi,8(%rsp) movq %rsi,16(%rsp) movq %rsp,%r11 .LSEH_begin_lshift_mod_256: pushq %rbp movq %rcx,%rdi movq %rdx,%rsi movq %r8,%rdx movq %r9,%rcx pushq %rbx pushq %r12 .LSEH_body_lshift_mod_256: #ifdef __SGX_LVI_HARDENING__ lfence #endif movq 0(%rsi),%r8 movq 8(%rsi),%r9 movq 16(%rsi),%r10 movq 
24(%rsi),%r11 .Loop_lshift_mod_256: call __lshift_mod_256 decl %edx jnz .Loop_lshift_mod_256 movq %r8,0(%rdi) movq %r9,8(%rdi) movq %r10,16(%rdi) movq %r11,24(%rdi) movq 0(%rsp),%r12 movq 8(%rsp),%rbx movq 16(%rsp),%rbp leaq 24(%rsp),%rsp .LSEH_epilogue_lshift_mod_256: mov 8(%rsp),%rdi mov 16(%rsp),%rsi #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .LSEH_end_lshift_mod_256: .globl rshift_mod_256 .def rshift_mod_256; .scl 2; .type 32; .endef .p2align 5 rshift_mod_256: .byte 0xf3,0x0f,0x1e,0xfa movq %rdi,8(%rsp) movq %rsi,16(%rsp) movq %rsp,%r11 .LSEH_begin_rshift_mod_256: pushq %rbp movq %rcx,%rdi movq %rdx,%rsi movq %r8,%rdx movq %r9,%rcx pushq %rbx subq $8,%rsp .LSEH_body_rshift_mod_256: #ifdef __SGX_LVI_HARDENING__ lfence #endif movq 0(%rsi),%rbp movq 8(%rsi),%r9 movq 16(%rsi),%r10 movq 24(%rsi),%r11 .Loop_rshift_mod_256: movq %rbp,%r8 andq $1,%rbp movq 0(%rcx),%rax negq %rbp movq 8(%rcx),%rsi movq 16(%rcx),%rbx andq %rbp,%rax andq %rbp,%rsi andq %rbp,%rbx andq 24(%rcx),%rbp addq %rax,%r8 adcq %rsi,%r9 adcq %rbx,%r10 adcq %rbp,%r11 sbbq %rax,%rax shrq $1,%r8 movq %r9,%rbp shrq $1,%r9 movq %r10,%rbx shrq $1,%r10 movq %r11,%rsi shrq $1,%r11 shlq $63,%rbp shlq $63,%rbx orq %r8,%rbp shlq $63,%rsi orq %rbx,%r9 shlq $63,%rax orq %rsi,%r10 orq %rax,%r11 decl %edx jnz .Loop_rshift_mod_256 movq %rbp,0(%rdi) movq %r9,8(%rdi) movq %r10,16(%rdi) movq %r11,24(%rdi) movq 8(%rsp),%rbx movq 16(%rsp),%rbp leaq 24(%rsp),%rsp .LSEH_epilogue_rshift_mod_256: mov 8(%rsp),%rdi mov 16(%rsp),%rsi #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .LSEH_end_rshift_mod_256: .globl cneg_mod_256 .def cneg_mod_256; .scl 2; .type 32; .endef .p2align 5 cneg_mod_256: .byte 0xf3,0x0f,0x1e,0xfa movq %rdi,8(%rsp) movq %rsi,16(%rsp) movq %rsp,%r11 .LSEH_begin_cneg_mod_256: pushq %rbp movq %rcx,%rdi movq %rdx,%rsi movq %r8,%rdx movq %r9,%rcx pushq %rbx pushq %r12 .LSEH_body_cneg_mod_256: #ifdef __SGX_LVI_HARDENING__ lfence #endif movq 0(%rsi),%r12 movq 8(%rsi),%r9 movq 16(%rsi),%r10 movq %r12,%r8 movq 24(%rsi),%r11 orq %r9,%r12 orq %r10,%r12 orq %r11,%r12 movq $-1,%rbp movq 0(%rcx),%rax cmovnzq %rbp,%r12 movq 8(%rcx),%rsi movq 16(%rcx),%rbx andq %r12,%rax movq 24(%rcx),%rbp andq %r12,%rsi andq %r12,%rbx andq %r12,%rbp subq %r8,%rax sbbq %r9,%rsi sbbq %r10,%rbx sbbq %r11,%rbp orq %rdx,%rdx cmovzq %r8,%rax cmovzq %r9,%rsi movq %rax,0(%rdi) cmovzq %r10,%rbx movq %rsi,8(%rdi) cmovzq %r11,%rbp movq %rbx,16(%rdi) movq %rbp,24(%rdi) movq 0(%rsp),%r12 movq 8(%rsp),%rbx movq 16(%rsp),%rbp leaq 24(%rsp),%rsp .LSEH_epilogue_cneg_mod_256: mov 8(%rsp),%rdi mov 16(%rsp),%rsi #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .LSEH_end_cneg_mod_256: .globl sub_mod_256 .def sub_mod_256; .scl 2; .type 32; .endef .p2align 5 sub_mod_256: .byte 0xf3,0x0f,0x1e,0xfa movq %rdi,8(%rsp) movq %rsi,16(%rsp) movq %rsp,%r11 .LSEH_begin_sub_mod_256: pushq %rbp movq %rcx,%rdi movq %rdx,%rsi movq %r8,%rdx movq %r9,%rcx pushq %rbx subq $8,%rsp .LSEH_body_sub_mod_256: #ifdef __SGX_LVI_HARDENING__ lfence #endif movq 0(%rsi),%r8 movq 8(%rsi),%r9 movq 16(%rsi),%r10 movq 24(%rsi),%r11 subq 0(%rdx),%r8 movq 0(%rcx),%rax sbbq 8(%rdx),%r9 movq 8(%rcx),%rsi sbbq 16(%rdx),%r10 movq 16(%rcx),%rbx sbbq 24(%rdx),%r11 movq 24(%rcx),%rbp sbbq %rdx,%rdx andq %rdx,%rax andq %rdx,%rsi andq %rdx,%rbx andq %rdx,%rbp addq %rax,%r8 adcq %rsi,%r9 movq %r8,0(%rdi) adcq %rbx,%r10 movq %r9,8(%rdi) adcq %rbp,%r11 movq %r10,16(%rdi) movq %r11,24(%rdi) movq 
8(%rsp),%rbx movq 16(%rsp),%rbp leaq 24(%rsp),%rsp .LSEH_epilogue_sub_mod_256: mov 8(%rsp),%rdi mov 16(%rsp),%rsi #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .LSEH_end_sub_mod_256: .globl check_mod_256 .def check_mod_256; .scl 2; .type 32; .endef .p2align 5 check_mod_256: .byte 0xf3,0x0f,0x1e,0xfa movq %rdi,8(%rsp) movq %rsi,16(%rsp) movq %rsp,%r11 .LSEH_begin_check_mod_256: movq %rcx,%rdi movq %rdx,%rsi #ifdef __SGX_LVI_HARDENING__ lfence #endif movq 0(%rdi),%rax movq 8(%rdi),%r9 movq 16(%rdi),%r10 movq 24(%rdi),%r11 movq %rax,%r8 orq %r9,%rax orq %r10,%rax orq %r11,%rax subq 0(%rsi),%r8 sbbq 8(%rsi),%r9 sbbq 16(%rsi),%r10 sbbq 24(%rsi),%r11 sbbq %rsi,%rsi movq $1,%rdx cmpq $0,%rax cmovneq %rdx,%rax andq %rsi,%rax .LSEH_epilogue_check_mod_256: mov 8(%rsp),%rdi mov 16(%rsp),%rsi #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .LSEH_end_check_mod_256: .globl add_n_check_mod_256 .def add_n_check_mod_256; .scl 2; .type 32; .endef .p2align 5 add_n_check_mod_256: .byte 0xf3,0x0f,0x1e,0xfa movq %rdi,8(%rsp) movq %rsi,16(%rsp) movq %rsp,%r11 .LSEH_begin_add_n_check_mod_256: pushq %rbp movq %rcx,%rdi movq %rdx,%rsi movq %r8,%rdx movq %r9,%rcx pushq %rbx subq $8,%rsp .LSEH_body_add_n_check_mod_256: #ifdef __SGX_LVI_HARDENING__ lfence #endif movq 0(%rsi),%r8 movq 8(%rsi),%r9 movq 16(%rsi),%r10 movq 24(%rsi),%r11 addq 0(%rdx),%r8 adcq 8(%rdx),%r9 movq %r8,%rax adcq 16(%rdx),%r10 movq %r9,%rsi adcq 24(%rdx),%r11 sbbq %rdx,%rdx movq %r10,%rbx subq 0(%rcx),%r8 sbbq 8(%rcx),%r9 sbbq 16(%rcx),%r10 movq %r11,%rbp sbbq 24(%rcx),%r11 sbbq $0,%rdx cmovcq %rax,%r8 cmovcq %rsi,%r9 movq %r8,0(%rdi) cmovcq %rbx,%r10 movq %r9,8(%rdi) cmovcq %rbp,%r11 movq %r10,16(%rdi) movq %r11,24(%rdi) orq %r9,%r8 orq %r11,%r10 orq %r10,%r8 movq $1,%rax cmovzq %r8,%rax movq 8(%rsp),%rbx movq 16(%rsp),%rbp leaq 24(%rsp),%rsp .LSEH_epilogue_add_n_check_mod_256: mov 8(%rsp),%rdi mov 16(%rsp),%rsi #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .LSEH_end_add_n_check_mod_256: .globl sub_n_check_mod_256 .def sub_n_check_mod_256; .scl 2; .type 32; .endef .p2align 5 sub_n_check_mod_256: .byte 0xf3,0x0f,0x1e,0xfa movq %rdi,8(%rsp) movq %rsi,16(%rsp) movq %rsp,%r11 .LSEH_begin_sub_n_check_mod_256: pushq %rbp movq %rcx,%rdi movq %rdx,%rsi movq %r8,%rdx movq %r9,%rcx pushq %rbx subq $8,%rsp .LSEH_body_sub_n_check_mod_256: #ifdef __SGX_LVI_HARDENING__ lfence #endif movq 0(%rsi),%r8 movq 8(%rsi),%r9 movq 16(%rsi),%r10 movq 24(%rsi),%r11 subq 0(%rdx),%r8 movq 0(%rcx),%rax sbbq 8(%rdx),%r9 movq 8(%rcx),%rsi sbbq 16(%rdx),%r10 movq 16(%rcx),%rbx sbbq 24(%rdx),%r11 movq 24(%rcx),%rbp sbbq %rdx,%rdx andq %rdx,%rax andq %rdx,%rsi andq %rdx,%rbx andq %rdx,%rbp addq %rax,%r8 adcq %rsi,%r9 movq %r8,0(%rdi) adcq %rbx,%r10 movq %r9,8(%rdi) adcq %rbp,%r11 movq %r10,16(%rdi) movq %r11,24(%rdi) orq %r9,%r8 orq %r11,%r10 orq %r10,%r8 movq $1,%rax cmovzq %r8,%rax movq 8(%rsp),%rbx movq 16(%rsp),%rbp leaq 24(%rsp),%rsp .LSEH_epilogue_sub_n_check_mod_256: mov 8(%rsp),%rdi mov 16(%rsp),%rsi #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .LSEH_end_sub_n_check_mod_256: .section .pdata .p2align 2 .rva .LSEH_begin_add_mod_256 .rva .LSEH_body_add_mod_256 .rva .LSEH_info_add_mod_256_prologue .rva .LSEH_body_add_mod_256 .rva .LSEH_epilogue_add_mod_256 .rva .LSEH_info_add_mod_256_body .rva .LSEH_epilogue_add_mod_256 .rva .LSEH_end_add_mod_256 .rva .LSEH_info_add_mod_256_epilogue .rva 
.LSEH_begin_mul_by_3_mod_256 .rva .LSEH_body_mul_by_3_mod_256 .rva .LSEH_info_mul_by_3_mod_256_prologue .rva .LSEH_body_mul_by_3_mod_256 .rva .LSEH_epilogue_mul_by_3_mod_256 .rva .LSEH_info_mul_by_3_mod_256_body .rva .LSEH_epilogue_mul_by_3_mod_256 .rva .LSEH_end_mul_by_3_mod_256 .rva .LSEH_info_mul_by_3_mod_256_epilogue .rva .LSEH_begin_lshift_mod_256 .rva .LSEH_body_lshift_mod_256 .rva .LSEH_info_lshift_mod_256_prologue .rva .LSEH_body_lshift_mod_256 .rva .LSEH_epilogue_lshift_mod_256 .rva .LSEH_info_lshift_mod_256_body .rva .LSEH_epilogue_lshift_mod_256 .rva .LSEH_end_lshift_mod_256 .rva .LSEH_info_lshift_mod_256_epilogue .rva .LSEH_begin_rshift_mod_256 .rva .LSEH_body_rshift_mod_256 .rva .LSEH_info_rshift_mod_256_prologue .rva .LSEH_body_rshift_mod_256 .rva .LSEH_epilogue_rshift_mod_256 .rva .LSEH_info_rshift_mod_256_body .rva .LSEH_epilogue_rshift_mod_256 .rva .LSEH_end_rshift_mod_256 .rva .LSEH_info_rshift_mod_256_epilogue .rva .LSEH_begin_cneg_mod_256 .rva .LSEH_body_cneg_mod_256 .rva .LSEH_info_cneg_mod_256_prologue .rva .LSEH_body_cneg_mod_256 .rva .LSEH_epilogue_cneg_mod_256 .rva .LSEH_info_cneg_mod_256_body .rva .LSEH_epilogue_cneg_mod_256 .rva .LSEH_end_cneg_mod_256 .rva .LSEH_info_cneg_mod_256_epilogue .rva .LSEH_begin_sub_mod_256 .rva .LSEH_body_sub_mod_256 .rva .LSEH_info_sub_mod_256_prologue .rva .LSEH_body_sub_mod_256 .rva .LSEH_epilogue_sub_mod_256 .rva .LSEH_info_sub_mod_256_body .rva .LSEH_epilogue_sub_mod_256 .rva .LSEH_end_sub_mod_256 .rva .LSEH_info_sub_mod_256_epilogue .rva .LSEH_epilogue_check_mod_256 .rva .LSEH_end_check_mod_256 .rva .LSEH_info_check_mod_256_epilogue .rva .LSEH_begin_add_n_check_mod_256 .rva .LSEH_body_add_n_check_mod_256 .rva .LSEH_info_add_n_check_mod_256_prologue .rva .LSEH_body_add_n_check_mod_256 .rva .LSEH_epilogue_add_n_check_mod_256 .rva .LSEH_info_add_n_check_mod_256_body .rva .LSEH_epilogue_add_n_check_mod_256 .rva .LSEH_end_add_n_check_mod_256 .rva .LSEH_info_add_n_check_mod_256_epilogue .rva .LSEH_begin_sub_n_check_mod_256 .rva .LSEH_body_sub_n_check_mod_256 .rva .LSEH_info_sub_n_check_mod_256_prologue .rva .LSEH_body_sub_n_check_mod_256 .rva .LSEH_epilogue_sub_n_check_mod_256 .rva .LSEH_info_sub_n_check_mod_256_body .rva .LSEH_epilogue_sub_n_check_mod_256 .rva .LSEH_end_sub_n_check_mod_256 .rva .LSEH_info_sub_n_check_mod_256_epilogue .section .xdata .p2align 3 .LSEH_info_add_mod_256_prologue: .byte 1,0,5,0x0b .byte 0,0x74,1,0 .byte 0,0x64,2,0 .byte 0,0xb3 .byte 0,0 .long 0,0 .LSEH_info_add_mod_256_body: .byte 1,0,9,0 .byte 0x00,0x34,0x01,0x00 .byte 0x00,0x54,0x02,0x00 .byte 0x00,0x74,0x04,0x00 .byte 0x00,0x64,0x05,0x00 .byte 0x00,0x22 .byte 0x00,0x00,0x00,0x00,0x00,0x00 .byte 0x00,0x00,0x00,0x00 .LSEH_info_add_mod_256_epilogue: .byte 1,0,4,0 .byte 0x00,0x74,0x01,0x00 .byte 0x00,0x64,0x02,0x00 .byte 0x00,0x00,0x00,0x00 .LSEH_info_mul_by_3_mod_256_prologue: .byte 1,0,5,0x0b .byte 0,0x74,1,0 .byte 0,0x64,2,0 .byte 0,0xb3 .byte 0,0 .long 0,0 .LSEH_info_mul_by_3_mod_256_body: .byte 1,0,11,0 .byte 0x00,0xc4,0x00,0x00 .byte 0x00,0x34,0x01,0x00 .byte 0x00,0x54,0x02,0x00 .byte 0x00,0x74,0x04,0x00 .byte 0x00,0x64,0x05,0x00 .byte 0x00,0x22 .byte 0x00,0x00,0x00,0x00,0x00,0x00 .LSEH_info_mul_by_3_mod_256_epilogue: .byte 1,0,4,0 .byte 0x00,0x74,0x01,0x00 .byte 0x00,0x64,0x02,0x00 .byte 0x00,0x00,0x00,0x00 .LSEH_info_lshift_mod_256_prologue: .byte 1,0,5,0x0b .byte 0,0x74,1,0 .byte 0,0x64,2,0 .byte 0,0xb3 .byte 0,0 .long 0,0 .LSEH_info_lshift_mod_256_body: .byte 1,0,11,0 .byte 0x00,0xc4,0x00,0x00 .byte 0x00,0x34,0x01,0x00 .byte 0x00,0x54,0x02,0x00 
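# Win64 SEH unwind descriptors: each .xdata record above and below encodes
# the prologue save slots and stack adjustment that the .pdata triples
# earlier in this file attach to the corresponding function body.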
.byte 0x00,0x74,0x04,0x00 .byte 0x00,0x64,0x05,0x00 .byte 0x00,0x22 .byte 0x00,0x00,0x00,0x00,0x00,0x00 .LSEH_info_lshift_mod_256_epilogue: .byte 1,0,4,0 .byte 0x00,0x74,0x01,0x00 .byte 0x00,0x64,0x02,0x00 .byte 0x00,0x00,0x00,0x00 .LSEH_info_rshift_mod_256_prologue: .byte 1,0,5,0x0b .byte 0,0x74,1,0 .byte 0,0x64,2,0 .byte 0,0xb3 .byte 0,0 .long 0,0 .LSEH_info_rshift_mod_256_body: .byte 1,0,9,0 .byte 0x00,0x34,0x01,0x00 .byte 0x00,0x54,0x02,0x00 .byte 0x00,0x74,0x04,0x00 .byte 0x00,0x64,0x05,0x00 .byte 0x00,0x22 .byte 0x00,0x00,0x00,0x00,0x00,0x00 .byte 0x00,0x00,0x00,0x00 .LSEH_info_rshift_mod_256_epilogue: .byte 1,0,4,0 .byte 0x00,0x74,0x01,0x00 .byte 0x00,0x64,0x02,0x00 .byte 0x00,0x00,0x00,0x00 .LSEH_info_cneg_mod_256_prologue: .byte 1,0,5,0x0b .byte 0,0x74,1,0 .byte 0,0x64,2,0 .byte 0,0xb3 .byte 0,0 .long 0,0 .LSEH_info_cneg_mod_256_body: .byte 1,0,11,0 .byte 0x00,0xc4,0x00,0x00 .byte 0x00,0x34,0x01,0x00 .byte 0x00,0x54,0x02,0x00 .byte 0x00,0x74,0x04,0x00 .byte 0x00,0x64,0x05,0x00 .byte 0x00,0x22 .byte 0x00,0x00,0x00,0x00,0x00,0x00 .LSEH_info_cneg_mod_256_epilogue: .byte 1,0,4,0 .byte 0x00,0x74,0x01,0x00 .byte 0x00,0x64,0x02,0x00 .byte 0x00,0x00,0x00,0x00 .LSEH_info_sub_mod_256_prologue: .byte 1,0,5,0x0b .byte 0,0x74,1,0 .byte 0,0x64,2,0 .byte 0,0xb3 .byte 0,0 .long 0,0 .LSEH_info_sub_mod_256_body: .byte 1,0,9,0 .byte 0x00,0x34,0x01,0x00 .byte 0x00,0x54,0x02,0x00 .byte 0x00,0x74,0x04,0x00 .byte 0x00,0x64,0x05,0x00 .byte 0x00,0x22 .byte 0x00,0x00,0x00,0x00,0x00,0x00 .byte 0x00,0x00,0x00,0x00 .LSEH_info_sub_mod_256_epilogue: .byte 1,0,4,0 .byte 0x00,0x74,0x01,0x00 .byte 0x00,0x64,0x02,0x00 .byte 0x00,0x00,0x00,0x00 .LSEH_info_check_mod_256_epilogue: .byte 1,0,4,0 .byte 0x00,0x74,0x01,0x00 .byte 0x00,0x64,0x02,0x00 .byte 0x00,0x00,0x00,0x00 .LSEH_info_add_n_check_mod_256_prologue: .byte 1,0,5,0x0b .byte 0,0x74,1,0 .byte 0,0x64,2,0 .byte 0,0xb3 .byte 0,0 .long 0,0 .LSEH_info_add_n_check_mod_256_body: .byte 1,0,9,0 .byte 0x00,0x34,0x01,0x00 .byte 0x00,0x54,0x02,0x00 .byte 0x00,0x74,0x04,0x00 .byte 0x00,0x64,0x05,0x00 .byte 0x00,0x22 .byte 0x00,0x00,0x00,0x00,0x00,0x00 .byte 0x00,0x00,0x00,0x00 .LSEH_info_add_n_check_mod_256_epilogue: .byte 1,0,4,0 .byte 0x00,0x74,0x01,0x00 .byte 0x00,0x64,0x02,0x00 .byte 0x00,0x00,0x00,0x00 .LSEH_info_sub_n_check_mod_256_prologue: .byte 1,0,5,0x0b .byte 0,0x74,1,0 .byte 0,0x64,2,0 .byte 0,0xb3 .byte 0,0 .long 0,0 .LSEH_info_sub_n_check_mod_256_body: .byte 1,0,9,0 .byte 0x00,0x34,0x01,0x00 .byte 0x00,0x54,0x02,0x00 .byte 0x00,0x74,0x04,0x00 .byte 0x00,0x64,0x05,0x00 .byte 0x00,0x22 .byte 0x00,0x00,0x00,0x00,0x00,0x00 .byte 0x00,0x00,0x00,0x00 .LSEH_info_sub_n_check_mod_256_epilogue: .byte 1,0,4,0 .byte 0x00,0x74,0x01,0x00 .byte 0x00,0x64,0x02,0x00 .byte 0x00,0x00,0x00,0x00 ================================================ FILE: build/coff/add_mod_384-armv8.S ================================================ .text .globl add_mod_384 .def add_mod_384; .type 32; .endef .p2align 5 add_mod_384: hint #25 stp x29,x30,[sp,#-6*__SIZEOF_POINTER__]! 
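// add_mod_384: a + b mod p. __add_mod_384 accumulates with adds/adcs and
// keeps the final carry in x3, then subtracts the modulus speculatively;
// the csel ...,lo instructions retain the unreduced sum only when the
// trial subtraction borrows, a branch-free, constant-time reduction.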
add x29,sp,#0 stp x19,x20,[sp,#2*__SIZEOF_POINTER__] stp x21,x22,[sp,#4*__SIZEOF_POINTER__] ldp x4,x5,[x3] ldp x6,x7,[x3,#16] ldp x8,x9,[x3,#32] bl __add_mod_384 ldr x30,[sp,#__SIZEOF_POINTER__] stp x10,x11,[x0] stp x12,x13,[x0,#16] stp x14,x15,[x0,#32] ldp x19,x20,[x29,#2*__SIZEOF_POINTER__] ldp x21,x22,[x29,#4*__SIZEOF_POINTER__] ldr x29,[sp],#6*__SIZEOF_POINTER__ hint #29 ret .def __add_mod_384; .type 32; .endef .p2align 5 __add_mod_384: ldp x10,x11,[x1] ldp x16,x17,[x2] ldp x12,x13,[x1,#16] ldp x19,x20,[x2,#16] ldp x14,x15,[x1,#32] ldp x21,x22,[x2,#32] __add_mod_384_ab_are_loaded: adds x10,x10,x16 adcs x11,x11,x17 adcs x12,x12,x19 adcs x13,x13,x20 adcs x14,x14,x21 adcs x15,x15,x22 adc x3,xzr,xzr subs x16,x10,x4 sbcs x17,x11,x5 sbcs x19,x12,x6 sbcs x20,x13,x7 sbcs x21,x14,x8 sbcs x22,x15,x9 sbcs xzr,x3,xzr csel x10,x10,x16,lo csel x11,x11,x17,lo csel x12,x12,x19,lo csel x13,x13,x20,lo csel x14,x14,x21,lo csel x15,x15,x22,lo ret .globl add_mod_384x .def add_mod_384x; .type 32; .endef .p2align 5 add_mod_384x: hint #25 stp x29,x30,[sp,#-6*__SIZEOF_POINTER__]! add x29,sp,#0 stp x19,x20,[sp,#2*__SIZEOF_POINTER__] stp x21,x22,[sp,#4*__SIZEOF_POINTER__] ldp x4,x5,[x3] ldp x6,x7,[x3,#16] ldp x8,x9,[x3,#32] bl __add_mod_384 stp x10,x11,[x0] add x1,x1,#48 stp x12,x13,[x0,#16] add x2,x2,#48 stp x14,x15,[x0,#32] bl __add_mod_384 ldr x30,[sp,#__SIZEOF_POINTER__] stp x10,x11,[x0,#48] stp x12,x13,[x0,#64] stp x14,x15,[x0,#80] ldp x19,x20,[x29,#2*__SIZEOF_POINTER__] ldp x21,x22,[x29,#4*__SIZEOF_POINTER__] ldr x29,[sp],#6*__SIZEOF_POINTER__ hint #29 ret .globl rshift_mod_384 .def rshift_mod_384; .type 32; .endef .p2align 5 rshift_mod_384: hint #25 stp x29,x30,[sp,#-6*__SIZEOF_POINTER__]! add x29,sp,#0 stp x19,x20,[sp,#2*__SIZEOF_POINTER__] stp x21,x22,[sp,#4*__SIZEOF_POINTER__] ldp x10,x11,[x1] ldp x12,x13,[x1,#16] ldp x14,x15,[x1,#32] ldp x4,x5,[x3] ldp x6,x7,[x3,#16] ldp x8,x9,[x3,#32] .Loop_rshift_mod_384: sub x2,x2,#1 bl __rshift_mod_384 cbnz x2,.Loop_rshift_mod_384 ldr x30,[sp,#__SIZEOF_POINTER__] stp x10,x11,[x0] stp x12,x13,[x0,#16] stp x14,x15,[x0,#32] ldp x19,x20,[x29,#2*__SIZEOF_POINTER__] ldp x21,x22,[x29,#4*__SIZEOF_POINTER__] ldr x29,[sp],#6*__SIZEOF_POINTER__ hint #29 ret .def __rshift_mod_384; .type 32; .endef .p2align 5 __rshift_mod_384: sbfx x22,x10,#0,#1 and x16,x22,x4 and x17,x22,x5 adds x10,x10,x16 and x19,x22,x6 adcs x11,x11,x17 and x20,x22,x7 adcs x12,x12,x19 and x21,x22,x8 adcs x13,x13,x20 and x22,x22,x9 adcs x14,x14,x21 extr x10,x11,x10,#1 // a[0:5] >>= 1 adcs x15,x15,x22 extr x11,x12,x11,#1 adc x22,xzr,xzr extr x12,x13,x12,#1 extr x13,x14,x13,#1 extr x14,x15,x14,#1 extr x15,x22,x15,#1 ret .globl div_by_2_mod_384 .def div_by_2_mod_384; .type 32; .endef .p2align 5 div_by_2_mod_384: hint #25 stp x29,x30,[sp,#-6*__SIZEOF_POINTER__]! add x29,sp,#0 stp x19,x20,[sp,#2*__SIZEOF_POINTER__] stp x21,x22,[sp,#4*__SIZEOF_POINTER__] ldp x10,x11,[x1] ldp x12,x13,[x1,#16] ldp x14,x15,[x1,#32] ldp x4,x5,[x2] ldp x6,x7,[x2,#16] ldp x8,x9,[x2,#32] bl __rshift_mod_384 ldr x30,[sp,#__SIZEOF_POINTER__] stp x10,x11,[x0] stp x12,x13,[x0,#16] stp x14,x15,[x0,#32] ldp x19,x20,[x29,#2*__SIZEOF_POINTER__] ldp x21,x22,[x29,#4*__SIZEOF_POINTER__] ldr x29,[sp],#6*__SIZEOF_POINTER__ hint #29 ret .globl lshift_mod_384 .def lshift_mod_384; .type 32; .endef .p2align 5 lshift_mod_384: hint #25 stp x29,x30,[sp,#-6*__SIZEOF_POINTER__]! 
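// Annotation, not emitted by the generator: both shift helpers keep
// the value fully reduced at every step. __rshift_mod_384 above
// halves in the field: sbfx turns bit 0 of the low limb into an
// all-ones/all-zeros mask, the modulus is conditionally added so the
// value becomes even, and the extr instructions then shift the six
// limbs (plus the carry limb) right by one. __lshift_mod_384 below
// doubles with an adds/adcs chain and conditionally subtracts the
// modulus via the same subs/sbcs/csel pattern as __add_mod_384. The
// exported lshift_mod_384/rshift_mod_384 just loop these helpers for
// the count passed in x2.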
add x29,sp,#0 stp x19,x20,[sp,#2*__SIZEOF_POINTER__] stp x21,x22,[sp,#4*__SIZEOF_POINTER__] ldp x10,x11,[x1] ldp x12,x13,[x1,#16] ldp x14,x15,[x1,#32] ldp x4,x5,[x3] ldp x6,x7,[x3,#16] ldp x8,x9,[x3,#32] .Loop_lshift_mod_384: sub x2,x2,#1 bl __lshift_mod_384 cbnz x2,.Loop_lshift_mod_384 ldr x30,[sp,#__SIZEOF_POINTER__] stp x10,x11,[x0] stp x12,x13,[x0,#16] stp x14,x15,[x0,#32] ldp x19,x20,[x29,#2*__SIZEOF_POINTER__] ldp x21,x22,[x29,#4*__SIZEOF_POINTER__] ldr x29,[sp],#6*__SIZEOF_POINTER__ hint #29 ret .def __lshift_mod_384; .type 32; .endef .p2align 5 __lshift_mod_384: adds x10,x10,x10 adcs x11,x11,x11 adcs x12,x12,x12 adcs x13,x13,x13 adcs x14,x14,x14 adcs x15,x15,x15 adc x3,xzr,xzr subs x16,x10,x4 sbcs x17,x11,x5 sbcs x19,x12,x6 sbcs x20,x13,x7 sbcs x21,x14,x8 sbcs x22,x15,x9 sbcs xzr,x3,xzr csel x10,x10,x16,lo csel x11,x11,x17,lo csel x12,x12,x19,lo csel x13,x13,x20,lo csel x14,x14,x21,lo csel x15,x15,x22,lo ret .globl mul_by_3_mod_384 .def mul_by_3_mod_384; .type 32; .endef .p2align 5 mul_by_3_mod_384: hint #25 stp x29,x30,[sp,#-6*__SIZEOF_POINTER__]! add x29,sp,#0 stp x19,x20,[sp,#2*__SIZEOF_POINTER__] stp x21,x22,[sp,#4*__SIZEOF_POINTER__] ldp x10,x11,[x1] ldp x12,x13,[x1,#16] ldp x14,x15,[x1,#32] ldp x4,x5,[x2] ldp x6,x7,[x2,#16] ldp x8,x9,[x2,#32] bl __lshift_mod_384 ldp x16,x17,[x1] ldp x19,x20,[x1,#16] ldp x21,x22,[x1,#32] bl __add_mod_384_ab_are_loaded ldr x30,[sp,#__SIZEOF_POINTER__] stp x10,x11,[x0] stp x12,x13,[x0,#16] stp x14,x15,[x0,#32] ldp x19,x20,[x29,#2*__SIZEOF_POINTER__] ldp x21,x22,[x29,#4*__SIZEOF_POINTER__] ldr x29,[sp],#6*__SIZEOF_POINTER__ hint #29 ret .globl mul_by_8_mod_384 .def mul_by_8_mod_384; .type 32; .endef .p2align 5 mul_by_8_mod_384: hint #25 stp x29,x30,[sp,#-6*__SIZEOF_POINTER__]! add x29,sp,#0 stp x19,x20,[sp,#2*__SIZEOF_POINTER__] stp x21,x22,[sp,#4*__SIZEOF_POINTER__] ldp x10,x11,[x1] ldp x12,x13,[x1,#16] ldp x14,x15,[x1,#32] ldp x4,x5,[x2] ldp x6,x7,[x2,#16] ldp x8,x9,[x2,#32] bl __lshift_mod_384 bl __lshift_mod_384 bl __lshift_mod_384 ldr x30,[sp,#__SIZEOF_POINTER__] stp x10,x11,[x0] stp x12,x13,[x0,#16] stp x14,x15,[x0,#32] ldp x19,x20,[x29,#2*__SIZEOF_POINTER__] ldp x21,x22,[x29,#4*__SIZEOF_POINTER__] ldr x29,[sp],#6*__SIZEOF_POINTER__ hint #29 ret .globl mul_by_3_mod_384x .def mul_by_3_mod_384x; .type 32; .endef .p2align 5 mul_by_3_mod_384x: hint #25 stp x29,x30,[sp,#-6*__SIZEOF_POINTER__]! add x29,sp,#0 stp x19,x20,[sp,#2*__SIZEOF_POINTER__] stp x21,x22,[sp,#4*__SIZEOF_POINTER__] ldp x10,x11,[x1] ldp x12,x13,[x1,#16] ldp x14,x15,[x1,#32] ldp x4,x5,[x2] ldp x6,x7,[x2,#16] ldp x8,x9,[x2,#32] bl __lshift_mod_384 ldp x16,x17,[x1] ldp x19,x20,[x1,#16] ldp x21,x22,[x1,#32] bl __add_mod_384_ab_are_loaded stp x10,x11,[x0] ldp x10,x11,[x1,#48] stp x12,x13,[x0,#16] ldp x12,x13,[x1,#64] stp x14,x15,[x0,#32] ldp x14,x15,[x1,#80] bl __lshift_mod_384 ldp x16,x17,[x1,#48] ldp x19,x20,[x1,#64] ldp x21,x22,[x1,#80] bl __add_mod_384_ab_are_loaded ldr x30,[sp,#__SIZEOF_POINTER__] stp x10,x11,[x0,#48] stp x12,x13,[x0,#64] stp x14,x15,[x0,#80] ldp x19,x20,[x29,#2*__SIZEOF_POINTER__] ldp x21,x22,[x29,#4*__SIZEOF_POINTER__] ldr x29,[sp],#6*__SIZEOF_POINTER__ hint #29 ret .globl mul_by_8_mod_384x .def mul_by_8_mod_384x; .type 32; .endef .p2align 5 mul_by_8_mod_384x: hint #25 stp x29,x30,[sp,#-6*__SIZEOF_POINTER__]! 
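// Annotation, not emitted by the generator: the small-multiple
// routines avoid a real multiplier. mul_by_3_mod_384 is one modular
// doubling followed by a modular add of the original operand (through
// __add_mod_384_ab_are_loaded, which skips the memory loads), and
// mul_by_8_mod_384 is three successive doublings. The *_mod_384x
// variants, like the one entered here, repeat the same sequence for
// both halves of an Fp2 element, whose real and imaginary parts sit
// 48 bytes apart.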
add x29,sp,#0 stp x19,x20,[sp,#2*__SIZEOF_POINTER__] stp x21,x22,[sp,#4*__SIZEOF_POINTER__] ldp x10,x11,[x1] ldp x12,x13,[x1,#16] ldp x14,x15,[x1,#32] ldp x4,x5,[x2] ldp x6,x7,[x2,#16] ldp x8,x9,[x2,#32] bl __lshift_mod_384 bl __lshift_mod_384 bl __lshift_mod_384 stp x10,x11,[x0] ldp x10,x11,[x1,#48] stp x12,x13,[x0,#16] ldp x12,x13,[x1,#64] stp x14,x15,[x0,#32] ldp x14,x15,[x1,#80] bl __lshift_mod_384 bl __lshift_mod_384 bl __lshift_mod_384 ldr x30,[sp,#__SIZEOF_POINTER__] stp x10,x11,[x0,#48] stp x12,x13,[x0,#64] stp x14,x15,[x0,#80] ldp x19,x20,[x29,#2*__SIZEOF_POINTER__] ldp x21,x22,[x29,#4*__SIZEOF_POINTER__] ldr x29,[sp],#6*__SIZEOF_POINTER__ hint #29 ret .globl cneg_mod_384 .def cneg_mod_384; .type 32; .endef .p2align 5 cneg_mod_384: hint #25 stp x29,x30,[sp,#-6*__SIZEOF_POINTER__]! add x29,sp,#0 stp x19,x20,[sp,#2*__SIZEOF_POINTER__] stp x21,x22,[sp,#4*__SIZEOF_POINTER__] ldp x10,x11,[x1] ldp x4,x5,[x3] ldp x12,x13,[x1,#16] ldp x6,x7,[x3,#16] subs x16,x4,x10 ldp x14,x15,[x1,#32] ldp x8,x9,[x3,#32] orr x3,x10,x11 sbcs x17,x5,x11 orr x3,x3,x12 sbcs x19,x6,x12 orr x3,x3,x13 sbcs x20,x7,x13 orr x3,x3,x14 sbcs x21,x8,x14 orr x3,x3,x15 sbc x22,x9,x15 cmp x3,#0 csetm x3,ne ands x2,x2,x3 csel x10,x10,x16,eq csel x11,x11,x17,eq csel x12,x12,x19,eq csel x13,x13,x20,eq stp x10,x11,[x0] csel x14,x14,x21,eq stp x12,x13,[x0,#16] csel x15,x15,x22,eq stp x14,x15,[x0,#32] ldp x19,x20,[x29,#2*__SIZEOF_POINTER__] ldp x21,x22,[x29,#4*__SIZEOF_POINTER__] ldr x29,[sp],#6*__SIZEOF_POINTER__ hint #29 ret .globl sub_mod_384 .def sub_mod_384; .type 32; .endef .p2align 5 sub_mod_384: hint #25 stp x29,x30,[sp,#-6*__SIZEOF_POINTER__]! add x29,sp,#0 stp x19,x20,[sp,#2*__SIZEOF_POINTER__] stp x21,x22,[sp,#4*__SIZEOF_POINTER__] ldp x4,x5,[x3] ldp x6,x7,[x3,#16] ldp x8,x9,[x3,#32] bl __sub_mod_384 ldr x30,[sp,#__SIZEOF_POINTER__] stp x10,x11,[x0] stp x12,x13,[x0,#16] stp x14,x15,[x0,#32] ldp x19,x20,[x29,#2*__SIZEOF_POINTER__] ldp x21,x22,[x29,#4*__SIZEOF_POINTER__] ldr x29,[sp],#6*__SIZEOF_POINTER__ hint #29 ret .def __sub_mod_384; .type 32; .endef .p2align 5 __sub_mod_384: ldp x10,x11,[x1] ldp x16,x17,[x2] ldp x12,x13,[x1,#16] ldp x19,x20,[x2,#16] ldp x14,x15,[x1,#32] ldp x21,x22,[x2,#32] subs x10,x10,x16 sbcs x11,x11,x17 sbcs x12,x12,x19 sbcs x13,x13,x20 sbcs x14,x14,x21 sbcs x15,x15,x22 sbc x3,xzr,xzr and x16,x4,x3 and x17,x5,x3 adds x10,x10,x16 and x19,x6,x3 adcs x11,x11,x17 and x20,x7,x3 adcs x12,x12,x19 and x21,x8,x3 adcs x13,x13,x20 and x22,x9,x3 adcs x14,x14,x21 adc x15,x15,x22 ret .globl sub_mod_384x .def sub_mod_384x; .type 32; .endef .p2align 5 sub_mod_384x: hint #25 stp x29,x30,[sp,#-6*__SIZEOF_POINTER__]! add x29,sp,#0 stp x19,x20,[sp,#2*__SIZEOF_POINTER__] stp x21,x22,[sp,#4*__SIZEOF_POINTER__] ldp x4,x5,[x3] ldp x6,x7,[x3,#16] ldp x8,x9,[x3,#32] bl __sub_mod_384 stp x10,x11,[x0] add x1,x1,#48 stp x12,x13,[x0,#16] add x2,x2,#48 stp x14,x15,[x0,#32] bl __sub_mod_384 ldr x30,[sp,#__SIZEOF_POINTER__] stp x10,x11,[x0,#48] stp x12,x13,[x0,#64] stp x14,x15,[x0,#80] ldp x19,x20,[x29,#2*__SIZEOF_POINTER__] ldp x21,x22,[x29,#4*__SIZEOF_POINTER__] ldr x29,[sp],#6*__SIZEOF_POINTER__ hint #29 ret .globl mul_by_1_plus_i_mod_384x .def mul_by_1_plus_i_mod_384x; .type 32; .endef .p2align 5 mul_by_1_plus_i_mod_384x: hint #25 stp x29,x30,[sp,#-6*__SIZEOF_POINTER__]! 
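// Annotation, not emitted by the generator: multiplying an Fp2
// element a = re + im*i by (1 + i) expands to
//
//   (re + im*i)*(1 + i) = (re - im) + (re + im)*i
//
// so the routine below needs no field multiplication at all: one
// modular subtraction for the new real part and one modular addition
// for the new imaginary part, reusing __sub_mod_384 and
// __add_mod_384_ab_are_loaded.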
add x29,sp,#0 stp x19,x20,[sp,#2*__SIZEOF_POINTER__] stp x21,x22,[sp,#4*__SIZEOF_POINTER__] ldp x4,x5,[x2] ldp x6,x7,[x2,#16] ldp x8,x9,[x2,#32] add x2,x1,#48 bl __sub_mod_384 // a->re - a->im ldp x16,x17,[x1] ldp x19,x20,[x1,#16] ldp x21,x22,[x1,#32] stp x10,x11,[x0] ldp x10,x11,[x1,#48] stp x12,x13,[x0,#16] ldp x12,x13,[x1,#64] stp x14,x15,[x0,#32] ldp x14,x15,[x1,#80] bl __add_mod_384_ab_are_loaded // a->re + a->im ldr x30,[sp,#__SIZEOF_POINTER__] stp x10,x11,[x0,#48] stp x12,x13,[x0,#64] stp x14,x15,[x0,#80] ldp x19,x20,[x29,#2*__SIZEOF_POINTER__] ldp x21,x22,[x29,#4*__SIZEOF_POINTER__] ldr x29,[sp],#6*__SIZEOF_POINTER__ hint #29 ret .globl sgn0_pty_mod_384 .def sgn0_pty_mod_384; .type 32; .endef .p2align 5 sgn0_pty_mod_384: hint #34 ldp x10,x11,[x0] ldp x12,x13,[x0,#16] ldp x14,x15,[x0,#32] ldp x4,x5,[x1] ldp x6,x7,[x1,#16] ldp x8,x9,[x1,#32] and x0,x10,#1 adds x10,x10,x10 adcs x11,x11,x11 adcs x12,x12,x12 adcs x13,x13,x13 adcs x14,x14,x14 adcs x15,x15,x15 adc x3,xzr,xzr subs x10,x10,x4 sbcs x11,x11,x5 sbcs x12,x12,x6 sbcs x13,x13,x7 sbcs x14,x14,x8 sbcs x15,x15,x9 sbc x3,x3,xzr mvn x3,x3 and x3,x3,#2 orr x0,x0,x3 ret .globl sgn0_pty_mod_384x .def sgn0_pty_mod_384x; .type 32; .endef .p2align 5 sgn0_pty_mod_384x: hint #34 ldp x10,x11,[x0] ldp x12,x13,[x0,#16] ldp x14,x15,[x0,#32] ldp x4,x5,[x1] ldp x6,x7,[x1,#16] ldp x8,x9,[x1,#32] and x2,x10,#1 orr x3,x10,x11 adds x10,x10,x10 orr x3,x3,x12 adcs x11,x11,x11 orr x3,x3,x13 adcs x12,x12,x12 orr x3,x3,x14 adcs x13,x13,x13 orr x3,x3,x15 adcs x14,x14,x14 adcs x15,x15,x15 adc x16,xzr,xzr subs x10,x10,x4 sbcs x11,x11,x5 sbcs x12,x12,x6 sbcs x13,x13,x7 sbcs x14,x14,x8 sbcs x15,x15,x9 sbc x16,x16,xzr ldp x10,x11,[x0,#48] ldp x12,x13,[x0,#64] ldp x14,x15,[x0,#80] mvn x16,x16 and x16,x16,#2 orr x2,x2,x16 and x0,x10,#1 orr x1,x10,x11 adds x10,x10,x10 orr x1,x1,x12 adcs x11,x11,x11 orr x1,x1,x13 adcs x12,x12,x12 orr x1,x1,x14 adcs x13,x13,x13 orr x1,x1,x15 adcs x14,x14,x14 adcs x15,x15,x15 adc x16,xzr,xzr subs x10,x10,x4 sbcs x11,x11,x5 sbcs x12,x12,x6 sbcs x13,x13,x7 sbcs x14,x14,x8 sbcs x15,x15,x9 sbc x16,x16,xzr mvn x16,x16 and x16,x16,#2 orr x0,x0,x16 cmp x3,#0 csel x3,x0,x2,eq // a->re==0? prty(a->im) : prty(a->re) cmp x1,#0 csel x1,x0,x2,ne // a->im!=0? 
sgn0(a->im) : sgn0(a->re) and x3,x3,#1 and x1,x1,#2 orr x0,x1,x3 // pack sign and parity ret .globl vec_select_32 .def vec_select_32; .type 32; .endef .p2align 5 vec_select_32: hint #34 dup v6.2d, x3 ld1 {v0.2d, v1.2d}, [x1] cmeq v6.2d, v6.2d, #0 ld1 {v3.2d, v4.2d}, [x2] bit v0.16b, v3.16b, v6.16b bit v1.16b, v4.16b, v6.16b st1 {v0.2d, v1.2d}, [x0] ret .globl vec_select_48 .def vec_select_48; .type 32; .endef .p2align 5 vec_select_48: hint #34 dup v6.2d, x3 ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 cmeq v6.2d, v6.2d, #0 ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 bit v0.16b, v3.16b, v6.16b bit v1.16b, v4.16b, v6.16b bit v2.16b, v5.16b, v6.16b st1 {v0.2d, v1.2d, v2.2d}, [x0] ret .globl vec_select_96 .def vec_select_96; .type 32; .endef .p2align 5 vec_select_96: hint #34 dup v6.2d, x3 ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 cmeq v6.2d, v6.2d, #0 ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 bit v0.16b, v3.16b, v6.16b ld1 {v16.2d, v17.2d, v18.2d}, [x1],#48 bit v1.16b, v4.16b, v6.16b ld1 {v19.2d, v20.2d, v21.2d}, [x2],#48 bit v2.16b, v5.16b, v6.16b st1 {v0.2d, v1.2d, v2.2d}, [x0],#48 bit v16.16b, v19.16b, v6.16b bit v17.16b, v20.16b, v6.16b bit v18.16b, v21.16b, v6.16b st1 {v16.2d, v17.2d, v18.2d}, [x0] ret .globl vec_select_192 .def vec_select_192; .type 32; .endef .p2align 5 vec_select_192: hint #34 dup v6.2d, x3 ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 cmeq v6.2d, v6.2d, #0 ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 bit v0.16b, v3.16b, v6.16b ld1 {v16.2d, v17.2d, v18.2d}, [x1],#48 bit v1.16b, v4.16b, v6.16b ld1 {v19.2d, v20.2d, v21.2d}, [x2],#48 bit v2.16b, v5.16b, v6.16b st1 {v0.2d, v1.2d, v2.2d}, [x0],#48 bit v16.16b, v19.16b, v6.16b ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 bit v17.16b, v20.16b, v6.16b ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 bit v18.16b, v21.16b, v6.16b st1 {v16.2d, v17.2d, v18.2d}, [x0],#48 bit v0.16b, v3.16b, v6.16b ld1 {v16.2d, v17.2d, v18.2d}, [x1],#48 bit v1.16b, v4.16b, v6.16b ld1 {v19.2d, v20.2d, v21.2d}, [x2],#48 bit v2.16b, v5.16b, v6.16b st1 {v0.2d, v1.2d, v2.2d}, [x0],#48 bit v16.16b, v19.16b, v6.16b bit v17.16b, v20.16b, v6.16b bit v18.16b, v21.16b, v6.16b st1 {v16.2d, v17.2d, v18.2d}, [x0] ret .globl vec_select_144 .def vec_select_144; .type 32; .endef .p2align 5 vec_select_144: hint #34 dup v6.2d, x3 ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 cmeq v6.2d, v6.2d, #0 ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 bit v0.16b, v3.16b, v6.16b ld1 {v16.2d, v17.2d, v18.2d}, [x1],#48 bit v1.16b, v4.16b, v6.16b ld1 {v19.2d, v20.2d, v21.2d}, [x2],#48 bit v2.16b, v5.16b, v6.16b st1 {v0.2d, v1.2d, v2.2d}, [x0],#48 bit v16.16b, v19.16b, v6.16b ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 bit v17.16b, v20.16b, v6.16b ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 bit v18.16b, v21.16b, v6.16b st1 {v16.2d, v17.2d, v18.2d}, [x0],#48 bit v0.16b, v3.16b, v6.16b bit v1.16b, v4.16b, v6.16b bit v2.16b, v5.16b, v6.16b st1 {v0.2d, v1.2d, v2.2d}, [x0] ret .globl vec_select_288 .def vec_select_288; .type 32; .endef .p2align 5 vec_select_288: hint #34 dup v6.2d, x3 ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 cmeq v6.2d, v6.2d, #0 ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 bit v0.16b, v3.16b, v6.16b ld1 {v16.2d, v17.2d, v18.2d}, [x1],#48 bit v1.16b, v4.16b, v6.16b ld1 {v19.2d, v20.2d, v21.2d}, [x2],#48 bit v2.16b, v5.16b, v6.16b st1 {v0.2d, v1.2d, v2.2d}, [x0],#48 bit v16.16b, v19.16b, v6.16b ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 bit v17.16b, v20.16b, v6.16b ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 bit v18.16b, v21.16b, v6.16b st1 {v16.2d, v17.2d, v18.2d}, [x0],#48 bit v0.16b, v3.16b, v6.16b ld1 {v16.2d, v17.2d, v18.2d}, [x1],#48 bit v1.16b, v4.16b, v6.16b ld1 {v19.2d, v20.2d, v21.2d}, 
[x2],#48 bit v2.16b, v5.16b, v6.16b st1 {v0.2d, v1.2d, v2.2d}, [x0],#48 bit v16.16b, v19.16b, v6.16b ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 bit v17.16b, v20.16b, v6.16b ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 bit v18.16b, v21.16b, v6.16b st1 {v16.2d, v17.2d, v18.2d}, [x0],#48 bit v0.16b, v3.16b, v6.16b ld1 {v16.2d, v17.2d, v18.2d}, [x1],#48 bit v1.16b, v4.16b, v6.16b ld1 {v19.2d, v20.2d, v21.2d}, [x2],#48 bit v2.16b, v5.16b, v6.16b st1 {v0.2d, v1.2d, v2.2d}, [x0],#48 bit v16.16b, v19.16b, v6.16b bit v17.16b, v20.16b, v6.16b bit v18.16b, v21.16b, v6.16b st1 {v16.2d, v17.2d, v18.2d}, [x0] ret .globl vec_prefetch .def vec_prefetch; .type 32; .endef .p2align 5 vec_prefetch: hint #34 add x1, x1, x0 sub x1, x1, #1 mov x2, #64 prfm pldl1keep, [x0] add x0, x0, x2 cmp x0, x1 csel x0, x1, x0, hi csel x2, xzr, x2, hi prfm pldl1keep, [x0] add x0, x0, x2 cmp x0, x1 csel x0, x1, x0, hi csel x2, xzr, x2, hi prfm pldl1keep, [x0] add x0, x0, x2 cmp x0, x1 csel x0, x1, x0, hi csel x2, xzr, x2, hi prfm pldl1keep, [x0] add x0, x0, x2 cmp x0, x1 csel x0, x1, x0, hi csel x2, xzr, x2, hi prfm pldl1keep, [x0] add x0, x0, x2 cmp x0, x1 csel x0, x1, x0, hi csel x2, xzr, x2, hi prfm pldl1keep, [x0] add x0, x0, x2 cmp x0, x1 csel x0, x1, x0, hi prfm pldl1keep, [x0] ret .globl vec_is_zero_16x .def vec_is_zero_16x; .type 32; .endef .p2align 5 vec_is_zero_16x: hint #34 ld1 {v0.2d}, [x0], #16 lsr x1, x1, #4 sub x1, x1, #1 cbz x1, .Loop_is_zero_done .Loop_is_zero: ld1 {v1.2d}, [x0], #16 orr v0.16b, v0.16b, v1.16b sub x1, x1, #1 cbnz x1, .Loop_is_zero .Loop_is_zero_done: dup v1.2d, v0.d[1] orr v0.16b, v0.16b, v1.16b umov x1, v0.d[0] mov x0, #1 cmp x1, #0 csel x0, x0, xzr, eq ret .globl vec_is_equal_16x .def vec_is_equal_16x; .type 32; .endef .p2align 5 vec_is_equal_16x: hint #34 ld1 {v0.2d}, [x0], #16 ld1 {v1.2d}, [x1], #16 lsr x2, x2, #4 eor v0.16b, v0.16b, v1.16b .Loop_is_equal: sub x2, x2, #1 cbz x2, .Loop_is_equal_done ld1 {v1.2d}, [x0], #16 ld1 {v2.2d}, [x1], #16 eor v1.16b, v1.16b, v2.16b orr v0.16b, v0.16b, v1.16b b .Loop_is_equal nop .Loop_is_equal_done: dup v1.2d, v0.d[1] orr v0.16b, v0.16b, v1.16b umov x1, v0.d[0] mov x0, #1 cmp x1, #0 csel x0, x0, xzr, eq ret ================================================ FILE: build/coff/add_mod_384-x86_64.s ================================================ .text .globl add_mod_384 .def add_mod_384; .scl 2; .type 32; .endef .p2align 5 add_mod_384: .byte 0xf3,0x0f,0x1e,0xfa movq %rdi,8(%rsp) movq %rsi,16(%rsp) movq %rsp,%r11 .LSEH_begin_add_mod_384: pushq %rbp movq %rcx,%rdi movq %rdx,%rsi movq %r8,%rdx movq %r9,%rcx pushq %rbx pushq %r12 pushq %r13 pushq %r14 pushq %r15 subq $8,%rsp .LSEH_body_add_mod_384: call __add_mod_384 movq 8(%rsp),%r15 movq 16(%rsp),%r14 movq 24(%rsp),%r13 movq 32(%rsp),%r12 movq 40(%rsp),%rbx movq 48(%rsp),%rbp leaq 56(%rsp),%rsp .LSEH_epilogue_add_mod_384: mov 8(%rsp),%rdi mov 16(%rsp),%rsi #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .LSEH_end_add_mod_384: .def __add_mod_384; .scl 3; .type 32; .endef .p2align 5 __add_mod_384: .byte 0xf3,0x0f,0x1e,0xfa #ifdef __SGX_LVI_HARDENING__ lfence #endif movq 0(%rsi),%r8 movq 8(%rsi),%r9 movq 16(%rsi),%r10 movq 24(%rsi),%r11 movq 32(%rsi),%r12 movq 40(%rsi),%r13 __add_mod_384_a_is_loaded: addq 0(%rdx),%r8 adcq 8(%rdx),%r9 adcq 16(%rdx),%r10 movq %r8,%r14 adcq 24(%rdx),%r11 movq %r9,%r15 adcq 32(%rdx),%r12 movq %r10,%rax adcq 40(%rdx),%r13 movq %r11,%rbx sbbq %rdx,%rdx subq 0(%rcx),%r8 sbbq 8(%rcx),%r9 movq %r12,%rbp sbbq 16(%rcx),%r10 sbbq 24(%rcx),%r11 sbbq 32(%rcx),%r12 
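/*
 * Annotation, not emitted by the generator: this COFF flavour adapts
 * the Windows x64 calling convention to the SysV register usage of
 * the function bodies. Every public entry saves the callee-saved
 * %rdi/%rsi to the caller's 8(%rsp)/16(%rsp) home slots, shuffles
 * %rcx/%rdx/%r8/%r9 into %rdi/%rsi/%rdx/%rcx, and restores %rdi/%rsi
 * in its .LSEH_epilogue_* block. The reduction in progress here
 * mirrors the AArch64 code: sbbq %rdx,%rdx records the carry out of
 * the addq/adcq chain, the subq/sbbq chain trial-subtracts the
 * modulus, and sbbq $0,%rdx followed by the cmovcq sequence restores
 * the unreduced sum if the subtraction borrowed, with no
 * data-dependent branch.
 */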
movq %r13,%rsi sbbq 40(%rcx),%r13 sbbq $0,%rdx cmovcq %r14,%r8 cmovcq %r15,%r9 cmovcq %rax,%r10 movq %r8,0(%rdi) cmovcq %rbx,%r11 movq %r9,8(%rdi) cmovcq %rbp,%r12 movq %r10,16(%rdi) cmovcq %rsi,%r13 movq %r11,24(%rdi) movq %r12,32(%rdi) movq %r13,40(%rdi) #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .globl add_mod_384x .def add_mod_384x; .scl 2; .type 32; .endef .p2align 5 add_mod_384x: .byte 0xf3,0x0f,0x1e,0xfa movq %rdi,8(%rsp) movq %rsi,16(%rsp) movq %rsp,%r11 .LSEH_begin_add_mod_384x: pushq %rbp movq %rcx,%rdi movq %rdx,%rsi movq %r8,%rdx movq %r9,%rcx pushq %rbx pushq %r12 pushq %r13 pushq %r14 pushq %r15 subq $24,%rsp .LSEH_body_add_mod_384x: movq %rsi,0(%rsp) movq %rdx,8(%rsp) leaq 48(%rsi),%rsi leaq 48(%rdx),%rdx leaq 48(%rdi),%rdi call __add_mod_384 movq 0(%rsp),%rsi movq 8(%rsp),%rdx leaq -48(%rdi),%rdi call __add_mod_384 movq 24+0(%rsp),%r15 movq 24+8(%rsp),%r14 movq 24+16(%rsp),%r13 movq 24+24(%rsp),%r12 movq 24+32(%rsp),%rbx movq 24+40(%rsp),%rbp leaq 24+48(%rsp),%rsp .LSEH_epilogue_add_mod_384x: mov 8(%rsp),%rdi mov 16(%rsp),%rsi #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .LSEH_end_add_mod_384x: .globl rshift_mod_384 .def rshift_mod_384; .scl 2; .type 32; .endef .p2align 5 rshift_mod_384: .byte 0xf3,0x0f,0x1e,0xfa movq %rdi,8(%rsp) movq %rsi,16(%rsp) movq %rsp,%r11 .LSEH_begin_rshift_mod_384: pushq %rbp movq %rcx,%rdi movq %rdx,%rsi movq %r8,%rdx movq %r9,%rcx pushq %rbx pushq %r12 pushq %r13 pushq %r14 pushq %r15 pushq %rdi .LSEH_body_rshift_mod_384: #ifdef __SGX_LVI_HARDENING__ lfence #endif movq 0(%rsi),%r8 movq 8(%rsi),%r9 movq 16(%rsi),%r10 movq 24(%rsi),%r11 movq 32(%rsi),%r12 movq 40(%rsi),%r13 .Loop_rshift_mod_384: call __rshift_mod_384 decl %edx jnz .Loop_rshift_mod_384 movq %r8,0(%rdi) movq %r9,8(%rdi) movq %r10,16(%rdi) movq %r11,24(%rdi) movq %r12,32(%rdi) movq %r13,40(%rdi) movq 8(%rsp),%r15 movq 16(%rsp),%r14 movq 24(%rsp),%r13 movq 32(%rsp),%r12 movq 40(%rsp),%rbx movq 48(%rsp),%rbp leaq 56(%rsp),%rsp .LSEH_epilogue_rshift_mod_384: mov 8(%rsp),%rdi mov 16(%rsp),%rsi #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .LSEH_end_rshift_mod_384: .def __rshift_mod_384; .scl 3; .type 32; .endef .p2align 5 __rshift_mod_384: .byte 0xf3,0x0f,0x1e,0xfa movq $1,%rsi movq 0(%rcx),%r14 andq %r8,%rsi movq 8(%rcx),%r15 negq %rsi movq 16(%rcx),%rax andq %rsi,%r14 movq 24(%rcx),%rbx andq %rsi,%r15 movq 32(%rcx),%rbp andq %rsi,%rax andq %rsi,%rbx andq %rsi,%rbp andq 40(%rcx),%rsi addq %r8,%r14 adcq %r9,%r15 adcq %r10,%rax adcq %r11,%rbx adcq %r12,%rbp adcq %r13,%rsi sbbq %r13,%r13 shrq $1,%r14 movq %r15,%r8 shrq $1,%r15 movq %rax,%r9 shrq $1,%rax movq %rbx,%r10 shrq $1,%rbx movq %rbp,%r11 shrq $1,%rbp movq %rsi,%r12 shrq $1,%rsi shlq $63,%r8 shlq $63,%r9 orq %r14,%r8 shlq $63,%r10 orq %r15,%r9 shlq $63,%r11 orq %rax,%r10 shlq $63,%r12 orq %rbx,%r11 shlq $63,%r13 orq %rbp,%r12 orq %rsi,%r13 #ifdef __SGX_LVI_HARDENING__ popq %r14 lfence jmpq *%r14 ud2 #else .byte 0xf3,0xc3 #endif .globl div_by_2_mod_384 .def div_by_2_mod_384; .scl 2; .type 32; .endef .p2align 5 div_by_2_mod_384: .byte 0xf3,0x0f,0x1e,0xfa movq %rdi,8(%rsp) movq %rsi,16(%rsp) movq %rsp,%r11 .LSEH_begin_div_by_2_mod_384: pushq %rbp movq %rcx,%rdi movq %rdx,%rsi movq %r8,%rdx pushq %rbx pushq %r12 pushq %r13 pushq %r14 pushq %r15 pushq %rdi .LSEH_body_div_by_2_mod_384: #ifdef __SGX_LVI_HARDENING__ lfence #endif movq 0(%rsi),%r8 movq %rdx,%rcx movq 8(%rsi),%r9 movq 16(%rsi),%r10 movq 
24(%rsi),%r11 movq 32(%rsi),%r12 movq 40(%rsi),%r13 call __rshift_mod_384 movq %r8,0(%rdi) movq %r9,8(%rdi) movq %r10,16(%rdi) movq %r11,24(%rdi) movq %r12,32(%rdi) movq %r13,40(%rdi) movq 8(%rsp),%r15 movq 16(%rsp),%r14 movq 24(%rsp),%r13 movq 32(%rsp),%r12 movq 40(%rsp),%rbx movq 48(%rsp),%rbp leaq 56(%rsp),%rsp .LSEH_epilogue_div_by_2_mod_384: mov 8(%rsp),%rdi mov 16(%rsp),%rsi #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .LSEH_end_div_by_2_mod_384: .globl lshift_mod_384 .def lshift_mod_384; .scl 2; .type 32; .endef .p2align 5 lshift_mod_384: .byte 0xf3,0x0f,0x1e,0xfa movq %rdi,8(%rsp) movq %rsi,16(%rsp) movq %rsp,%r11 .LSEH_begin_lshift_mod_384: pushq %rbp movq %rcx,%rdi movq %rdx,%rsi movq %r8,%rdx movq %r9,%rcx pushq %rbx pushq %r12 pushq %r13 pushq %r14 pushq %r15 pushq %rdi .LSEH_body_lshift_mod_384: #ifdef __SGX_LVI_HARDENING__ lfence #endif movq 0(%rsi),%r8 movq 8(%rsi),%r9 movq 16(%rsi),%r10 movq 24(%rsi),%r11 movq 32(%rsi),%r12 movq 40(%rsi),%r13 .Loop_lshift_mod_384: addq %r8,%r8 adcq %r9,%r9 adcq %r10,%r10 movq %r8,%r14 adcq %r11,%r11 movq %r9,%r15 adcq %r12,%r12 movq %r10,%rax adcq %r13,%r13 movq %r11,%rbx sbbq %rdi,%rdi subq 0(%rcx),%r8 sbbq 8(%rcx),%r9 movq %r12,%rbp sbbq 16(%rcx),%r10 sbbq 24(%rcx),%r11 sbbq 32(%rcx),%r12 movq %r13,%rsi sbbq 40(%rcx),%r13 sbbq $0,%rdi movq (%rsp),%rdi cmovcq %r14,%r8 cmovcq %r15,%r9 cmovcq %rax,%r10 cmovcq %rbx,%r11 cmovcq %rbp,%r12 cmovcq %rsi,%r13 decl %edx jnz .Loop_lshift_mod_384 movq %r8,0(%rdi) movq %r9,8(%rdi) movq %r10,16(%rdi) movq %r11,24(%rdi) movq %r12,32(%rdi) movq %r13,40(%rdi) movq 8(%rsp),%r15 movq 16(%rsp),%r14 movq 24(%rsp),%r13 movq 32(%rsp),%r12 movq 40(%rsp),%rbx movq 48(%rsp),%rbp leaq 56(%rsp),%rsp .LSEH_epilogue_lshift_mod_384: mov 8(%rsp),%rdi mov 16(%rsp),%rsi #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .LSEH_end_lshift_mod_384: .def __lshift_mod_384; .scl 3; .type 32; .endef .p2align 5 __lshift_mod_384: .byte 0xf3,0x0f,0x1e,0xfa addq %r8,%r8 adcq %r9,%r9 adcq %r10,%r10 movq %r8,%r14 adcq %r11,%r11 movq %r9,%r15 adcq %r12,%r12 movq %r10,%rax adcq %r13,%r13 movq %r11,%rbx sbbq %rdx,%rdx subq 0(%rcx),%r8 sbbq 8(%rcx),%r9 movq %r12,%rbp sbbq 16(%rcx),%r10 sbbq 24(%rcx),%r11 sbbq 32(%rcx),%r12 movq %r13,%rsi sbbq 40(%rcx),%r13 sbbq $0,%rdx cmovcq %r14,%r8 cmovcq %r15,%r9 cmovcq %rax,%r10 cmovcq %rbx,%r11 cmovcq %rbp,%r12 cmovcq %rsi,%r13 #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .globl mul_by_3_mod_384 .def mul_by_3_mod_384; .scl 2; .type 32; .endef .p2align 5 mul_by_3_mod_384: .byte 0xf3,0x0f,0x1e,0xfa movq %rdi,8(%rsp) movq %rsi,16(%rsp) movq %rsp,%r11 .LSEH_begin_mul_by_3_mod_384: pushq %rbp movq %rcx,%rdi movq %rdx,%rsi movq %r8,%rdx pushq %rbx pushq %r12 pushq %r13 pushq %r14 pushq %r15 pushq %rsi .LSEH_body_mul_by_3_mod_384: #ifdef __SGX_LVI_HARDENING__ lfence #endif movq 0(%rsi),%r8 movq 8(%rsi),%r9 movq 16(%rsi),%r10 movq 24(%rsi),%r11 movq 32(%rsi),%r12 movq 40(%rsi),%r13 movq %rdx,%rcx call __lshift_mod_384 movq (%rsp),%rdx #ifdef __SGX_LVI_HARDENING__ lfence #endif call __add_mod_384_a_is_loaded movq 8(%rsp),%r15 movq 16(%rsp),%r14 movq 24(%rsp),%r13 movq 32(%rsp),%r12 movq 40(%rsp),%rbx movq 48(%rsp),%rbp leaq 56(%rsp),%rsp .LSEH_epilogue_mul_by_3_mod_384: mov 8(%rsp),%rdi mov 16(%rsp),%rsi #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .LSEH_end_mul_by_3_mod_384: .globl mul_by_8_mod_384 .def mul_by_8_mod_384; .scl 2; .type 
32; .endef .p2align 5 mul_by_8_mod_384: .byte 0xf3,0x0f,0x1e,0xfa movq %rdi,8(%rsp) movq %rsi,16(%rsp) movq %rsp,%r11 .LSEH_begin_mul_by_8_mod_384: pushq %rbp movq %rcx,%rdi movq %rdx,%rsi movq %r8,%rdx pushq %rbx pushq %r12 pushq %r13 pushq %r14 pushq %r15 subq $8,%rsp .LSEH_body_mul_by_8_mod_384: #ifdef __SGX_LVI_HARDENING__ lfence #endif movq 0(%rsi),%r8 movq 8(%rsi),%r9 movq 16(%rsi),%r10 movq 24(%rsi),%r11 movq 32(%rsi),%r12 movq 40(%rsi),%r13 movq %rdx,%rcx call __lshift_mod_384 call __lshift_mod_384 call __lshift_mod_384 movq %r8,0(%rdi) movq %r9,8(%rdi) movq %r10,16(%rdi) movq %r11,24(%rdi) movq %r12,32(%rdi) movq %r13,40(%rdi) movq 8(%rsp),%r15 movq 16(%rsp),%r14 movq 24(%rsp),%r13 movq 32(%rsp),%r12 movq 40(%rsp),%rbx movq 48(%rsp),%rbp leaq 56(%rsp),%rsp .LSEH_epilogue_mul_by_8_mod_384: mov 8(%rsp),%rdi mov 16(%rsp),%rsi #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .LSEH_end_mul_by_8_mod_384: .globl mul_by_3_mod_384x .def mul_by_3_mod_384x; .scl 2; .type 32; .endef .p2align 5 mul_by_3_mod_384x: .byte 0xf3,0x0f,0x1e,0xfa movq %rdi,8(%rsp) movq %rsi,16(%rsp) movq %rsp,%r11 .LSEH_begin_mul_by_3_mod_384x: pushq %rbp movq %rcx,%rdi movq %rdx,%rsi movq %r8,%rdx pushq %rbx pushq %r12 pushq %r13 pushq %r14 pushq %r15 pushq %rsi .LSEH_body_mul_by_3_mod_384x: #ifdef __SGX_LVI_HARDENING__ lfence #endif movq 0(%rsi),%r8 movq 8(%rsi),%r9 movq 16(%rsi),%r10 movq 24(%rsi),%r11 movq 32(%rsi),%r12 movq 40(%rsi),%r13 movq %rdx,%rcx call __lshift_mod_384 movq (%rsp),%rdx #ifdef __SGX_LVI_HARDENING__ lfence #endif call __add_mod_384_a_is_loaded movq (%rsp),%rsi leaq 48(%rdi),%rdi #ifdef __SGX_LVI_HARDENING__ lfence #endif movq 48(%rsi),%r8 movq 56(%rsi),%r9 movq 64(%rsi),%r10 movq 72(%rsi),%r11 movq 80(%rsi),%r12 movq 88(%rsi),%r13 call __lshift_mod_384 movq $48,%rdx addq (%rsp),%rdx #ifdef __SGX_LVI_HARDENING__ lfence #endif call __add_mod_384_a_is_loaded movq 8(%rsp),%r15 movq 16(%rsp),%r14 movq 24(%rsp),%r13 movq 32(%rsp),%r12 movq 40(%rsp),%rbx movq 48(%rsp),%rbp leaq 56(%rsp),%rsp .LSEH_epilogue_mul_by_3_mod_384x: mov 8(%rsp),%rdi mov 16(%rsp),%rsi #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .LSEH_end_mul_by_3_mod_384x: .globl mul_by_8_mod_384x .def mul_by_8_mod_384x; .scl 2; .type 32; .endef .p2align 5 mul_by_8_mod_384x: .byte 0xf3,0x0f,0x1e,0xfa movq %rdi,8(%rsp) movq %rsi,16(%rsp) movq %rsp,%r11 .LSEH_begin_mul_by_8_mod_384x: pushq %rbp movq %rcx,%rdi movq %rdx,%rsi movq %r8,%rdx pushq %rbx pushq %r12 pushq %r13 pushq %r14 pushq %r15 pushq %rsi .LSEH_body_mul_by_8_mod_384x: #ifdef __SGX_LVI_HARDENING__ lfence #endif movq 0(%rsi),%r8 movq 8(%rsi),%r9 movq 16(%rsi),%r10 movq 24(%rsi),%r11 movq 32(%rsi),%r12 movq 40(%rsi),%r13 movq %rdx,%rcx call __lshift_mod_384 call __lshift_mod_384 call __lshift_mod_384 movq (%rsp),%rsi movq %r8,0(%rdi) movq %r9,8(%rdi) movq %r10,16(%rdi) movq %r11,24(%rdi) movq %r12,32(%rdi) movq %r13,40(%rdi) #ifdef __SGX_LVI_HARDENING__ lfence #endif movq 48+0(%rsi),%r8 movq 48+8(%rsi),%r9 movq 48+16(%rsi),%r10 movq 48+24(%rsi),%r11 movq 48+32(%rsi),%r12 movq 48+40(%rsi),%r13 call __lshift_mod_384 call __lshift_mod_384 call __lshift_mod_384 movq %r8,48+0(%rdi) movq %r9,48+8(%rdi) movq %r10,48+16(%rdi) movq %r11,48+24(%rdi) movq %r12,48+32(%rdi) movq %r13,48+40(%rdi) movq 8(%rsp),%r15 movq 16(%rsp),%r14 movq 24(%rsp),%r13 movq 32(%rsp),%r12 movq 40(%rsp),%rbx movq 48(%rsp),%rbp leaq 56(%rsp),%rsp .LSEH_epilogue_mul_by_8_mod_384x: mov 8(%rsp),%rdi mov 16(%rsp),%rsi #ifdef 
__SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .LSEH_end_mul_by_8_mod_384x: .globl cneg_mod_384 .def cneg_mod_384; .scl 2; .type 32; .endef .p2align 5 cneg_mod_384: .byte 0xf3,0x0f,0x1e,0xfa movq %rdi,8(%rsp) movq %rsi,16(%rsp) movq %rsp,%r11 .LSEH_begin_cneg_mod_384: pushq %rbp movq %rcx,%rdi movq %rdx,%rsi movq %r8,%rdx movq %r9,%rcx pushq %rbx pushq %r12 pushq %r13 pushq %r14 pushq %r15 pushq %rdx .LSEH_body_cneg_mod_384: #ifdef __SGX_LVI_HARDENING__ lfence #endif movq 0(%rsi),%rdx movq 8(%rsi),%r9 movq 16(%rsi),%r10 movq %rdx,%r8 movq 24(%rsi),%r11 orq %r9,%rdx movq 32(%rsi),%r12 orq %r10,%rdx movq 40(%rsi),%r13 orq %r11,%rdx movq $-1,%rsi orq %r12,%rdx orq %r13,%rdx movq 0(%rcx),%r14 cmovnzq %rsi,%rdx movq 8(%rcx),%r15 movq 16(%rcx),%rax andq %rdx,%r14 movq 24(%rcx),%rbx andq %rdx,%r15 movq 32(%rcx),%rbp andq %rdx,%rax movq 40(%rcx),%rsi andq %rdx,%rbx movq 0(%rsp),%rcx andq %rdx,%rbp andq %rdx,%rsi subq %r8,%r14 sbbq %r9,%r15 sbbq %r10,%rax sbbq %r11,%rbx sbbq %r12,%rbp sbbq %r13,%rsi orq %rcx,%rcx cmovzq %r8,%r14 cmovzq %r9,%r15 cmovzq %r10,%rax movq %r14,0(%rdi) cmovzq %r11,%rbx movq %r15,8(%rdi) cmovzq %r12,%rbp movq %rax,16(%rdi) cmovzq %r13,%rsi movq %rbx,24(%rdi) movq %rbp,32(%rdi) movq %rsi,40(%rdi) movq 8(%rsp),%r15 movq 16(%rsp),%r14 movq 24(%rsp),%r13 movq 32(%rsp),%r12 movq 40(%rsp),%rbx movq 48(%rsp),%rbp leaq 56(%rsp),%rsp .LSEH_epilogue_cneg_mod_384: mov 8(%rsp),%rdi mov 16(%rsp),%rsi #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .LSEH_end_cneg_mod_384: .globl sub_mod_384 .def sub_mod_384; .scl 2; .type 32; .endef .p2align 5 sub_mod_384: .byte 0xf3,0x0f,0x1e,0xfa movq %rdi,8(%rsp) movq %rsi,16(%rsp) movq %rsp,%r11 .LSEH_begin_sub_mod_384: pushq %rbp movq %rcx,%rdi movq %rdx,%rsi movq %r8,%rdx movq %r9,%rcx pushq %rbx pushq %r12 pushq %r13 pushq %r14 pushq %r15 subq $8,%rsp .LSEH_body_sub_mod_384: call __sub_mod_384 movq 8(%rsp),%r15 movq 16(%rsp),%r14 movq 24(%rsp),%r13 movq 32(%rsp),%r12 movq 40(%rsp),%rbx movq 48(%rsp),%rbp leaq 56(%rsp),%rsp .LSEH_epilogue_sub_mod_384: mov 8(%rsp),%rdi mov 16(%rsp),%rsi #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .LSEH_end_sub_mod_384: .def __sub_mod_384; .scl 3; .type 32; .endef .p2align 5 __sub_mod_384: .byte 0xf3,0x0f,0x1e,0xfa #ifdef __SGX_LVI_HARDENING__ lfence #endif movq 0(%rsi),%r8 movq 8(%rsi),%r9 movq 16(%rsi),%r10 movq 24(%rsi),%r11 movq 32(%rsi),%r12 movq 40(%rsi),%r13 subq 0(%rdx),%r8 movq 0(%rcx),%r14 sbbq 8(%rdx),%r9 movq 8(%rcx),%r15 sbbq 16(%rdx),%r10 movq 16(%rcx),%rax sbbq 24(%rdx),%r11 movq 24(%rcx),%rbx sbbq 32(%rdx),%r12 movq 32(%rcx),%rbp sbbq 40(%rdx),%r13 movq 40(%rcx),%rsi sbbq %rdx,%rdx andq %rdx,%r14 andq %rdx,%r15 andq %rdx,%rax andq %rdx,%rbx andq %rdx,%rbp andq %rdx,%rsi addq %r14,%r8 adcq %r15,%r9 movq %r8,0(%rdi) adcq %rax,%r10 movq %r9,8(%rdi) adcq %rbx,%r11 movq %r10,16(%rdi) adcq %rbp,%r12 movq %r11,24(%rdi) adcq %rsi,%r13 movq %r12,32(%rdi) movq %r13,40(%rdi) #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .globl sub_mod_384x .def sub_mod_384x; .scl 2; .type 32; .endef .p2align 5 sub_mod_384x: .byte 0xf3,0x0f,0x1e,0xfa movq %rdi,8(%rsp) movq %rsi,16(%rsp) movq %rsp,%r11 .LSEH_begin_sub_mod_384x: pushq %rbp movq %rcx,%rdi movq %rdx,%rsi movq %r8,%rdx movq %r9,%rcx pushq %rbx pushq %r12 pushq %r13 pushq %r14 pushq %r15 subq $24,%rsp .LSEH_body_sub_mod_384x: movq %rsi,0(%rsp) movq %rdx,8(%rsp) leaq 48(%rsi),%rsi leaq 48(%rdx),%rdx leaq 
48(%rdi),%rdi call __sub_mod_384 movq 0(%rsp),%rsi movq 8(%rsp),%rdx leaq -48(%rdi),%rdi call __sub_mod_384 movq 24+0(%rsp),%r15 movq 24+8(%rsp),%r14 movq 24+16(%rsp),%r13 movq 24+24(%rsp),%r12 movq 24+32(%rsp),%rbx movq 24+40(%rsp),%rbp leaq 24+48(%rsp),%rsp .LSEH_epilogue_sub_mod_384x: mov 8(%rsp),%rdi mov 16(%rsp),%rsi #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .LSEH_end_sub_mod_384x: .globl mul_by_1_plus_i_mod_384x .def mul_by_1_plus_i_mod_384x; .scl 2; .type 32; .endef .p2align 5 mul_by_1_plus_i_mod_384x: .byte 0xf3,0x0f,0x1e,0xfa movq %rdi,8(%rsp) movq %rsi,16(%rsp) movq %rsp,%r11 .LSEH_begin_mul_by_1_plus_i_mod_384x: pushq %rbp movq %rcx,%rdi movq %rdx,%rsi movq %r8,%rdx pushq %rbx pushq %r12 pushq %r13 pushq %r14 pushq %r15 subq $56,%rsp .LSEH_body_mul_by_1_plus_i_mod_384x: #ifdef __SGX_LVI_HARDENING__ lfence #endif movq 0(%rsi),%r8 movq 8(%rsi),%r9 movq 16(%rsi),%r10 movq 24(%rsi),%r11 movq 32(%rsi),%r12 movq 40(%rsi),%r13 movq %r8,%r14 addq 48(%rsi),%r8 movq %r9,%r15 adcq 56(%rsi),%r9 movq %r10,%rax adcq 64(%rsi),%r10 movq %r11,%rbx adcq 72(%rsi),%r11 movq %r12,%rcx adcq 80(%rsi),%r12 movq %r13,%rbp adcq 88(%rsi),%r13 movq %rdi,48(%rsp) sbbq %rdi,%rdi subq 48(%rsi),%r14 sbbq 56(%rsi),%r15 sbbq 64(%rsi),%rax sbbq 72(%rsi),%rbx sbbq 80(%rsi),%rcx sbbq 88(%rsi),%rbp sbbq %rsi,%rsi movq %r8,0(%rsp) movq 0(%rdx),%r8 movq %r9,8(%rsp) movq 8(%rdx),%r9 movq %r10,16(%rsp) movq 16(%rdx),%r10 movq %r11,24(%rsp) movq 24(%rdx),%r11 movq %r12,32(%rsp) andq %rsi,%r8 movq 32(%rdx),%r12 movq %r13,40(%rsp) andq %rsi,%r9 movq 40(%rdx),%r13 andq %rsi,%r10 andq %rsi,%r11 andq %rsi,%r12 andq %rsi,%r13 movq 48(%rsp),%rsi addq %r8,%r14 movq 0(%rsp),%r8 adcq %r9,%r15 movq 8(%rsp),%r9 adcq %r10,%rax movq 16(%rsp),%r10 adcq %r11,%rbx movq 24(%rsp),%r11 adcq %r12,%rcx movq 32(%rsp),%r12 adcq %r13,%rbp movq 40(%rsp),%r13 movq %r14,0(%rsi) movq %r8,%r14 movq %r15,8(%rsi) movq %rax,16(%rsi) movq %r9,%r15 movq %rbx,24(%rsi) movq %rcx,32(%rsi) movq %r10,%rax movq %rbp,40(%rsi) subq 0(%rdx),%r8 movq %r11,%rbx sbbq 8(%rdx),%r9 sbbq 16(%rdx),%r10 movq %r12,%rcx sbbq 24(%rdx),%r11 sbbq 32(%rdx),%r12 movq %r13,%rbp sbbq 40(%rdx),%r13 sbbq $0,%rdi cmovcq %r14,%r8 cmovcq %r15,%r9 cmovcq %rax,%r10 movq %r8,48(%rsi) cmovcq %rbx,%r11 movq %r9,56(%rsi) cmovcq %rcx,%r12 movq %r10,64(%rsi) cmovcq %rbp,%r13 movq %r11,72(%rsi) movq %r12,80(%rsi) movq %r13,88(%rsi) movq 56+0(%rsp),%r15 movq 56+8(%rsp),%r14 movq 56+16(%rsp),%r13 movq 56+24(%rsp),%r12 movq 56+32(%rsp),%rbx movq 56+40(%rsp),%rbp leaq 56+48(%rsp),%rsp .LSEH_epilogue_mul_by_1_plus_i_mod_384x: mov 8(%rsp),%rdi mov 16(%rsp),%rsi #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .LSEH_end_mul_by_1_plus_i_mod_384x: .globl sgn0_pty_mod_384 .def sgn0_pty_mod_384; .scl 2; .type 32; .endef .p2align 5 sgn0_pty_mod_384: .byte 0xf3,0x0f,0x1e,0xfa movq %rdi,8(%rsp) movq %rsi,16(%rsp) movq %rsp,%r11 .LSEH_begin_sgn0_pty_mod_384: movq %rcx,%rdi movq %rdx,%rsi .LSEH_body_sgn0_pty_mod_384: #ifdef __SGX_LVI_HARDENING__ lfence #endif movq 0(%rdi),%r8 movq 8(%rdi),%r9 movq 16(%rdi),%r10 movq 24(%rdi),%r11 movq 32(%rdi),%rcx movq 40(%rdi),%rdx xorq %rax,%rax movq %r8,%rdi addq %r8,%r8 adcq %r9,%r9 adcq %r10,%r10 adcq %r11,%r11 adcq %rcx,%rcx adcq %rdx,%rdx adcq $0,%rax subq 0(%rsi),%r8 sbbq 8(%rsi),%r9 sbbq 16(%rsi),%r10 sbbq 24(%rsi),%r11 sbbq 32(%rsi),%rcx sbbq 40(%rsi),%rdx sbbq $0,%rax notq %rax andq $1,%rdi andq $2,%rax orq %rdi,%rax .LSEH_epilogue_sgn0_pty_mod_384: mov 8(%rsp),%rdi mov 
16(%rsp),%rsi #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .LSEH_end_sgn0_pty_mod_384: .globl sgn0_pty_mod_384x .def sgn0_pty_mod_384x; .scl 2; .type 32; .endef .p2align 5 sgn0_pty_mod_384x: .byte 0xf3,0x0f,0x1e,0xfa movq %rdi,8(%rsp) movq %rsi,16(%rsp) movq %rsp,%r11 .LSEH_begin_sgn0_pty_mod_384x: pushq %rbp movq %rcx,%rdi movq %rdx,%rsi pushq %rbx subq $8,%rsp .LSEH_body_sgn0_pty_mod_384x: #ifdef __SGX_LVI_HARDENING__ lfence #endif movq 48(%rdi),%r8 movq 56(%rdi),%r9 movq 64(%rdi),%r10 movq 72(%rdi),%r11 movq 80(%rdi),%rcx movq 88(%rdi),%rdx movq %r8,%rbx orq %r9,%r8 orq %r10,%r8 orq %r11,%r8 orq %rcx,%r8 orq %rdx,%r8 leaq 0(%rdi),%rax xorq %rdi,%rdi movq %rbx,%rbp addq %rbx,%rbx adcq %r9,%r9 adcq %r10,%r10 adcq %r11,%r11 adcq %rcx,%rcx adcq %rdx,%rdx adcq $0,%rdi subq 0(%rsi),%rbx sbbq 8(%rsi),%r9 sbbq 16(%rsi),%r10 sbbq 24(%rsi),%r11 sbbq 32(%rsi),%rcx sbbq 40(%rsi),%rdx sbbq $0,%rdi movq %r8,0(%rsp) notq %rdi andq $1,%rbp andq $2,%rdi orq %rbp,%rdi movq 0(%rax),%r8 movq 8(%rax),%r9 movq 16(%rax),%r10 movq 24(%rax),%r11 movq 32(%rax),%rcx movq 40(%rax),%rdx movq %r8,%rbx orq %r9,%r8 orq %r10,%r8 orq %r11,%r8 orq %rcx,%r8 orq %rdx,%r8 xorq %rax,%rax movq %rbx,%rbp addq %rbx,%rbx adcq %r9,%r9 adcq %r10,%r10 adcq %r11,%r11 adcq %rcx,%rcx adcq %rdx,%rdx adcq $0,%rax subq 0(%rsi),%rbx sbbq 8(%rsi),%r9 sbbq 16(%rsi),%r10 sbbq 24(%rsi),%r11 sbbq 32(%rsi),%rcx sbbq 40(%rsi),%rdx sbbq $0,%rax movq 0(%rsp),%rbx notq %rax testq %r8,%r8 cmovzq %rdi,%rbp testq %rbx,%rbx cmovnzq %rdi,%rax andq $1,%rbp andq $2,%rax orq %rbp,%rax movq 8(%rsp),%rbx movq 16(%rsp),%rbp leaq 24(%rsp),%rsp .LSEH_epilogue_sgn0_pty_mod_384x: mov 8(%rsp),%rdi mov 16(%rsp),%rsi #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .LSEH_end_sgn0_pty_mod_384x: .globl vec_select_32 .def vec_select_32; .scl 2; .type 32; .endef .p2align 5 vec_select_32: .byte 0xf3,0x0f,0x1e,0xfa movd %r9d,%xmm5 pxor %xmm4,%xmm4 pshufd $0,%xmm5,%xmm5 #ifdef __SGX_LVI_HARDENING__ lfence #endif movdqu (%rdx),%xmm0 leaq 16(%rdx),%rdx pcmpeqd %xmm4,%xmm5 movdqu (%r8),%xmm1 leaq 16(%r8),%r8 pcmpeqd %xmm5,%xmm4 leaq 16(%rcx),%rcx pand %xmm4,%xmm0 movdqu 0+16-16(%rdx),%xmm2 pand %xmm5,%xmm1 movdqu 0+16-16(%r8),%xmm3 por %xmm1,%xmm0 movdqu %xmm0,0-16(%rcx) pand %xmm4,%xmm2 pand %xmm5,%xmm3 por %xmm3,%xmm2 movdqu %xmm2,16-16(%rcx) #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .globl vec_select_48 .def vec_select_48; .scl 2; .type 32; .endef .p2align 5 vec_select_48: .byte 0xf3,0x0f,0x1e,0xfa movd %r9d,%xmm5 pxor %xmm4,%xmm4 pshufd $0,%xmm5,%xmm5 #ifdef __SGX_LVI_HARDENING__ lfence #endif movdqu (%rdx),%xmm0 leaq 24(%rdx),%rdx pcmpeqd %xmm4,%xmm5 movdqu (%r8),%xmm1 leaq 24(%r8),%r8 pcmpeqd %xmm5,%xmm4 leaq 24(%rcx),%rcx pand %xmm4,%xmm0 movdqu 0+16-24(%rdx),%xmm2 pand %xmm5,%xmm1 movdqu 0+16-24(%r8),%xmm3 por %xmm1,%xmm0 movdqu %xmm0,0-24(%rcx) pand %xmm4,%xmm2 movdqu 16+16-24(%rdx),%xmm0 pand %xmm5,%xmm3 movdqu 16+16-24(%r8),%xmm1 por %xmm3,%xmm2 movdqu %xmm2,16-24(%rcx) pand %xmm4,%xmm0 pand %xmm5,%xmm1 por %xmm1,%xmm0 movdqu %xmm0,32-24(%rcx) #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .globl vec_select_96 .def vec_select_96; .scl 2; .type 32; .endef .p2align 5 vec_select_96: .byte 0xf3,0x0f,0x1e,0xfa movd %r9d,%xmm5 pxor %xmm4,%xmm4 pshufd $0,%xmm5,%xmm5 #ifdef __SGX_LVI_HARDENING__ lfence #endif movdqu (%rdx),%xmm0 leaq 48(%rdx),%rdx pcmpeqd %xmm4,%xmm5 movdqu (%r8),%xmm1 leaq 
48(%r8),%r8 pcmpeqd %xmm5,%xmm4 leaq 48(%rcx),%rcx pand %xmm4,%xmm0 movdqu 0+16-48(%rdx),%xmm2 pand %xmm5,%xmm1 movdqu 0+16-48(%r8),%xmm3 por %xmm1,%xmm0 movdqu %xmm0,0-48(%rcx) pand %xmm4,%xmm2 movdqu 16+16-48(%rdx),%xmm0 pand %xmm5,%xmm3 movdqu 16+16-48(%r8),%xmm1 por %xmm3,%xmm2 movdqu %xmm2,16-48(%rcx) pand %xmm4,%xmm0 movdqu 32+16-48(%rdx),%xmm2 pand %xmm5,%xmm1 movdqu 32+16-48(%r8),%xmm3 por %xmm1,%xmm0 movdqu %xmm0,32-48(%rcx) pand %xmm4,%xmm2 movdqu 48+16-48(%rdx),%xmm0 pand %xmm5,%xmm3 movdqu 48+16-48(%r8),%xmm1 por %xmm3,%xmm2 movdqu %xmm2,48-48(%rcx) pand %xmm4,%xmm0 movdqu 64+16-48(%rdx),%xmm2 pand %xmm5,%xmm1 movdqu 64+16-48(%r8),%xmm3 por %xmm1,%xmm0 movdqu %xmm0,64-48(%rcx) pand %xmm4,%xmm2 pand %xmm5,%xmm3 por %xmm3,%xmm2 movdqu %xmm2,80-48(%rcx) #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .globl vec_select_192 .def vec_select_192; .scl 2; .type 32; .endef .p2align 5 vec_select_192: .byte 0xf3,0x0f,0x1e,0xfa movd %r9d,%xmm5 pxor %xmm4,%xmm4 pshufd $0,%xmm5,%xmm5 #ifdef __SGX_LVI_HARDENING__ lfence #endif movdqu (%rdx),%xmm0 leaq 96(%rdx),%rdx pcmpeqd %xmm4,%xmm5 movdqu (%r8),%xmm1 leaq 96(%r8),%r8 pcmpeqd %xmm5,%xmm4 leaq 96(%rcx),%rcx pand %xmm4,%xmm0 movdqu 0+16-96(%rdx),%xmm2 pand %xmm5,%xmm1 movdqu 0+16-96(%r8),%xmm3 por %xmm1,%xmm0 movdqu %xmm0,0-96(%rcx) pand %xmm4,%xmm2 movdqu 16+16-96(%rdx),%xmm0 pand %xmm5,%xmm3 movdqu 16+16-96(%r8),%xmm1 por %xmm3,%xmm2 movdqu %xmm2,16-96(%rcx) pand %xmm4,%xmm0 movdqu 32+16-96(%rdx),%xmm2 pand %xmm5,%xmm1 movdqu 32+16-96(%r8),%xmm3 por %xmm1,%xmm0 movdqu %xmm0,32-96(%rcx) pand %xmm4,%xmm2 movdqu 48+16-96(%rdx),%xmm0 pand %xmm5,%xmm3 movdqu 48+16-96(%r8),%xmm1 por %xmm3,%xmm2 movdqu %xmm2,48-96(%rcx) pand %xmm4,%xmm0 movdqu 64+16-96(%rdx),%xmm2 pand %xmm5,%xmm1 movdqu 64+16-96(%r8),%xmm3 por %xmm1,%xmm0 movdqu %xmm0,64-96(%rcx) pand %xmm4,%xmm2 movdqu 80+16-96(%rdx),%xmm0 pand %xmm5,%xmm3 movdqu 80+16-96(%r8),%xmm1 por %xmm3,%xmm2 movdqu %xmm2,80-96(%rcx) pand %xmm4,%xmm0 movdqu 96+16-96(%rdx),%xmm2 pand %xmm5,%xmm1 movdqu 96+16-96(%r8),%xmm3 por %xmm1,%xmm0 movdqu %xmm0,96-96(%rcx) pand %xmm4,%xmm2 movdqu 112+16-96(%rdx),%xmm0 pand %xmm5,%xmm3 movdqu 112+16-96(%r8),%xmm1 por %xmm3,%xmm2 movdqu %xmm2,112-96(%rcx) pand %xmm4,%xmm0 movdqu 128+16-96(%rdx),%xmm2 pand %xmm5,%xmm1 movdqu 128+16-96(%r8),%xmm3 por %xmm1,%xmm0 movdqu %xmm0,128-96(%rcx) pand %xmm4,%xmm2 movdqu 144+16-96(%rdx),%xmm0 pand %xmm5,%xmm3 movdqu 144+16-96(%r8),%xmm1 por %xmm3,%xmm2 movdqu %xmm2,144-96(%rcx) pand %xmm4,%xmm0 movdqu 160+16-96(%rdx),%xmm2 pand %xmm5,%xmm1 movdqu 160+16-96(%r8),%xmm3 por %xmm1,%xmm0 movdqu %xmm0,160-96(%rcx) pand %xmm4,%xmm2 pand %xmm5,%xmm3 por %xmm3,%xmm2 movdqu %xmm2,176-96(%rcx) #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .globl vec_select_144 .def vec_select_144; .scl 2; .type 32; .endef .p2align 5 vec_select_144: .byte 0xf3,0x0f,0x1e,0xfa movd %r9d,%xmm5 pxor %xmm4,%xmm4 pshufd $0,%xmm5,%xmm5 #ifdef __SGX_LVI_HARDENING__ lfence #endif movdqu (%rdx),%xmm0 leaq 72(%rdx),%rdx pcmpeqd %xmm4,%xmm5 movdqu (%r8),%xmm1 leaq 72(%r8),%r8 pcmpeqd %xmm5,%xmm4 leaq 72(%rcx),%rcx pand %xmm4,%xmm0 movdqu 0+16-72(%rdx),%xmm2 pand %xmm5,%xmm1 movdqu 0+16-72(%r8),%xmm3 por %xmm1,%xmm0 movdqu %xmm0,0-72(%rcx) pand %xmm4,%xmm2 movdqu 16+16-72(%rdx),%xmm0 pand %xmm5,%xmm3 movdqu 16+16-72(%r8),%xmm1 por %xmm3,%xmm2 movdqu %xmm2,16-72(%rcx) pand %xmm4,%xmm0 movdqu 32+16-72(%rdx),%xmm2 pand %xmm5,%xmm1 movdqu 32+16-72(%r8),%xmm3 por %xmm1,%xmm0 movdqu 
%xmm0,32-72(%rcx) pand %xmm4,%xmm2 movdqu 48+16-72(%rdx),%xmm0 pand %xmm5,%xmm3 movdqu 48+16-72(%r8),%xmm1 por %xmm3,%xmm2 movdqu %xmm2,48-72(%rcx) pand %xmm4,%xmm0 movdqu 64+16-72(%rdx),%xmm2 pand %xmm5,%xmm1 movdqu 64+16-72(%r8),%xmm3 por %xmm1,%xmm0 movdqu %xmm0,64-72(%rcx) pand %xmm4,%xmm2 movdqu 80+16-72(%rdx),%xmm0 pand %xmm5,%xmm3 movdqu 80+16-72(%r8),%xmm1 por %xmm3,%xmm2 movdqu %xmm2,80-72(%rcx) pand %xmm4,%xmm0 movdqu 96+16-72(%rdx),%xmm2 pand %xmm5,%xmm1 movdqu 96+16-72(%r8),%xmm3 por %xmm1,%xmm0 movdqu %xmm0,96-72(%rcx) pand %xmm4,%xmm2 movdqu 112+16-72(%rdx),%xmm0 pand %xmm5,%xmm3 movdqu 112+16-72(%r8),%xmm1 por %xmm3,%xmm2 movdqu %xmm2,112-72(%rcx) pand %xmm4,%xmm0 pand %xmm5,%xmm1 por %xmm1,%xmm0 movdqu %xmm0,128-72(%rcx) #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .globl vec_select_288 .def vec_select_288; .scl 2; .type 32; .endef .p2align 5 vec_select_288: .byte 0xf3,0x0f,0x1e,0xfa movd %r9d,%xmm5 pxor %xmm4,%xmm4 pshufd $0,%xmm5,%xmm5 #ifdef __SGX_LVI_HARDENING__ lfence #endif movdqu (%rdx),%xmm0 leaq 144(%rdx),%rdx pcmpeqd %xmm4,%xmm5 movdqu (%r8),%xmm1 leaq 144(%r8),%r8 pcmpeqd %xmm5,%xmm4 leaq 144(%rcx),%rcx pand %xmm4,%xmm0 movdqu 0+16-144(%rdx),%xmm2 pand %xmm5,%xmm1 movdqu 0+16-144(%r8),%xmm3 por %xmm1,%xmm0 movdqu %xmm0,0-144(%rcx) pand %xmm4,%xmm2 movdqu 16+16-144(%rdx),%xmm0 pand %xmm5,%xmm3 movdqu 16+16-144(%r8),%xmm1 por %xmm3,%xmm2 movdqu %xmm2,16-144(%rcx) pand %xmm4,%xmm0 movdqu 32+16-144(%rdx),%xmm2 pand %xmm5,%xmm1 movdqu 32+16-144(%r8),%xmm3 por %xmm1,%xmm0 movdqu %xmm0,32-144(%rcx) pand %xmm4,%xmm2 movdqu 48+16-144(%rdx),%xmm0 pand %xmm5,%xmm3 movdqu 48+16-144(%r8),%xmm1 por %xmm3,%xmm2 movdqu %xmm2,48-144(%rcx) pand %xmm4,%xmm0 movdqu 64+16-144(%rdx),%xmm2 pand %xmm5,%xmm1 movdqu 64+16-144(%r8),%xmm3 por %xmm1,%xmm0 movdqu %xmm0,64-144(%rcx) pand %xmm4,%xmm2 movdqu 80+16-144(%rdx),%xmm0 pand %xmm5,%xmm3 movdqu 80+16-144(%r8),%xmm1 por %xmm3,%xmm2 movdqu %xmm2,80-144(%rcx) pand %xmm4,%xmm0 movdqu 96+16-144(%rdx),%xmm2 pand %xmm5,%xmm1 movdqu 96+16-144(%r8),%xmm3 por %xmm1,%xmm0 movdqu %xmm0,96-144(%rcx) pand %xmm4,%xmm2 movdqu 112+16-144(%rdx),%xmm0 pand %xmm5,%xmm3 movdqu 112+16-144(%r8),%xmm1 por %xmm3,%xmm2 movdqu %xmm2,112-144(%rcx) pand %xmm4,%xmm0 movdqu 128+16-144(%rdx),%xmm2 pand %xmm5,%xmm1 movdqu 128+16-144(%r8),%xmm3 por %xmm1,%xmm0 movdqu %xmm0,128-144(%rcx) pand %xmm4,%xmm2 movdqu 144+16-144(%rdx),%xmm0 pand %xmm5,%xmm3 movdqu 144+16-144(%r8),%xmm1 por %xmm3,%xmm2 movdqu %xmm2,144-144(%rcx) pand %xmm4,%xmm0 movdqu 160+16-144(%rdx),%xmm2 pand %xmm5,%xmm1 movdqu 160+16-144(%r8),%xmm3 por %xmm1,%xmm0 movdqu %xmm0,160-144(%rcx) pand %xmm4,%xmm2 movdqu 176+16-144(%rdx),%xmm0 pand %xmm5,%xmm3 movdqu 176+16-144(%r8),%xmm1 por %xmm3,%xmm2 movdqu %xmm2,176-144(%rcx) pand %xmm4,%xmm0 movdqu 192+16-144(%rdx),%xmm2 pand %xmm5,%xmm1 movdqu 192+16-144(%r8),%xmm3 por %xmm1,%xmm0 movdqu %xmm0,192-144(%rcx) pand %xmm4,%xmm2 movdqu 208+16-144(%rdx),%xmm0 pand %xmm5,%xmm3 movdqu 208+16-144(%r8),%xmm1 por %xmm3,%xmm2 movdqu %xmm2,208-144(%rcx) pand %xmm4,%xmm0 movdqu 224+16-144(%rdx),%xmm2 pand %xmm5,%xmm1 movdqu 224+16-144(%r8),%xmm3 por %xmm1,%xmm0 movdqu %xmm0,224-144(%rcx) pand %xmm4,%xmm2 movdqu 240+16-144(%rdx),%xmm0 pand %xmm5,%xmm3 movdqu 240+16-144(%r8),%xmm1 por %xmm3,%xmm2 movdqu %xmm2,240-144(%rcx) pand %xmm4,%xmm0 movdqu 256+16-144(%rdx),%xmm2 pand %xmm5,%xmm1 movdqu 256+16-144(%r8),%xmm3 por %xmm1,%xmm0 movdqu %xmm0,256-144(%rcx) pand %xmm4,%xmm2 pand %xmm5,%xmm3 por %xmm3,%xmm2 movdqu 
%xmm2,272-144(%rcx) #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .globl vec_prefetch .def vec_prefetch; .scl 2; .type 32; .endef .p2align 5 vec_prefetch: .byte 0xf3,0x0f,0x1e,0xfa leaq -1(%rcx,%rdx,1),%rdx movq $64,%rax xorq %r8,%r8 #ifdef __SGX_LVI_HARDENING__ lfence #endif prefetchnta (%rcx) leaq (%rcx,%rax,1),%rcx cmpq %rdx,%rcx cmovaq %rdx,%rcx cmovaq %r8,%rax prefetchnta (%rcx) leaq (%rcx,%rax,1),%rcx cmpq %rdx,%rcx cmovaq %rdx,%rcx cmovaq %r8,%rax prefetchnta (%rcx) leaq (%rcx,%rax,1),%rcx cmpq %rdx,%rcx cmovaq %rdx,%rcx cmovaq %r8,%rax prefetchnta (%rcx) leaq (%rcx,%rax,1),%rcx cmpq %rdx,%rcx cmovaq %rdx,%rcx cmovaq %r8,%rax prefetchnta (%rcx) leaq (%rcx,%rax,1),%rcx cmpq %rdx,%rcx cmovaq %rdx,%rcx cmovaq %r8,%rax prefetchnta (%rcx) leaq (%rcx,%rax,1),%rcx cmpq %rdx,%rcx cmovaq %rdx,%rcx prefetchnta (%rcx) #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .globl vec_is_zero_16x .def vec_is_zero_16x; .scl 2; .type 32; .endef .p2align 5 vec_is_zero_16x: .byte 0xf3,0x0f,0x1e,0xfa shrl $4,%edx #ifdef __SGX_LVI_HARDENING__ lfence #endif movdqu (%rcx),%xmm0 leaq 16(%rcx),%rcx .Loop_is_zero: decl %edx jz .Loop_is_zero_done movdqu (%rcx),%xmm1 leaq 16(%rcx),%rcx por %xmm1,%xmm0 jmp .Loop_is_zero .Loop_is_zero_done: pshufd $0x4e,%xmm0,%xmm1 por %xmm1,%xmm0 .byte 102,72,15,126,192 incl %edx testq %rax,%rax cmovnzl %edx,%eax xorl $1,%eax #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .globl vec_is_equal_16x .def vec_is_equal_16x; .scl 2; .type 32; .endef .p2align 5 vec_is_equal_16x: .byte 0xf3,0x0f,0x1e,0xfa shrl $4,%r8d #ifdef __SGX_LVI_HARDENING__ lfence #endif movdqu (%rcx),%xmm0 movdqu (%rdx),%xmm1 subq %rcx,%rdx leaq 16(%rcx),%rcx pxor %xmm1,%xmm0 .Loop_is_equal: decl %r8d jz .Loop_is_equal_done movdqu (%rcx),%xmm1 movdqu (%rcx,%rdx,1),%xmm2 leaq 16(%rcx),%rcx pxor %xmm2,%xmm1 por %xmm1,%xmm0 jmp .Loop_is_equal .Loop_is_equal_done: pshufd $0x4e,%xmm0,%xmm1 por %xmm1,%xmm0 .byte 102,72,15,126,192 incl %r8d testq %rax,%rax cmovnzl %r8d,%eax xorl $1,%eax #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .section .pdata .p2align 2 .rva .LSEH_begin_add_mod_384 .rva .LSEH_body_add_mod_384 .rva .LSEH_info_add_mod_384_prologue .rva .LSEH_body_add_mod_384 .rva .LSEH_epilogue_add_mod_384 .rva .LSEH_info_add_mod_384_body .rva .LSEH_epilogue_add_mod_384 .rva .LSEH_end_add_mod_384 .rva .LSEH_info_add_mod_384_epilogue .rva .LSEH_begin_add_mod_384x .rva .LSEH_body_add_mod_384x .rva .LSEH_info_add_mod_384x_prologue .rva .LSEH_body_add_mod_384x .rva .LSEH_epilogue_add_mod_384x .rva .LSEH_info_add_mod_384x_body .rva .LSEH_epilogue_add_mod_384x .rva .LSEH_end_add_mod_384x .rva .LSEH_info_add_mod_384x_epilogue .rva .LSEH_begin_rshift_mod_384 .rva .LSEH_body_rshift_mod_384 .rva .LSEH_info_rshift_mod_384_prologue .rva .LSEH_body_rshift_mod_384 .rva .LSEH_epilogue_rshift_mod_384 .rva .LSEH_info_rshift_mod_384_body .rva .LSEH_epilogue_rshift_mod_384 .rva .LSEH_end_rshift_mod_384 .rva .LSEH_info_rshift_mod_384_epilogue .rva .LSEH_begin_div_by_2_mod_384 .rva .LSEH_body_div_by_2_mod_384 .rva .LSEH_info_div_by_2_mod_384_prologue .rva .LSEH_body_div_by_2_mod_384 .rva .LSEH_epilogue_div_by_2_mod_384 .rva .LSEH_info_div_by_2_mod_384_body .rva .LSEH_epilogue_div_by_2_mod_384 .rva .LSEH_end_div_by_2_mod_384 .rva .LSEH_info_div_by_2_mod_384_epilogue .rva .LSEH_begin_lshift_mod_384 .rva .LSEH_body_lshift_mod_384 .rva .LSEH_info_lshift_mod_384_prologue .rva 
.LSEH_body_lshift_mod_384 .rva .LSEH_epilogue_lshift_mod_384 .rva .LSEH_info_lshift_mod_384_body .rva .LSEH_epilogue_lshift_mod_384 .rva .LSEH_end_lshift_mod_384 .rva .LSEH_info_lshift_mod_384_epilogue .rva .LSEH_begin_mul_by_3_mod_384 .rva .LSEH_body_mul_by_3_mod_384 .rva .LSEH_info_mul_by_3_mod_384_prologue .rva .LSEH_body_mul_by_3_mod_384 .rva .LSEH_epilogue_mul_by_3_mod_384 .rva .LSEH_info_mul_by_3_mod_384_body .rva .LSEH_epilogue_mul_by_3_mod_384 .rva .LSEH_end_mul_by_3_mod_384 .rva .LSEH_info_mul_by_3_mod_384_epilogue .rva .LSEH_begin_mul_by_8_mod_384 .rva .LSEH_body_mul_by_8_mod_384 .rva .LSEH_info_mul_by_8_mod_384_prologue .rva .LSEH_body_mul_by_8_mod_384 .rva .LSEH_epilogue_mul_by_8_mod_384 .rva .LSEH_info_mul_by_8_mod_384_body .rva .LSEH_epilogue_mul_by_8_mod_384 .rva .LSEH_end_mul_by_8_mod_384 .rva .LSEH_info_mul_by_8_mod_384_epilogue .rva .LSEH_begin_mul_by_3_mod_384x .rva .LSEH_body_mul_by_3_mod_384x .rva .LSEH_info_mul_by_3_mod_384x_prologue .rva .LSEH_body_mul_by_3_mod_384x .rva .LSEH_epilogue_mul_by_3_mod_384x .rva .LSEH_info_mul_by_3_mod_384x_body .rva .LSEH_epilogue_mul_by_3_mod_384x .rva .LSEH_end_mul_by_3_mod_384x .rva .LSEH_info_mul_by_3_mod_384x_epilogue .rva .LSEH_begin_mul_by_8_mod_384x .rva .LSEH_body_mul_by_8_mod_384x .rva .LSEH_info_mul_by_8_mod_384x_prologue .rva .LSEH_body_mul_by_8_mod_384x .rva .LSEH_epilogue_mul_by_8_mod_384x .rva .LSEH_info_mul_by_8_mod_384x_body .rva .LSEH_epilogue_mul_by_8_mod_384x .rva .LSEH_end_mul_by_8_mod_384x .rva .LSEH_info_mul_by_8_mod_384x_epilogue .rva .LSEH_begin_cneg_mod_384 .rva .LSEH_body_cneg_mod_384 .rva .LSEH_info_cneg_mod_384_prologue .rva .LSEH_body_cneg_mod_384 .rva .LSEH_epilogue_cneg_mod_384 .rva .LSEH_info_cneg_mod_384_body .rva .LSEH_epilogue_cneg_mod_384 .rva .LSEH_end_cneg_mod_384 .rva .LSEH_info_cneg_mod_384_epilogue .rva .LSEH_begin_sub_mod_384 .rva .LSEH_body_sub_mod_384 .rva .LSEH_info_sub_mod_384_prologue .rva .LSEH_body_sub_mod_384 .rva .LSEH_epilogue_sub_mod_384 .rva .LSEH_info_sub_mod_384_body .rva .LSEH_epilogue_sub_mod_384 .rva .LSEH_end_sub_mod_384 .rva .LSEH_info_sub_mod_384_epilogue .rva .LSEH_begin_sub_mod_384x .rva .LSEH_body_sub_mod_384x .rva .LSEH_info_sub_mod_384x_prologue .rva .LSEH_body_sub_mod_384x .rva .LSEH_epilogue_sub_mod_384x .rva .LSEH_info_sub_mod_384x_body .rva .LSEH_epilogue_sub_mod_384x .rva .LSEH_end_sub_mod_384x .rva .LSEH_info_sub_mod_384x_epilogue .rva .LSEH_begin_mul_by_1_plus_i_mod_384x .rva .LSEH_body_mul_by_1_plus_i_mod_384x .rva .LSEH_info_mul_by_1_plus_i_mod_384x_prologue .rva .LSEH_body_mul_by_1_plus_i_mod_384x .rva .LSEH_epilogue_mul_by_1_plus_i_mod_384x .rva .LSEH_info_mul_by_1_plus_i_mod_384x_body .rva .LSEH_epilogue_mul_by_1_plus_i_mod_384x .rva .LSEH_end_mul_by_1_plus_i_mod_384x .rva .LSEH_info_mul_by_1_plus_i_mod_384x_epilogue .rva .LSEH_begin_sgn0_pty_mod_384 .rva .LSEH_body_sgn0_pty_mod_384 .rva .LSEH_info_sgn0_pty_mod_384_prologue .rva .LSEH_body_sgn0_pty_mod_384 .rva .LSEH_epilogue_sgn0_pty_mod_384 .rva .LSEH_info_sgn0_pty_mod_384_body .rva .LSEH_epilogue_sgn0_pty_mod_384 .rva .LSEH_end_sgn0_pty_mod_384 .rva .LSEH_info_sgn0_pty_mod_384_epilogue .rva .LSEH_begin_sgn0_pty_mod_384x .rva .LSEH_body_sgn0_pty_mod_384x .rva .LSEH_info_sgn0_pty_mod_384x_prologue .rva .LSEH_body_sgn0_pty_mod_384x .rva .LSEH_epilogue_sgn0_pty_mod_384x .rva .LSEH_info_sgn0_pty_mod_384x_body .rva .LSEH_epilogue_sgn0_pty_mod_384x .rva .LSEH_end_sgn0_pty_mod_384x .rva .LSEH_info_sgn0_pty_mod_384x_epilogue .section .xdata .p2align 3 .LSEH_info_add_mod_384_prologue: .byte 1,0,5,0x0b .byte 
0,0x74,1,0 .byte 0,0x64,2,0 .byte 0,0xb3 .byte 0,0 .long 0,0 .LSEH_info_add_mod_384_body: .byte 1,0,17,0 .byte 0x00,0xf4,0x01,0x00 .byte 0x00,0xe4,0x02,0x00 .byte 0x00,0xd4,0x03,0x00 .byte 0x00,0xc4,0x04,0x00 .byte 0x00,0x34,0x05,0x00 .byte 0x00,0x54,0x06,0x00 .byte 0x00,0x74,0x08,0x00 .byte 0x00,0x64,0x09,0x00 .byte 0x00,0x62 .byte 0x00,0x00,0x00,0x00,0x00,0x00 .byte 0x00,0x00,0x00,0x00 .LSEH_info_add_mod_384_epilogue: .byte 1,0,4,0 .byte 0x00,0x74,0x01,0x00 .byte 0x00,0x64,0x02,0x00 .byte 0x00,0x00,0x00,0x00 .LSEH_info_add_mod_384x_prologue: .byte 1,0,5,0x0b .byte 0,0x74,1,0 .byte 0,0x64,2,0 .byte 0,0xb3 .byte 0,0 .long 0,0 .LSEH_info_add_mod_384x_body: .byte 1,0,17,0 .byte 0x00,0xf4,0x03,0x00 .byte 0x00,0xe4,0x04,0x00 .byte 0x00,0xd4,0x05,0x00 .byte 0x00,0xc4,0x06,0x00 .byte 0x00,0x34,0x07,0x00 .byte 0x00,0x54,0x08,0x00 .byte 0x00,0x74,0x0a,0x00 .byte 0x00,0x64,0x0b,0x00 .byte 0x00,0x82 .byte 0x00,0x00,0x00,0x00,0x00,0x00 .byte 0x00,0x00,0x00,0x00 .LSEH_info_add_mod_384x_epilogue: .byte 1,0,4,0 .byte 0x00,0x74,0x01,0x00 .byte 0x00,0x64,0x02,0x00 .byte 0x00,0x00,0x00,0x00 .LSEH_info_rshift_mod_384_prologue: .byte 1,0,5,0x0b .byte 0,0x74,1,0 .byte 0,0x64,2,0 .byte 0,0xb3 .byte 0,0 .long 0,0 .LSEH_info_rshift_mod_384_body: .byte 1,0,17,0 .byte 0x00,0xf4,0x01,0x00 .byte 0x00,0xe4,0x02,0x00 .byte 0x00,0xd4,0x03,0x00 .byte 0x00,0xc4,0x04,0x00 .byte 0x00,0x34,0x05,0x00 .byte 0x00,0x54,0x06,0x00 .byte 0x00,0x74,0x08,0x00 .byte 0x00,0x64,0x09,0x00 .byte 0x00,0x62 .byte 0x00,0x00,0x00,0x00,0x00,0x00 .byte 0x00,0x00,0x00,0x00 .LSEH_info_rshift_mod_384_epilogue: .byte 1,0,4,0 .byte 0x00,0x74,0x01,0x00 .byte 0x00,0x64,0x02,0x00 .byte 0x00,0x00,0x00,0x00 .LSEH_info_div_by_2_mod_384_prologue: .byte 1,0,5,0x0b .byte 0,0x74,1,0 .byte 0,0x64,2,0 .byte 0,0xb3 .byte 0,0 .long 0,0 .LSEH_info_div_by_2_mod_384_body: .byte 1,0,17,0 .byte 0x00,0xf4,0x01,0x00 .byte 0x00,0xe4,0x02,0x00 .byte 0x00,0xd4,0x03,0x00 .byte 0x00,0xc4,0x04,0x00 .byte 0x00,0x34,0x05,0x00 .byte 0x00,0x54,0x06,0x00 .byte 0x00,0x74,0x08,0x00 .byte 0x00,0x64,0x09,0x00 .byte 0x00,0x62 .byte 0x00,0x00,0x00,0x00,0x00,0x00 .byte 0x00,0x00,0x00,0x00 .LSEH_info_div_by_2_mod_384_epilogue: .byte 1,0,4,0 .byte 0x00,0x74,0x01,0x00 .byte 0x00,0x64,0x02,0x00 .byte 0x00,0x00,0x00,0x00 .LSEH_info_lshift_mod_384_prologue: .byte 1,0,5,0x0b .byte 0,0x74,1,0 .byte 0,0x64,2,0 .byte 0,0xb3 .byte 0,0 .long 0,0 .LSEH_info_lshift_mod_384_body: .byte 1,0,17,0 .byte 0x00,0xf4,0x01,0x00 .byte 0x00,0xe4,0x02,0x00 .byte 0x00,0xd4,0x03,0x00 .byte 0x00,0xc4,0x04,0x00 .byte 0x00,0x34,0x05,0x00 .byte 0x00,0x54,0x06,0x00 .byte 0x00,0x74,0x08,0x00 .byte 0x00,0x64,0x09,0x00 .byte 0x00,0x62 .byte 0x00,0x00,0x00,0x00,0x00,0x00 .byte 0x00,0x00,0x00,0x00 .LSEH_info_lshift_mod_384_epilogue: .byte 1,0,4,0 .byte 0x00,0x74,0x01,0x00 .byte 0x00,0x64,0x02,0x00 .byte 0x00,0x00,0x00,0x00 .LSEH_info_mul_by_3_mod_384_prologue: .byte 1,0,5,0x0b .byte 0,0x74,1,0 .byte 0,0x64,2,0 .byte 0,0xb3 .byte 0,0 .long 0,0 .LSEH_info_mul_by_3_mod_384_body: .byte 1,0,17,0 .byte 0x00,0xf4,0x01,0x00 .byte 0x00,0xe4,0x02,0x00 .byte 0x00,0xd4,0x03,0x00 .byte 0x00,0xc4,0x04,0x00 .byte 0x00,0x34,0x05,0x00 .byte 0x00,0x54,0x06,0x00 .byte 0x00,0x74,0x08,0x00 .byte 0x00,0x64,0x09,0x00 .byte 0x00,0x62 .byte 0x00,0x00,0x00,0x00,0x00,0x00 .byte 0x00,0x00,0x00,0x00 .LSEH_info_mul_by_3_mod_384_epilogue: .byte 1,0,4,0 .byte 0x00,0x74,0x01,0x00 .byte 0x00,0x64,0x02,0x00 .byte 0x00,0x00,0x00,0x00 .LSEH_info_mul_by_8_mod_384_prologue: .byte 1,0,5,0x0b .byte 0,0x74,1,0 .byte 0,0x64,2,0 .byte 0,0xb3 .byte 0,0 .long 0,0 
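/*
 * Annotation, not emitted by the generator: the .pdata entries above
 * are RUNTIME_FUNCTION triples - begin RVA, end RVA and the RVA of an
 * UNWIND_INFO record in this .xdata section - registered separately
 * for each routine's prologue, body and epilogue so that unwinding is
 * exact at every point. Within a record, the leading quad is
 * version/flags, prologue size, unwind-code count and frame register;
 * the byte pairs that follow are standard UWOP_* codes. For example,
 * in the *_body records 0x00,0xf4,0x01,0x00 marks %r15 as saved at
 * slot 1 (8(%rsp)) and 0x00,0x62 is UWOP_ALLOC_SMALL for the 56-byte
 * frame (six pushes plus the 8-byte subq).
 */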
.LSEH_info_mul_by_8_mod_384_body: .byte 1,0,17,0 .byte 0x00,0xf4,0x01,0x00 .byte 0x00,0xe4,0x02,0x00 .byte 0x00,0xd4,0x03,0x00 .byte 0x00,0xc4,0x04,0x00 .byte 0x00,0x34,0x05,0x00 .byte 0x00,0x54,0x06,0x00 .byte 0x00,0x74,0x08,0x00 .byte 0x00,0x64,0x09,0x00 .byte 0x00,0x62 .byte 0x00,0x00,0x00,0x00,0x00,0x00 .byte 0x00,0x00,0x00,0x00 .LSEH_info_mul_by_8_mod_384_epilogue: .byte 1,0,4,0 .byte 0x00,0x74,0x01,0x00 .byte 0x00,0x64,0x02,0x00 .byte 0x00,0x00,0x00,0x00 .LSEH_info_mul_by_3_mod_384x_prologue: .byte 1,0,5,0x0b .byte 0,0x74,1,0 .byte 0,0x64,2,0 .byte 0,0xb3 .byte 0,0 .long 0,0 .LSEH_info_mul_by_3_mod_384x_body: .byte 1,0,17,0 .byte 0x00,0xf4,0x01,0x00 .byte 0x00,0xe4,0x02,0x00 .byte 0x00,0xd4,0x03,0x00 .byte 0x00,0xc4,0x04,0x00 .byte 0x00,0x34,0x05,0x00 .byte 0x00,0x54,0x06,0x00 .byte 0x00,0x74,0x08,0x00 .byte 0x00,0x64,0x09,0x00 .byte 0x00,0x62 .byte 0x00,0x00,0x00,0x00,0x00,0x00 .byte 0x00,0x00,0x00,0x00 .LSEH_info_mul_by_3_mod_384x_epilogue: .byte 1,0,4,0 .byte 0x00,0x74,0x01,0x00 .byte 0x00,0x64,0x02,0x00 .byte 0x00,0x00,0x00,0x00 .LSEH_info_mul_by_8_mod_384x_prologue: .byte 1,0,5,0x0b .byte 0,0x74,1,0 .byte 0,0x64,2,0 .byte 0,0xb3 .byte 0,0 .long 0,0 .LSEH_info_mul_by_8_mod_384x_body: .byte 1,0,17,0 .byte 0x00,0xf4,0x01,0x00 .byte 0x00,0xe4,0x02,0x00 .byte 0x00,0xd4,0x03,0x00 .byte 0x00,0xc4,0x04,0x00 .byte 0x00,0x34,0x05,0x00 .byte 0x00,0x54,0x06,0x00 .byte 0x00,0x74,0x08,0x00 .byte 0x00,0x64,0x09,0x00 .byte 0x00,0x62 .byte 0x00,0x00,0x00,0x00,0x00,0x00 .byte 0x00,0x00,0x00,0x00 .LSEH_info_mul_by_8_mod_384x_epilogue: .byte 1,0,4,0 .byte 0x00,0x74,0x01,0x00 .byte 0x00,0x64,0x02,0x00 .byte 0x00,0x00,0x00,0x00 .LSEH_info_cneg_mod_384_prologue: .byte 1,0,5,0x0b .byte 0,0x74,1,0 .byte 0,0x64,2,0 .byte 0,0xb3 .byte 0,0 .long 0,0 .LSEH_info_cneg_mod_384_body: .byte 1,0,17,0 .byte 0x00,0xf4,0x01,0x00 .byte 0x00,0xe4,0x02,0x00 .byte 0x00,0xd4,0x03,0x00 .byte 0x00,0xc4,0x04,0x00 .byte 0x00,0x34,0x05,0x00 .byte 0x00,0x54,0x06,0x00 .byte 0x00,0x74,0x08,0x00 .byte 0x00,0x64,0x09,0x00 .byte 0x00,0x62 .byte 0x00,0x00,0x00,0x00,0x00,0x00 .byte 0x00,0x00,0x00,0x00 .LSEH_info_cneg_mod_384_epilogue: .byte 1,0,4,0 .byte 0x00,0x74,0x01,0x00 .byte 0x00,0x64,0x02,0x00 .byte 0x00,0x00,0x00,0x00 .LSEH_info_sub_mod_384_prologue: .byte 1,0,5,0x0b .byte 0,0x74,1,0 .byte 0,0x64,2,0 .byte 0,0xb3 .byte 0,0 .long 0,0 .LSEH_info_sub_mod_384_body: .byte 1,0,17,0 .byte 0x00,0xf4,0x01,0x00 .byte 0x00,0xe4,0x02,0x00 .byte 0x00,0xd4,0x03,0x00 .byte 0x00,0xc4,0x04,0x00 .byte 0x00,0x34,0x05,0x00 .byte 0x00,0x54,0x06,0x00 .byte 0x00,0x74,0x08,0x00 .byte 0x00,0x64,0x09,0x00 .byte 0x00,0x62 .byte 0x00,0x00,0x00,0x00,0x00,0x00 .byte 0x00,0x00,0x00,0x00 .LSEH_info_sub_mod_384_epilogue: .byte 1,0,4,0 .byte 0x00,0x74,0x01,0x00 .byte 0x00,0x64,0x02,0x00 .byte 0x00,0x00,0x00,0x00 .LSEH_info_sub_mod_384x_prologue: .byte 1,0,5,0x0b .byte 0,0x74,1,0 .byte 0,0x64,2,0 .byte 0,0xb3 .byte 0,0 .long 0,0 .LSEH_info_sub_mod_384x_body: .byte 1,0,17,0 .byte 0x00,0xf4,0x03,0x00 .byte 0x00,0xe4,0x04,0x00 .byte 0x00,0xd4,0x05,0x00 .byte 0x00,0xc4,0x06,0x00 .byte 0x00,0x34,0x07,0x00 .byte 0x00,0x54,0x08,0x00 .byte 0x00,0x74,0x0a,0x00 .byte 0x00,0x64,0x0b,0x00 .byte 0x00,0x82 .byte 0x00,0x00,0x00,0x00,0x00,0x00 .byte 0x00,0x00,0x00,0x00 .LSEH_info_sub_mod_384x_epilogue: .byte 1,0,4,0 .byte 0x00,0x74,0x01,0x00 .byte 0x00,0x64,0x02,0x00 .byte 0x00,0x00,0x00,0x00 .LSEH_info_mul_by_1_plus_i_mod_384x_prologue: .byte 1,0,5,0x0b .byte 0,0x74,1,0 .byte 0,0x64,2,0 .byte 0,0xb3 .byte 0,0 .long 0,0 .LSEH_info_mul_by_1_plus_i_mod_384x_body: .byte 1,0,17,0 
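/* header: version 1, zero-size formal prologue, 17 unwind codes; the
   pairs that follow are mostly UWOP_SAVE_NONVOL entries giving the
   stack slot of each preserved register, ending with the small
   stack-allocation code for the local frame */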
.byte 0x00,0xf4,0x07,0x00 .byte 0x00,0xe4,0x08,0x00 .byte 0x00,0xd4,0x09,0x00 .byte 0x00,0xc4,0x0a,0x00 .byte 0x00,0x34,0x0b,0x00 .byte 0x00,0x54,0x0c,0x00 .byte 0x00,0x74,0x0e,0x00 .byte 0x00,0x64,0x0f,0x00 .byte 0x00,0xc2 .byte 0x00,0x00,0x00,0x00,0x00,0x00 .byte 0x00,0x00,0x00,0x00 .LSEH_info_mul_by_1_plus_i_mod_384x_epilogue: .byte 1,0,4,0 .byte 0x00,0x74,0x01,0x00 .byte 0x00,0x64,0x02,0x00 .byte 0x00,0x00,0x00,0x00 .LSEH_info_sgn0_pty_mod_384_prologue: .byte 1,0,5,0x0b .byte 0,0x74,1,0 .byte 0,0x64,2,0 .byte 0,0xb3 .byte 0,0 .long 0,0 .LSEH_info_sgn0_pty_mod_384_body: .byte 1,0,4,0 .byte 0x00,0x74,0x01,0x00 .byte 0x00,0x64,0x02,0x00 .byte 0x00,0x00,0x00,0x00 .LSEH_info_sgn0_pty_mod_384_epilogue: .byte 1,0,4,0 .byte 0x00,0x74,0x01,0x00 .byte 0x00,0x64,0x02,0x00 .byte 0x00,0x00,0x00,0x00 .LSEH_info_sgn0_pty_mod_384x_prologue: .byte 1,0,5,0x0b .byte 0,0x74,1,0 .byte 0,0x64,2,0 .byte 0,0xb3 .byte 0,0 .long 0,0 .LSEH_info_sgn0_pty_mod_384x_body: .byte 1,0,9,0 .byte 0x00,0x34,0x01,0x00 .byte 0x00,0x54,0x02,0x00 .byte 0x00,0x74,0x04,0x00 .byte 0x00,0x64,0x05,0x00 .byte 0x00,0x22 .byte 0x00,0x00,0x00,0x00,0x00,0x00 .byte 0x00,0x00,0x00,0x00 .LSEH_info_sgn0_pty_mod_384x_epilogue: .byte 1,0,4,0 .byte 0x00,0x74,0x01,0x00 .byte 0x00,0x64,0x02,0x00 .byte 0x00,0x00,0x00,0x00 ================================================ FILE: build/coff/add_mod_384x384-x86_64.s ================================================ .text .globl add_mod_384x384 .def add_mod_384x384; .scl 2; .type 32; .endef .p2align 5 add_mod_384x384: .byte 0xf3,0x0f,0x1e,0xfa movq %rdi,8(%rsp) movq %rsi,16(%rsp) movq %rsp,%r11 .LSEH_begin_add_mod_384x384: pushq %rbp movq %rcx,%rdi movq %rdx,%rsi movq %r8,%rdx movq %r9,%rcx pushq %rbx pushq %r12 pushq %r13 pushq %r14 pushq %r15 subq $8,%rsp .LSEH_body_add_mod_384x384: #ifdef __SGX_LVI_HARDENING__ lfence #endif movq 0(%rsi),%r8 movq 8(%rsi),%r9 movq 16(%rsi),%r10 movq 24(%rsi),%r11 movq 32(%rsi),%r12 movq 40(%rsi),%r13 movq 48(%rsi),%r14 addq 0(%rdx),%r8 movq 56(%rsi),%r15 adcq 8(%rdx),%r9 movq 64(%rsi),%rax adcq 16(%rdx),%r10 movq 72(%rsi),%rbx adcq 24(%rdx),%r11 movq 80(%rsi),%rbp adcq 32(%rdx),%r12 movq 88(%rsi),%rsi adcq 40(%rdx),%r13 movq %r8,0(%rdi) adcq 48(%rdx),%r14 movq %r9,8(%rdi) adcq 56(%rdx),%r15 movq %r10,16(%rdi) adcq 64(%rdx),%rax movq %r12,32(%rdi) movq %r14,%r8 adcq 72(%rdx),%rbx movq %r11,24(%rdi) movq %r15,%r9 adcq 80(%rdx),%rbp movq %r13,40(%rdi) movq %rax,%r10 adcq 88(%rdx),%rsi movq %rbx,%r11 sbbq %rdx,%rdx subq 0(%rcx),%r14 sbbq 8(%rcx),%r15 movq %rbp,%r12 sbbq 16(%rcx),%rax sbbq 24(%rcx),%rbx sbbq 32(%rcx),%rbp movq %rsi,%r13 sbbq 40(%rcx),%rsi sbbq $0,%rdx cmovcq %r8,%r14 cmovcq %r9,%r15 cmovcq %r10,%rax movq %r14,48(%rdi) cmovcq %r11,%rbx movq %r15,56(%rdi) cmovcq %r12,%rbp movq %rax,64(%rdi) cmovcq %r13,%rsi movq %rbx,72(%rdi) movq %rbp,80(%rdi) movq %rsi,88(%rdi) movq 8(%rsp),%r15 movq 16(%rsp),%r14 movq 24(%rsp),%r13 movq 32(%rsp),%r12 movq 40(%rsp),%rbx movq 48(%rsp),%rbp leaq 56(%rsp),%rsp .LSEH_epilogue_add_mod_384x384: mov 8(%rsp),%rdi mov 16(%rsp),%rsi #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .LSEH_end_add_mod_384x384: .globl sub_mod_384x384 .def sub_mod_384x384; .scl 2; .type 32; .endef .p2align 5 sub_mod_384x384: .byte 0xf3,0x0f,0x1e,0xfa movq %rdi,8(%rsp) movq %rsi,16(%rsp) movq %rsp,%r11 .LSEH_begin_sub_mod_384x384: pushq %rbp movq %rcx,%rdi movq %rdx,%rsi movq %r8,%rdx movq %r9,%rcx pushq %rbx pushq %r12 pushq %r13 pushq %r14 pushq %r15 subq $8,%rsp .LSEH_body_sub_mod_384x384: #ifdef 
__SGX_LVI_HARDENING__ lfence #endif movq 0(%rsi),%r8 movq 8(%rsi),%r9 movq 16(%rsi),%r10 movq 24(%rsi),%r11 movq 32(%rsi),%r12 movq 40(%rsi),%r13 movq 48(%rsi),%r14 subq 0(%rdx),%r8 movq 56(%rsi),%r15 sbbq 8(%rdx),%r9 movq 64(%rsi),%rax sbbq 16(%rdx),%r10 movq 72(%rsi),%rbx sbbq 24(%rdx),%r11 movq 80(%rsi),%rbp sbbq 32(%rdx),%r12 movq 88(%rsi),%rsi sbbq 40(%rdx),%r13 movq %r8,0(%rdi) sbbq 48(%rdx),%r14 movq 0(%rcx),%r8 movq %r9,8(%rdi) sbbq 56(%rdx),%r15 movq 8(%rcx),%r9 movq %r10,16(%rdi) sbbq 64(%rdx),%rax movq 16(%rcx),%r10 movq %r11,24(%rdi) sbbq 72(%rdx),%rbx movq 24(%rcx),%r11 movq %r12,32(%rdi) sbbq 80(%rdx),%rbp movq 32(%rcx),%r12 movq %r13,40(%rdi) sbbq 88(%rdx),%rsi movq 40(%rcx),%r13 sbbq %rdx,%rdx andq %rdx,%r8 andq %rdx,%r9 andq %rdx,%r10 andq %rdx,%r11 andq %rdx,%r12 andq %rdx,%r13 addq %r8,%r14 adcq %r9,%r15 movq %r14,48(%rdi) adcq %r10,%rax movq %r15,56(%rdi) adcq %r11,%rbx movq %rax,64(%rdi) adcq %r12,%rbp movq %rbx,72(%rdi) adcq %r13,%rsi movq %rbp,80(%rdi) movq %rsi,88(%rdi) movq 8(%rsp),%r15 movq 16(%rsp),%r14 movq 24(%rsp),%r13 movq 32(%rsp),%r12 movq 40(%rsp),%rbx movq 48(%rsp),%rbp leaq 56(%rsp),%rsp .LSEH_epilogue_sub_mod_384x384: mov 8(%rsp),%rdi mov 16(%rsp),%rsi #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .LSEH_end_sub_mod_384x384: .section .pdata .p2align 2 .rva .LSEH_begin_add_mod_384x384 .rva .LSEH_body_add_mod_384x384 .rva .LSEH_info_add_mod_384x384_prologue .rva .LSEH_body_add_mod_384x384 .rva .LSEH_epilogue_add_mod_384x384 .rva .LSEH_info_add_mod_384x384_body .rva .LSEH_epilogue_add_mod_384x384 .rva .LSEH_end_add_mod_384x384 .rva .LSEH_info_add_mod_384x384_epilogue .rva .LSEH_begin_sub_mod_384x384 .rva .LSEH_body_sub_mod_384x384 .rva .LSEH_info_sub_mod_384x384_prologue .rva .LSEH_body_sub_mod_384x384 .rva .LSEH_epilogue_sub_mod_384x384 .rva .LSEH_info_sub_mod_384x384_body .rva .LSEH_epilogue_sub_mod_384x384 .rva .LSEH_end_sub_mod_384x384 .rva .LSEH_info_sub_mod_384x384_epilogue .section .xdata .p2align 3 .LSEH_info_add_mod_384x384_prologue: .byte 1,0,5,0x0b .byte 0,0x74,1,0 .byte 0,0x64,2,0 .byte 0,0xb3 .byte 0,0 .long 0,0 .LSEH_info_add_mod_384x384_body: .byte 1,0,17,0 .byte 0x00,0xf4,0x01,0x00 .byte 0x00,0xe4,0x02,0x00 .byte 0x00,0xd4,0x03,0x00 .byte 0x00,0xc4,0x04,0x00 .byte 0x00,0x34,0x05,0x00 .byte 0x00,0x54,0x06,0x00 .byte 0x00,0x74,0x08,0x00 .byte 0x00,0x64,0x09,0x00 .byte 0x00,0x62 .byte 0x00,0x00,0x00,0x00,0x00,0x00 .byte 0x00,0x00,0x00,0x00 .LSEH_info_add_mod_384x384_epilogue: .byte 1,0,4,0 .byte 0x00,0x74,0x01,0x00 .byte 0x00,0x64,0x02,0x00 .byte 0x00,0x00,0x00,0x00 .LSEH_info_sub_mod_384x384_prologue: .byte 1,0,5,0x0b .byte 0,0x74,1,0 .byte 0,0x64,2,0 .byte 0,0xb3 .byte 0,0 .long 0,0 .LSEH_info_sub_mod_384x384_body: .byte 1,0,17,0 .byte 0x00,0xf4,0x01,0x00 .byte 0x00,0xe4,0x02,0x00 .byte 0x00,0xd4,0x03,0x00 .byte 0x00,0xc4,0x04,0x00 .byte 0x00,0x34,0x05,0x00 .byte 0x00,0x54,0x06,0x00 .byte 0x00,0x74,0x08,0x00 .byte 0x00,0x64,0x09,0x00 .byte 0x00,0x62 .byte 0x00,0x00,0x00,0x00,0x00,0x00 .byte 0x00,0x00,0x00,0x00 .LSEH_info_sub_mod_384x384_epilogue: .byte 1,0,4,0 .byte 0x00,0x74,0x01,0x00 .byte 0x00,0x64,0x02,0x00 .byte 0x00,0x00,0x00,0x00 ================================================ FILE: build/coff/ct_inverse_mod_256-armv8.S ================================================ .text .globl ct_inverse_mod_256 .def ct_inverse_mod_256; .type 32; .endef .p2align 5 ct_inverse_mod_256: hint #25 stp x29, x30, [sp,#-10*__SIZEOF_POINTER__]! 
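// Frame sketch: x29/x30 plus callee-saved x19-x26 occupy the
// 10-pointer save area pushed above; the 1040 bytes reserved just
// below hold the two 256-byte flip-flop halves of the |a|b|u|v|
// working set, placed at a 512-byte-aligned spot inside the frame.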
add x29, sp, #0 stp x19, x20, [sp,#2*__SIZEOF_POINTER__] stp x21, x22, [sp,#4*__SIZEOF_POINTER__] stp x23, x24, [sp,#6*__SIZEOF_POINTER__] stp x25, x26, [sp,#8*__SIZEOF_POINTER__] sub sp, sp, #1040 ldp x4, x5, [x1,#8*0] ldp x6, x7, [x1,#8*2] #ifdef __CHERI_PURE_CAPABILITY__ add x1,sp,#16+511 alignd c1,c1,#9 scbnds c1,c1,#512 #else add x1, sp, #16+511 // find closest 512-byte-aligned spot and x1, x1, #-512 // in the frame... #endif str x0, [sp] // offload out_ptr ldp x8, x9, [x2,#8*0] ldp x10, x11, [x2,#8*2] stp x4, x5, [x1,#8*0] // copy input to |a| stp x6, x7, [x1,#8*2] stp x8, x9, [x1,#8*4] // copy modulus to |b| stp x10, x11, [x1,#8*6] ////////////////////////////////////////// first iteration bl .Lab_approximation_31_256_loaded eor x0, x1, #256 // pointer to dst |a|b|u|v| #ifdef __CHERI_PURE_CAPABILITY__ scvalue c0,c1,x0 #endif bl __smul_256_n_shift_by_31 str x12,[x0,#8*8] // initialize |u| with |f0| mov x12, x14 // |f1| mov x13, x15 // |g1| add x0,x0,#8*4 bl __smul_256_n_shift_by_31 str x12, [x0,#8*10] // initialize |v| with |f1| ////////////////////////////////////////// second iteration eor x1, x1, #256 // flip-flop src |a|b|u|v| #ifdef __CHERI_PURE_CAPABILITY__ scvalue c1,c0,x1 #endif bl __ab_approximation_31_256 eor x0, x1, #256 // pointer to dst |a|b|u|v| #ifdef __CHERI_PURE_CAPABILITY__ scvalue c0,c1,x0 #endif bl __smul_256_n_shift_by_31 mov x16, x12 // corrected |f0| mov x17, x13 // corrected |g0| mov x12, x14 // |f1| mov x13, x15 // |g1| add x0,x0,#8*4 bl __smul_256_n_shift_by_31 ldr x8, [x1,#8*8] // |u| ldr x9, [x1,#8*14] // |v| madd x4, x16, x8, xzr // |u|*|f0| madd x4, x17, x9, x4 // |v|*|g0| asr x5, x4, #63 // sign extension stp x4, x5, [x0,#8*4] stp x5, x5, [x0,#8*6] madd x4, x12, x8, xzr // |u|*|f1| madd x4, x13, x9, x4 // |v|*|g1| asr x5, x4, #63 // sign extension stp x4, x5, [x0,#8*10] stp x5, x5, [x0,#8*12] eor x1, x1, #256 // flip-flop src |a|b|u|v| #ifdef __CHERI_PURE_CAPABILITY__ scvalue c1,c0,x1 #endif bl __ab_approximation_31_256 eor x0, x1, #256 // pointer to dst |a|b|u|v| #ifdef __CHERI_PURE_CAPABILITY__ scvalue c0,c1,x0 #endif bl __smul_256_n_shift_by_31 mov x16, x12 // corrected |f0| mov x17, x13 // corrected |g0| mov x12, x14 // |f1| mov x13, x15 // |g1| add x0,x0,#8*4 bl __smul_256_n_shift_by_31 add x0,x0,#8*4 bl __smul_256x63 mov x16, x12 // corrected |f1| mov x17, x13 // corrected |g1| add x0,x0,#8*6 bl __smul_256x63 eor x1, x1, #256 // flip-flop src |a|b|u|v| #ifdef __CHERI_PURE_CAPABILITY__ scvalue c1,c0,x1 #endif bl __ab_approximation_31_256 eor x0, x1, #256 // pointer to dst |a|b|u|v| #ifdef __CHERI_PURE_CAPABILITY__ scvalue c0,c1,x0 #endif bl __smul_256_n_shift_by_31 mov x16, x12 // corrected |f0| mov x17, x13 // corrected |g0| mov x12, x14 // |f1| mov x13, x15 // |g1| add x0,x0,#8*4 bl __smul_256_n_shift_by_31 add x0,x0,#8*4 bl __smul_256x63 mov x16, x12 // corrected |f1| mov x17, x13 // corrected |g1| add x0,x0,#8*6 bl __smul_256x63 eor x1, x1, #256 // flip-flop src |a|b|u|v| #ifdef __CHERI_PURE_CAPABILITY__ scvalue c1,c0,x1 #endif bl __ab_approximation_31_256 eor x0, x1, #256 // pointer to dst |a|b|u|v| #ifdef __CHERI_PURE_CAPABILITY__ scvalue c0,c1,x0 #endif bl __smul_256_n_shift_by_31 mov x16, x12 // corrected |f0| mov x17, x13 // corrected |g0| mov x12, x14 // |f1| mov x13, x15 // |g1| add x0,x0,#8*4 bl __smul_256_n_shift_by_31 add x0,x0,#8*4 bl __smul_256x63 mov x16, x12 // corrected |f1| mov x17, x13 // corrected |g1| add x0,x0,#8*6 bl __smul_256x63 eor x1, x1, #256 // flip-flop src |a|b|u|v| #ifdef __CHERI_PURE_CAPABILITY__ scvalue c1,c0,x1 
#endif bl __ab_approximation_31_256 eor x0, x1, #256 // pointer to dst |a|b|u|v| #ifdef __CHERI_PURE_CAPABILITY__ scvalue c0,c1,x0 #endif bl __smul_256_n_shift_by_31 mov x16, x12 // corrected |f0| mov x17, x13 // corrected |g0| mov x12, x14 // |f1| mov x13, x15 // |g1| add x0,x0,#8*4 bl __smul_256_n_shift_by_31 add x0,x0,#8*4 bl __smul_256x63 mov x16, x12 // corrected |f1| mov x17, x13 // corrected |g1| add x0,x0,#8*6 bl __smul_256x63 eor x1, x1, #256 // flip-flop src |a|b|u|v| #ifdef __CHERI_PURE_CAPABILITY__ scvalue c1,c0,x1 #endif bl __ab_approximation_31_256 eor x0, x1, #256 // pointer to dst |a|b|u|v| #ifdef __CHERI_PURE_CAPABILITY__ scvalue c0,c1,x0 #endif bl __smul_256_n_shift_by_31 mov x16, x12 // corrected |f0| mov x17, x13 // corrected |g0| mov x12, x14 // |f1| mov x13, x15 // |g1| add x0,x0,#8*4 bl __smul_256_n_shift_by_31 add x0,x0,#8*4 bl __smul_256x63 mov x16, x12 // corrected |f1| mov x17, x13 // corrected |g1| add x0,x0,#8*6 bl __smul_256x63 eor x1, x1, #256 // flip-flop src |a|b|u|v| #ifdef __CHERI_PURE_CAPABILITY__ scvalue c1,c0,x1 #endif bl __ab_approximation_31_256 eor x0, x1, #256 // pointer to dst |a|b|u|v| #ifdef __CHERI_PURE_CAPABILITY__ scvalue c0,c1,x0 #endif bl __smul_256_n_shift_by_31 mov x16, x12 // corrected |f0| mov x17, x13 // corrected |g0| mov x12, x14 // |f1| mov x13, x15 // |g1| add x0,x0,#8*4 bl __smul_256_n_shift_by_31 add x0,x0,#8*4 bl __smul_256x63 asr x24, x24, #63 str x24, [x0,#8*4] mov x16, x12 // corrected |f1| mov x17, x13 // corrected |g1| add x0,x0,#8*6 bl __smul_256x63 asr x24, x24, #63 // sign extension stp x24, x24, [x0,#8*4] stp x24, x24, [x0,#8*6] eor x1, x1, #256 // flip-flop src |a|b|u|v| #ifdef __CHERI_PURE_CAPABILITY__ scvalue c1,c0,x1 #endif bl __ab_approximation_31_256 eor x0, x1, #256 // pointer to dst |a|b|u|v| #ifdef __CHERI_PURE_CAPABILITY__ scvalue c0,c1,x0 #endif bl __smul_256_n_shift_by_31 mov x16, x12 // corrected |f0| mov x17, x13 // corrected |g0| mov x12, x14 // |f1| mov x13, x15 // |g1| add x0,x0,#8*4 bl __smul_256_n_shift_by_31 add x0,x0,#8*4 bl __smul_256x63 adc x22, x22, x23 str x22, [x0,#8*4] mov x16, x12 // corrected |f1| mov x17, x13 // corrected |g1| add x0,x0,#8*6 bl __smul_256x63 bl __smul_512x63_tail eor x1, x1, #256 // flip-flop src |a|b|u|v| #ifdef __CHERI_PURE_CAPABILITY__ scvalue c1,c0,x1 #endif bl __ab_approximation_31_256 eor x0, x1, #256 // pointer to dst |a|b|u|v| #ifdef __CHERI_PURE_CAPABILITY__ scvalue c0,c1,x0 #endif bl __smul_256_n_shift_by_31 mov x16, x12 // corrected |f0| mov x17, x13 // corrected |g0| mov x12, x14 // |f1| mov x13, x15 // |g1| add x0,x0,#8*4 bl __smul_256_n_shift_by_31 add x0,x0,#8*4 bl __smul_256x63 adc x22, x22, x23 str x22, [x0,#8*4] mov x16, x12 // corrected |f1| mov x17, x13 // corrected |g1| add x0,x0,#8*6 bl __smul_256x63 bl __smul_512x63_tail eor x1, x1, #256 // flip-flop src |a|b|u|v| #ifdef __CHERI_PURE_CAPABILITY__ scvalue c1,c0,x1 #endif bl __ab_approximation_31_256 eor x0, x1, #256 // pointer to dst |a|b|u|v| #ifdef __CHERI_PURE_CAPABILITY__ scvalue c0,c1,x0 #endif bl __smul_256_n_shift_by_31 mov x16, x12 // corrected |f0| mov x17, x13 // corrected |g0| mov x12, x14 // |f1| mov x13, x15 // |g1| add x0,x0,#8*4 bl __smul_256_n_shift_by_31 add x0,x0,#8*4 bl __smul_256x63 adc x22, x22, x23 str x22, [x0,#8*4] mov x16, x12 // corrected |f1| mov x17, x13 // corrected |g1| add x0,x0,#8*6 bl __smul_256x63 bl __smul_512x63_tail eor x1, x1, #256 // flip-flop src |a|b|u|v| #ifdef __CHERI_PURE_CAPABILITY__ scvalue c1,c0,x1 #endif bl __ab_approximation_31_256 eor x0, x1, #256 // 
pointer to dst |a|b|u|v| #ifdef __CHERI_PURE_CAPABILITY__ scvalue c0,c1,x0 #endif bl __smul_256_n_shift_by_31 mov x16, x12 // corrected |f0| mov x17, x13 // corrected |g0| mov x12, x14 // |f1| mov x13, x15 // |g1| add x0,x0,#8*4 bl __smul_256_n_shift_by_31 add x0,x0,#8*4 bl __smul_256x63 adc x22, x22, x23 str x22, [x0,#8*4] mov x16, x12 // corrected |f1| mov x17, x13 // corrected |g1| add x0,x0,#8*6 bl __smul_256x63 bl __smul_512x63_tail eor x1, x1, #256 // flip-flop src |a|b|u|v| #ifdef __CHERI_PURE_CAPABILITY__ scvalue c1,c0,x1 #endif bl __ab_approximation_31_256 eor x0, x1, #256 // pointer to dst |a|b|u|v| #ifdef __CHERI_PURE_CAPABILITY__ scvalue c0,c1,x0 #endif bl __smul_256_n_shift_by_31 mov x16, x12 // corrected |f0| mov x17, x13 // corrected |g0| mov x12, x14 // |f1| mov x13, x15 // |g1| add x0,x0,#8*4 bl __smul_256_n_shift_by_31 add x0,x0,#8*4 bl __smul_256x63 adc x22, x22, x23 str x22, [x0,#8*4] mov x16, x12 // corrected |f1| mov x17, x13 // corrected |g1| add x0,x0,#8*6 bl __smul_256x63 bl __smul_512x63_tail eor x1, x1, #256 // flip-flop src |a|b|u|v| #ifdef __CHERI_PURE_CAPABILITY__ scvalue c1,c0,x1 #endif bl __ab_approximation_31_256 eor x0, x1, #256 // pointer to dst |a|b|u|v| #ifdef __CHERI_PURE_CAPABILITY__ scvalue c0,c1,x0 #endif bl __smul_256_n_shift_by_31 mov x16, x12 // corrected |f0| mov x17, x13 // corrected |g0| mov x12, x14 // |f1| mov x13, x15 // |g1| add x0,x0,#8*4 bl __smul_256_n_shift_by_31 add x0,x0,#8*4 bl __smul_256x63 adc x22, x22, x23 str x22, [x0,#8*4] mov x16, x12 // corrected |f1| mov x17, x13 // corrected |g1| add x0,x0,#8*6 bl __smul_256x63 bl __smul_512x63_tail eor x1, x1, #256 // flip-flop src |a|b|u|v| #ifdef __CHERI_PURE_CAPABILITY__ scvalue c1,c0,x1 #endif bl __ab_approximation_31_256 eor x0, x1, #256 // pointer to dst |a|b|u|v| #ifdef __CHERI_PURE_CAPABILITY__ scvalue c0,c1,x0 #endif bl __smul_256_n_shift_by_31 mov x16, x12 // corrected |f0| mov x17, x13 // corrected |g0| mov x12, x14 // |f1| mov x13, x15 // |g1| add x0,x0,#8*4 bl __smul_256_n_shift_by_31 add x0,x0,#8*4 bl __smul_256x63 adc x22, x22, x23 str x22, [x0,#8*4] mov x16, x12 // corrected |f1| mov x17, x13 // corrected |g1| add x0,x0,#8*6 bl __smul_256x63 bl __smul_512x63_tail ////////////////////////////////////////// two[!] 
last iterations eor x1, x1, #256 // flip-flop src |a|b|u|v| #ifdef __CHERI_PURE_CAPABILITY__ scvalue c1,c0,x1 #endif mov x2, #47 // 31 + 512 % 31 //bl __ab_approximation_62_256 // |a| and |b| are exact, ldr x7, [x1,#8*0] // just load ldr x11, [x1,#8*4] bl __inner_loop_62_256 mov x16, x14 mov x17, x15 ldr x0, [sp] // original out_ptr bl __smul_256x63 bl __smul_512x63_tail ldr x30, [x29,#__SIZEOF_POINTER__] smulh x20, x7, x17 // figure out top-most limb ldp x8, x9, [x3,#8*0] adc x23, x23, x25 ldp x10, x11, [x3,#8*2] add x20, x20, x23 // x20 is 1, 0 or -1 asr x19, x20, #63 // sign as mask and x23, x8, x19 // add mod<<256 conditionally and x24, x9, x19 adds x4, x4, x23 and x25, x10, x19 adcs x5, x5, x24 and x26, x11, x19 adcs x6, x6, x25 adcs x7, x22, x26 adc x20, x20, xzr // x20 is 1, 0 or -1 neg x19, x20 orr x20, x20, x19 // excess bit or sign as mask asr x19, x19, #63 // excess bit as mask and x8, x8, x20 // mask |mod| and x9, x9, x20 and x10, x10, x20 and x11, x11, x20 eor x8, x8, x19 // conditionally negate |mod| eor x9, x9, x19 adds x8, x8, x19, lsr#63 eor x10, x10, x19 adcs x9, x9, xzr eor x11, x11, x19 adcs x10, x10, xzr adc x11, x11, xzr adds x4, x4, x8 // final adjustment for |mod|<<256 adcs x5, x5, x9 adcs x6, x6, x10 stp x4, x5, [x0,#8*4] adc x7, x7, x11 stp x6, x7, [x0,#8*6] add sp, sp, #1040 ldp x19, x20, [x29,#2*__SIZEOF_POINTER__] ldp x21, x22, [x29,#4*__SIZEOF_POINTER__] ldp x23, x24, [x29,#6*__SIZEOF_POINTER__] ldp x25, x26, [x29,#8*__SIZEOF_POINTER__] ldr x29, [sp],#10*__SIZEOF_POINTER__ hint #29 ret //////////////////////////////////////////////////////////////////////// .def __smul_256x63; .type 32; .endef .p2align 5 __smul_256x63: ldp x4, x5, [x1,#8*0+64] // load |u| (or |v|) asr x14, x16, #63 // |f_|'s sign as mask (or |g_|'s) ldp x6, x7, [x1,#8*2+64] eor x16, x16, x14 // conditionally negate |f_| (or |g_|) ldr x22, [x1,#8*4+64] eor x4, x4, x14 // conditionally negate |u| (or |v|) sub x16, x16, x14 eor x5, x5, x14 adds x4, x4, x14, lsr#63 eor x6, x6, x14 adcs x5, x5, xzr eor x7, x7, x14 adcs x6, x6, xzr eor x22, x22, x14 umulh x19, x4, x16 adcs x7, x7, xzr umulh x20, x5, x16 adcs x22, x22, xzr umulh x21, x6, x16 mul x4, x4, x16 cmp x16, #0 mul x5, x5, x16 csel x22, x22, xzr, ne mul x6, x6, x16 adds x5, x5, x19 mul x24, x7, x16 adcs x6, x6, x20 adcs x24, x24, x21 adc x26, xzr, xzr ldp x8, x9, [x1,#8*0+112] // load |u| (or |v|) asr x14, x17, #63 // |f_|'s sign as mask (or |g_|'s) ldp x10, x11, [x1,#8*2+112] eor x17, x17, x14 // conditionally negate |f_| (or |g_|) ldr x23, [x1,#8*4+112] eor x8, x8, x14 // conditionally negate |u| (or |v|) sub x17, x17, x14 eor x9, x9, x14 adds x8, x8, x14, lsr#63 eor x10, x10, x14 adcs x9, x9, xzr eor x11, x11, x14 adcs x10, x10, xzr eor x23, x23, x14 umulh x19, x8, x17 adcs x11, x11, xzr umulh x20, x9, x17 adcs x23, x23, xzr umulh x21, x10, x17 adc x15, xzr, xzr // used in __smul_512x63_tail mul x8, x8, x17 cmp x17, #0 mul x9, x9, x17 csel x23, x23, xzr, ne mul x10, x10, x17 adds x9, x9, x19 mul x25, x11, x17 adcs x10, x10, x20 adcs x25, x25, x21 adc x26, x26, xzr adds x4, x4, x8 adcs x5, x5, x9 adcs x6, x6, x10 stp x4, x5, [x0,#8*0] adcs x24, x24, x25 stp x6, x24, [x0,#8*2] ret .def __smul_512x63_tail; .type 32; .endef .p2align 5 __smul_512x63_tail: umulh x24, x7, x16 ldr x5, [x1,#8*19] // load rest of |v| adc x26, x26, xzr ldp x6, x7, [x1,#8*20] and x22, x22, x16 umulh x11, x11, x17 // resume |v|*|g1| chain sub x24, x24, x22 // tie up |u|*|f1| chain asr x25, x24, #63 eor x5, x5, x14 // conditionally negate rest of |v| eor x6, x6, x14 
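// x15 below is the carry saved by __smul_256x63 ("used in
// __smul_512x63_tail"); adding it here completes the conditional
// two's-complement negation of |v| across the 256-bit boundary.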
adds x5, x5, x15 eor x7, x7, x14 adcs x6, x6, xzr umulh x19, x23, x17 adc x7, x7, xzr umulh x20, x5, x17 add x11, x11, x26 umulh x21, x6, x17 mul x4, x23, x17 mul x5, x5, x17 adds x4, x4, x11 mul x6, x6, x17 adcs x5, x5, x19 mul x22, x7, x17 adcs x6, x6, x20 adcs x22, x22, x21 adc x23, xzr, xzr // used in the final step adds x4, x4, x24 adcs x5, x5, x25 adcs x6, x6, x25 stp x4, x5, [x0,#8*4] adcs x22, x22, x25 // carry is used in the final step stp x6, x22, [x0,#8*6] ret .def __smul_256_n_shift_by_31; .type 32; .endef .p2align 5 __smul_256_n_shift_by_31: ldp x4, x5, [x1,#8*0+0] // load |a| (or |b|) asr x24, x12, #63 // |f0|'s sign as mask (or |g0|'s) ldp x6, x7, [x1,#8*2+0] eor x25, x12, x24 // conditionally negate |f0| (or |g0|) eor x4, x4, x24 // conditionally negate |a| (or |b|) sub x25, x25, x24 eor x5, x5, x24 adds x4, x4, x24, lsr#63 eor x6, x6, x24 adcs x5, x5, xzr eor x7, x7, x24 umulh x19, x4, x25 adcs x6, x6, xzr umulh x20, x5, x25 adc x7, x7, xzr umulh x21, x6, x25 and x24, x24, x25 umulh x22, x7, x25 neg x24, x24 mul x4, x4, x25 mul x5, x5, x25 mul x6, x6, x25 adds x5, x5, x19 mul x7, x7, x25 adcs x6, x6, x20 adcs x7, x7, x21 adc x22, x22, x24 ldp x8, x9, [x1,#8*0+32] // load |a| (or |b|) asr x24, x13, #63 // |f0|'s sign as mask (or |g0|'s) ldp x10, x11, [x1,#8*2+32] eor x25, x13, x24 // conditionally negate |f0| (or |g0|) eor x8, x8, x24 // conditionally negate |a| (or |b|) sub x25, x25, x24 eor x9, x9, x24 adds x8, x8, x24, lsr#63 eor x10, x10, x24 adcs x9, x9, xzr eor x11, x11, x24 umulh x19, x8, x25 adcs x10, x10, xzr umulh x20, x9, x25 adc x11, x11, xzr umulh x21, x10, x25 and x24, x24, x25 umulh x23, x11, x25 neg x24, x24 mul x8, x8, x25 mul x9, x9, x25 mul x10, x10, x25 adds x9, x9, x19 mul x11, x11, x25 adcs x10, x10, x20 adcs x11, x11, x21 adc x23, x23, x24 adds x4, x4, x8 adcs x5, x5, x9 adcs x6, x6, x10 adcs x7, x7, x11 adc x8, x22, x23 extr x4, x5, x4, #31 extr x5, x6, x5, #31 extr x6, x7, x6, #31 asr x23, x8, #63 // result's sign as mask extr x7, x8, x7, #31 eor x4, x4, x23 // ensure the result is positive eor x5, x5, x23 adds x4, x4, x23, lsr#63 eor x6, x6, x23 adcs x5, x5, xzr eor x7, x7, x23 adcs x6, x6, xzr stp x4, x5, [x0,#8*0] adc x7, x7, xzr stp x6, x7, [x0,#8*2] eor x12, x12, x23 // adjust |f/g| accordingly eor x13, x13, x23 sub x12, x12, x23 sub x13, x13, x23 ret .def __ab_approximation_31_256; .type 32; .endef .p2align 4 __ab_approximation_31_256: ldp x6, x7, [x1,#8*2] ldp x10, x11, [x1,#8*6] ldp x4, x5, [x1,#8*0] ldp x8, x9, [x1,#8*4] .Lab_approximation_31_256_loaded: orr x19, x7, x11 // check top-most limbs, ... cmp x19, #0 csel x7, x7, x6, ne csel x11, x11, x10, ne csel x6, x6, x5, ne orr x19, x7, x11 // and ones before top-most, ... csel x10, x10, x9, ne cmp x19, #0 csel x7, x7, x6, ne csel x11, x11, x10, ne csel x6, x6, x4, ne orr x19, x7, x11 // and one more, ... 
csel x10, x10, x8, ne clz x19, x19 cmp x19, #64 csel x19, x19, xzr, ne csel x7, x7, x6, ne csel x11, x11, x10, ne neg x20, x19 lslv x7, x7, x19 // align high limbs to the left lslv x11, x11, x19 lsrv x6, x6, x20 lsrv x10, x10, x20 and x6, x6, x20, asr#6 and x10, x10, x20, asr#6 orr x7, x7, x6 orr x11, x11, x10 bfxil x7, x4, #0, #31 bfxil x11, x8, #0, #31 b __inner_loop_31_256 ret .def __inner_loop_31_256; .type 32; .endef .p2align 4 __inner_loop_31_256: mov x2, #31 mov x13, #0x7FFFFFFF80000000 // |f0|=1, |g0|=0 mov x15, #0x800000007FFFFFFF // |f1|=0, |g1|=1 mov x23,#0x7FFFFFFF7FFFFFFF .Loop_31_256: sbfx x22, x7, #0, #1 // if |a_| is odd, then we'll be subtracting sub x2, x2, #1 and x19, x11, x22 sub x20, x11, x7 // |b_|-|a_| subs x21, x7, x19 // |a_|-|b_| (or |a_|-0 if |a_| was even) mov x19, x15 csel x11, x11, x7, hs // |b_| = |a_| csel x7, x21, x20, hs // borrow means |a_|<|b_|, replace with |b_|-|a_| csel x15, x15, x13, hs // exchange |fg0| and |fg1| csel x13, x13, x19, hs lsr x7, x7, #1 and x19, x15, x22 and x20, x23, x22 sub x13, x13, x19 // |f0|-=|f1| (or |f0-=0| if |a_| was even) add x15, x15, x15 // |f1|<<=1 add x13, x13, x20 sub x15, x15, x23 cbnz x2, .Loop_31_256 mov x23, #0x7FFFFFFF ubfx x12, x13, #0, #32 ubfx x13, x13, #32, #32 ubfx x14, x15, #0, #32 ubfx x15, x15, #32, #32 sub x12, x12, x23 // remove bias sub x13, x13, x23 sub x14, x14, x23 sub x15, x15, x23 ret .def __inner_loop_62_256; .type 32; .endef .p2align 4 __inner_loop_62_256: mov x12, #1 // |f0|=1 mov x13, #0 // |g0|=0 mov x14, #0 // |f1|=0 mov x15, #1 // |g1|=1 .Loop_62_256: sbfx x22, x7, #0, #1 // if |a_| is odd, then we'll be subtracting sub x2, x2, #1 and x19, x11, x22 sub x20, x11, x7 // |b_|-|a_| subs x21, x7, x19 // |a_|-|b_| (or |a_|-0 if |a_| was even) mov x19, x12 csel x11, x11, x7, hs // |b_| = |a_| csel x7, x21, x20, hs // borrow means |a_|<|b_|, replace with |b_|-|a_| mov x20, x13 csel x12, x12, x14, hs // exchange |f0| and |f1| csel x14, x14, x19, hs csel x13, x13, x15, hs // exchange |g0| and |g1| csel x15, x15, x20, hs lsr x7, x7, #1 and x19, x14, x22 and x20, x15, x22 add x14, x14, x14 // |f1|<<=1 add x15, x15, x15 // |g1|<<=1 sub x12, x12, x19 // |f0|-=|f1| (or |f0-=0| if |a_| was even) sub x13, x13, x20 // |g0|-=|g1| (or |g0-=0| ...) 
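// note: every update in this loop is a branch-free csel/mask, so the
// iteration count alone (not the secret |a|/|b| values) determines
// the execution trace: the "ct" in ct_inverse_mod_256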
cbnz x2, .Loop_62_256 ret ================================================ FILE: build/coff/ct_inverse_mod_256-x86_64.s ================================================ .text .globl ct_inverse_mod_256 .def ct_inverse_mod_256; .scl 2; .type 32; .endef .p2align 5 ct_inverse_mod_256: .byte 0xf3,0x0f,0x1e,0xfa movq %rdi,8(%rsp) movq %rsi,16(%rsp) movq %rsp,%r11 .LSEH_begin_ct_inverse_mod_256: pushq %rbp movq %rcx,%rdi movq %rdx,%rsi movq %r8,%rdx movq %r9,%rcx pushq %rbx pushq %r12 pushq %r13 pushq %r14 pushq %r15 subq $1072,%rsp .LSEH_body_ct_inverse_mod_256: leaq 48+511(%rsp),%rax andq $-512,%rax movq %rdi,32(%rsp) movq %rcx,40(%rsp) #ifdef __SGX_LVI_HARDENING__ lfence #endif movq 0(%rsi),%r8 movq 8(%rsi),%r9 movq 16(%rsi),%r10 movq 24(%rsi),%r11 movq 0(%rdx),%r12 movq 8(%rdx),%r13 movq 16(%rdx),%r14 movq 24(%rdx),%r15 movq %r8,0(%rax) movq %r9,8(%rax) movq %r10,16(%rax) movq %r11,24(%rax) movq %r12,32(%rax) movq %r13,40(%rax) movq %r14,48(%rax) movq %r15,56(%rax) movq %rax,%rsi movl $31,%edx call __ab_approximation_31_256 movq %r12,16(%rsp) movq %r13,24(%rsp) movq $256,%rdi xorq %rsi,%rdi call __smulq_256_n_shift_by_31 movq %rdx,64(%rdi) movq 16(%rsp),%rdx movq 24(%rsp),%rcx leaq 32(%rdi),%rdi call __smulq_256_n_shift_by_31 movq %rdx,72(%rdi) xorq $256,%rsi movl $31,%edx call __ab_approximation_31_256 movq %r12,16(%rsp) movq %r13,24(%rsp) movq $256,%rdi xorq %rsi,%rdi call __smulq_256_n_shift_by_31 movq %rdx,0(%rsp) movq %rcx,8(%rsp) movq 16(%rsp),%rdx movq 24(%rsp),%rcx leaq 32(%rdi),%rdi call __smulq_256_n_shift_by_31 movq 64(%rsi),%r8 movq 104(%rsi),%r12 movq %r8,%r9 imulq 0(%rsp),%r8 movq %r12,%r13 imulq 8(%rsp),%r12 addq %r12,%r8 movq %r8,32(%rdi) sarq $63,%r8 movq %r8,40(%rdi) movq %r8,48(%rdi) movq %r8,56(%rdi) movq %r8,64(%rdi) leaq 64(%rsi),%rsi imulq %rdx,%r9 imulq %rcx,%r13 addq %r13,%r9 movq %r9,72(%rdi) sarq $63,%r9 movq %r9,80(%rdi) movq %r9,88(%rdi) movq %r9,96(%rdi) movq %r9,104(%rdi) xorq $256+64,%rsi movl $31,%edx call __ab_approximation_31_256 movq %r12,16(%rsp) movq %r13,24(%rsp) movq $256,%rdi xorq %rsi,%rdi call __smulq_256_n_shift_by_31 movq %rdx,0(%rsp) movq %rcx,8(%rsp) movq 16(%rsp),%rdx movq 24(%rsp),%rcx leaq 32(%rdi),%rdi call __smulq_256_n_shift_by_31 movq %rdx,16(%rsp) movq %rcx,24(%rsp) movq 0(%rsp),%rdx movq 8(%rsp),%rcx leaq 64(%rsi),%rsi leaq 32(%rdi),%rdi call __smulq_256x63 movq 16(%rsp),%rdx movq 24(%rsp),%rcx leaq 40(%rdi),%rdi call __smulq_256x63 xorq $256+64,%rsi movl $31,%edx call __ab_approximation_31_256 movq %r12,16(%rsp) movq %r13,24(%rsp) movq $256,%rdi xorq %rsi,%rdi call __smulq_256_n_shift_by_31 movq %rdx,0(%rsp) movq %rcx,8(%rsp) movq 16(%rsp),%rdx movq 24(%rsp),%rcx leaq 32(%rdi),%rdi call __smulq_256_n_shift_by_31 movq %rdx,16(%rsp) movq %rcx,24(%rsp) movq 0(%rsp),%rdx movq 8(%rsp),%rcx leaq 64(%rsi),%rsi leaq 32(%rdi),%rdi call __smulq_256x63 movq 16(%rsp),%rdx movq 24(%rsp),%rcx leaq 40(%rdi),%rdi call __smulq_256x63 xorq $256+64,%rsi movl $31,%edx call __ab_approximation_31_256 movq %r12,16(%rsp) movq %r13,24(%rsp) movq $256,%rdi xorq %rsi,%rdi call __smulq_256_n_shift_by_31 movq %rdx,0(%rsp) movq %rcx,8(%rsp) movq 16(%rsp),%rdx movq 24(%rsp),%rcx leaq 32(%rdi),%rdi call __smulq_256_n_shift_by_31 movq %rdx,16(%rsp) movq %rcx,24(%rsp) movq 0(%rsp),%rdx movq 8(%rsp),%rcx leaq 64(%rsi),%rsi leaq 32(%rdi),%rdi call __smulq_256x63 movq 16(%rsp),%rdx movq 24(%rsp),%rcx leaq 40(%rdi),%rdi call __smulq_256x63 xorq $256+64,%rsi movl $31,%edx call __ab_approximation_31_256 movq %r12,16(%rsp) movq %r13,24(%rsp) movq $256,%rdi xorq %rsi,%rdi call 
__smulq_256_n_shift_by_31 movq %rdx,0(%rsp) movq %rcx,8(%rsp) movq 16(%rsp),%rdx movq 24(%rsp),%rcx leaq 32(%rdi),%rdi call __smulq_256_n_shift_by_31 movq %rdx,16(%rsp) movq %rcx,24(%rsp) movq 0(%rsp),%rdx movq 8(%rsp),%rcx leaq 64(%rsi),%rsi leaq 32(%rdi),%rdi call __smulq_256x63 movq 16(%rsp),%rdx movq 24(%rsp),%rcx leaq 40(%rdi),%rdi call __smulq_256x63 xorq $256+64,%rsi movl $31,%edx call __ab_approximation_31_256 movq %r12,16(%rsp) movq %r13,24(%rsp) movq $256,%rdi xorq %rsi,%rdi call __smulq_256_n_shift_by_31 movq %rdx,0(%rsp) movq %rcx,8(%rsp) movq 16(%rsp),%rdx movq 24(%rsp),%rcx leaq 32(%rdi),%rdi call __smulq_256_n_shift_by_31 movq %rdx,16(%rsp) movq %rcx,24(%rsp) movq 0(%rsp),%rdx movq 8(%rsp),%rcx leaq 64(%rsi),%rsi leaq 32(%rdi),%rdi call __smulq_256x63 movq 16(%rsp),%rdx movq 24(%rsp),%rcx leaq 40(%rdi),%rdi call __smulq_256x63 xorq $256+64,%rsi movl $31,%edx call __ab_approximation_31_256 movq %r12,16(%rsp) movq %r13,24(%rsp) movq $256,%rdi xorq %rsi,%rdi call __smulq_256_n_shift_by_31 movq %rdx,0(%rsp) movq %rcx,8(%rsp) movq 16(%rsp),%rdx movq 24(%rsp),%rcx leaq 32(%rdi),%rdi call __smulq_256_n_shift_by_31 movq %rdx,16(%rsp) movq %rcx,24(%rsp) movq 0(%rsp),%rdx movq 8(%rsp),%rcx leaq 64(%rsi),%rsi leaq 32(%rdi),%rdi call __smulq_256x63 movq 16(%rsp),%rdx movq 24(%rsp),%rcx leaq 40(%rdi),%rdi call __smulq_256x63 xorq $256+64,%rsi movl $31,%edx call __ab_approximation_31_256 movq %r12,16(%rsp) movq %r13,24(%rsp) movq $256,%rdi xorq %rsi,%rdi call __smulq_256_n_shift_by_31 movq %rdx,0(%rsp) movq %rcx,8(%rsp) movq 16(%rsp),%rdx movq 24(%rsp),%rcx leaq 32(%rdi),%rdi call __smulq_256_n_shift_by_31 movq %rdx,16(%rsp) movq %rcx,24(%rsp) movq 0(%rsp),%rdx movq 8(%rsp),%rcx leaq 64(%rsi),%rsi leaq 32(%rdi),%rdi call __smulq_256x63 movq 16(%rsp),%rdx movq 24(%rsp),%rcx leaq 40(%rdi),%rdi call __smulq_256x63 sarq $63,%rbp movq %rbp,40(%rdi) movq %rbp,48(%rdi) movq %rbp,56(%rdi) xorq $256+64,%rsi movl $31,%edx call __ab_approximation_31_256 movq %r12,16(%rsp) movq %r13,24(%rsp) movq $256,%rdi xorq %rsi,%rdi call __smulq_256_n_shift_by_31 movq %rdx,0(%rsp) movq %rcx,8(%rsp) movq 16(%rsp),%rdx movq 24(%rsp),%rcx leaq 32(%rdi),%rdi call __smulq_256_n_shift_by_31 movq %rdx,16(%rsp) movq %rcx,24(%rsp) movq 0(%rsp),%rdx movq 8(%rsp),%rcx leaq 64(%rsi),%rsi leaq 32(%rdi),%rdi call __smulq_256x63 movq 16(%rsp),%rdx movq 24(%rsp),%rcx leaq 40(%rdi),%rdi call __smulq_512x63 xorq $256+64,%rsi movl $31,%edx call __ab_approximation_31_256 movq %r12,16(%rsp) movq %r13,24(%rsp) movq $256,%rdi xorq %rsi,%rdi call __smulq_256_n_shift_by_31 movq %rdx,0(%rsp) movq %rcx,8(%rsp) movq 16(%rsp),%rdx movq 24(%rsp),%rcx leaq 32(%rdi),%rdi call __smulq_256_n_shift_by_31 movq %rdx,16(%rsp) movq %rcx,24(%rsp) movq 0(%rsp),%rdx movq 8(%rsp),%rcx leaq 64(%rsi),%rsi leaq 32(%rdi),%rdi call __smulq_256x63 movq 16(%rsp),%rdx movq 24(%rsp),%rcx leaq 40(%rdi),%rdi call __smulq_512x63 xorq $256+64,%rsi movl $31,%edx call __ab_approximation_31_256 movq %r12,16(%rsp) movq %r13,24(%rsp) movq $256,%rdi xorq %rsi,%rdi call __smulq_256_n_shift_by_31 movq %rdx,0(%rsp) movq %rcx,8(%rsp) movq 16(%rsp),%rdx movq 24(%rsp),%rcx leaq 32(%rdi),%rdi call __smulq_256_n_shift_by_31 movq %rdx,16(%rsp) movq %rcx,24(%rsp) movq 0(%rsp),%rdx movq 8(%rsp),%rcx leaq 64(%rsi),%rsi leaq 32(%rdi),%rdi call __smulq_256x63 movq 16(%rsp),%rdx movq 24(%rsp),%rcx leaq 40(%rdi),%rdi call __smulq_512x63 xorq $256+64,%rsi movl $31,%edx call __ab_approximation_31_256 movq %r12,16(%rsp) movq %r13,24(%rsp) movq $256,%rdi xorq %rsi,%rdi call 
__smulq_256_n_shift_by_31 movq %rdx,0(%rsp) movq %rcx,8(%rsp) movq 16(%rsp),%rdx movq 24(%rsp),%rcx leaq 32(%rdi),%rdi call __smulq_256_n_shift_by_31 movq %rdx,16(%rsp) movq %rcx,24(%rsp) movq 0(%rsp),%rdx movq 8(%rsp),%rcx leaq 64(%rsi),%rsi leaq 32(%rdi),%rdi call __smulq_256x63 movq 16(%rsp),%rdx movq 24(%rsp),%rcx leaq 40(%rdi),%rdi call __smulq_512x63 xorq $256+64,%rsi movl $31,%edx call __ab_approximation_31_256 movq %r12,16(%rsp) movq %r13,24(%rsp) movq $256,%rdi xorq %rsi,%rdi call __smulq_256_n_shift_by_31 movq %rdx,0(%rsp) movq %rcx,8(%rsp) movq 16(%rsp),%rdx movq 24(%rsp),%rcx leaq 32(%rdi),%rdi call __smulq_256_n_shift_by_31 movq %rdx,16(%rsp) movq %rcx,24(%rsp) movq 0(%rsp),%rdx movq 8(%rsp),%rcx leaq 64(%rsi),%rsi leaq 32(%rdi),%rdi call __smulq_256x63 movq 16(%rsp),%rdx movq 24(%rsp),%rcx leaq 40(%rdi),%rdi call __smulq_512x63 xorq $256+64,%rsi movl $31,%edx call __ab_approximation_31_256 movq %r12,16(%rsp) movq %r13,24(%rsp) movq $256,%rdi xorq %rsi,%rdi call __smulq_256_n_shift_by_31 movq %rdx,0(%rsp) movq %rcx,8(%rsp) movq 16(%rsp),%rdx movq 24(%rsp),%rcx leaq 32(%rdi),%rdi call __smulq_256_n_shift_by_31 movq %rdx,16(%rsp) movq %rcx,24(%rsp) movq 0(%rsp),%rdx movq 8(%rsp),%rcx leaq 64(%rsi),%rsi leaq 32(%rdi),%rdi call __smulq_256x63 movq 16(%rsp),%rdx movq 24(%rsp),%rcx leaq 40(%rdi),%rdi call __smulq_512x63 xorq $256+64,%rsi movl $47,%edx movq 0(%rsi),%r8 movq 32(%rsi),%r10 call __inner_loop_62_256 leaq 64(%rsi),%rsi movq %r12,%rdx movq %r13,%rcx movq 32(%rsp),%rdi call __smulq_512x63 adcq %rbp,%rdx movq 40(%rsp),%rsi movq %rdx,%rax sarq $63,%rdx movq %rdx,%r8 movq %rdx,%r9 #ifdef __SGX_LVI_HARDENING__ lfence #endif andq 0(%rsi),%r8 movq %rdx,%r10 andq 8(%rsi),%r9 andq 16(%rsi),%r10 andq 24(%rsi),%rdx addq %r8,%r12 adcq %r9,%r13 adcq %r10,%r14 adcq %rdx,%r15 adcq $0,%rax movq %rax,%rdx negq %rax orq %rax,%rdx sarq $63,%rax movq %rdx,%r8 movq %rdx,%r9 andq 0(%rsi),%r8 movq %rdx,%r10 andq 8(%rsi),%r9 andq 16(%rsi),%r10 andq 24(%rsi),%rdx xorq %rax,%r8 xorq %rcx,%rcx xorq %rax,%r9 subq %rax,%rcx xorq %rax,%r10 xorq %rax,%rdx addq %rcx,%r8 adcq $0,%r9 adcq $0,%r10 adcq $0,%rdx addq %r8,%r12 adcq %r9,%r13 adcq %r10,%r14 adcq %rdx,%r15 movq %r12,32(%rdi) movq %r13,40(%rdi) movq %r14,48(%rdi) movq %r15,56(%rdi) leaq 1072(%rsp),%r8 movq 0(%r8),%r15 movq 8(%r8),%r14 movq 16(%r8),%r13 movq 24(%r8),%r12 movq 32(%r8),%rbx movq 40(%r8),%rbp leaq 48(%r8),%rsp .LSEH_epilogue_ct_inverse_mod_256: mov 8(%rsp),%rdi mov 16(%rsp),%rsi #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .LSEH_end_ct_inverse_mod_256: .def __smulq_512x63; .scl 3; .type 32; .endef .p2align 5 __smulq_512x63: .byte 0xf3,0x0f,0x1e,0xfa movq 0(%rsi),%r8 movq 8(%rsi),%r9 movq 16(%rsi),%r10 movq 24(%rsi),%r11 movq 32(%rsi),%rbp movq %rdx,%rbx sarq $63,%rdx xorq %rax,%rax subq %rdx,%rax xorq %rdx,%rbx addq %rax,%rbx xorq %rdx,%r8 xorq %rdx,%r9 xorq %rdx,%r10 xorq %rdx,%r11 xorq %rdx,%rbp addq %r8,%rax adcq $0,%r9 adcq $0,%r10 adcq $0,%r11 adcq $0,%rbp mulq %rbx movq %rax,0(%rdi) movq %r9,%rax movq %rdx,%r9 mulq %rbx addq %rax,%r9 movq %r10,%rax adcq $0,%rdx movq %r9,8(%rdi) movq %rdx,%r10 mulq %rbx addq %rax,%r10 movq %r11,%rax adcq $0,%rdx movq %r10,16(%rdi) movq %rdx,%r11 andq %rbx,%rbp negq %rbp mulq %rbx addq %rax,%r11 adcq %rdx,%rbp movq %r11,24(%rdi) movq 40(%rsi),%r8 movq 48(%rsi),%r9 movq 56(%rsi),%r10 movq 64(%rsi),%r11 movq 72(%rsi),%r12 movq 80(%rsi),%r13 movq 88(%rsi),%r14 movq 96(%rsi),%r15 movq %rcx,%rdx sarq $63,%rdx xorq %rax,%rax subq %rdx,%rax xorq %rdx,%rcx addq 
%rax,%rcx xorq %rdx,%r8 xorq %rdx,%r9 xorq %rdx,%r10 xorq %rdx,%r11 xorq %rdx,%r12 xorq %rdx,%r13 xorq %rdx,%r14 xorq %rdx,%r15 addq %r8,%rax adcq $0,%r9 adcq $0,%r10 adcq $0,%r11 adcq $0,%r12 adcq $0,%r13 adcq $0,%r14 adcq $0,%r15 mulq %rcx movq %rax,%r8 movq %r9,%rax movq %rdx,%r9 mulq %rcx addq %rax,%r9 movq %r10,%rax adcq $0,%rdx movq %rdx,%r10 mulq %rcx addq %rax,%r10 movq %r11,%rax adcq $0,%rdx movq %rdx,%r11 mulq %rcx addq %rax,%r11 movq %r12,%rax adcq $0,%rdx movq %rdx,%r12 mulq %rcx addq %rax,%r12 movq %r13,%rax adcq $0,%rdx movq %rdx,%r13 mulq %rcx addq %rax,%r13 movq %r14,%rax adcq $0,%rdx movq %rdx,%r14 mulq %rcx addq %rax,%r14 movq %r15,%rax adcq $0,%rdx movq %rdx,%r15 imulq %rcx addq %rax,%r15 adcq $0,%rdx movq %rbp,%rbx sarq $63,%rbp addq 0(%rdi),%r8 adcq 8(%rdi),%r9 adcq 16(%rdi),%r10 adcq 24(%rdi),%r11 adcq %rbx,%r12 adcq %rbp,%r13 adcq %rbp,%r14 adcq %rbp,%r15 movq %r8,0(%rdi) movq %r9,8(%rdi) movq %r10,16(%rdi) movq %r11,24(%rdi) movq %r12,32(%rdi) movq %r13,40(%rdi) movq %r14,48(%rdi) movq %r15,56(%rdi) #ifdef __SGX_LVI_HARDENING__ popq %r8 lfence jmpq *%r8 ud2 #else .byte 0xf3,0xc3 #endif .def __smulq_256x63; .scl 3; .type 32; .endef .p2align 5 __smulq_256x63: .byte 0xf3,0x0f,0x1e,0xfa movq 0+0(%rsi),%r8 movq 0+8(%rsi),%r9 movq 0+16(%rsi),%r10 movq 0+24(%rsi),%r11 movq 0+32(%rsi),%rbp movq %rdx,%rbx sarq $63,%rdx xorq %rax,%rax subq %rdx,%rax xorq %rdx,%rbx addq %rax,%rbx xorq %rdx,%r8 xorq %rdx,%r9 xorq %rdx,%r10 xorq %rdx,%r11 xorq %rdx,%rbp addq %r8,%rax adcq $0,%r9 adcq $0,%r10 adcq $0,%r11 adcq $0,%rbp mulq %rbx movq %rax,%r8 movq %r9,%rax movq %rdx,%r9 mulq %rbx addq %rax,%r9 movq %r10,%rax adcq $0,%rdx movq %rdx,%r10 mulq %rbx addq %rax,%r10 movq %r11,%rax adcq $0,%rdx movq %rdx,%r11 andq %rbx,%rbp negq %rbp mulq %rbx addq %rax,%r11 adcq %rdx,%rbp movq %rcx,%rdx movq 40+0(%rsi),%r12 movq 40+8(%rsi),%r13 movq 40+16(%rsi),%r14 movq 40+24(%rsi),%r15 movq 40+32(%rsi),%rcx movq %rdx,%rbx sarq $63,%rdx xorq %rax,%rax subq %rdx,%rax xorq %rdx,%rbx addq %rax,%rbx xorq %rdx,%r12 xorq %rdx,%r13 xorq %rdx,%r14 xorq %rdx,%r15 xorq %rdx,%rcx addq %r12,%rax adcq $0,%r13 adcq $0,%r14 adcq $0,%r15 adcq $0,%rcx mulq %rbx movq %rax,%r12 movq %r13,%rax movq %rdx,%r13 mulq %rbx addq %rax,%r13 movq %r14,%rax adcq $0,%rdx movq %rdx,%r14 mulq %rbx addq %rax,%r14 movq %r15,%rax adcq $0,%rdx movq %rdx,%r15 andq %rbx,%rcx negq %rcx mulq %rbx addq %rax,%r15 adcq %rdx,%rcx addq %r12,%r8 adcq %r13,%r9 adcq %r14,%r10 adcq %r15,%r11 adcq %rcx,%rbp movq %r8,0(%rdi) movq %r9,8(%rdi) movq %r10,16(%rdi) movq %r11,24(%rdi) movq %rbp,32(%rdi) #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .def __smulq_256_n_shift_by_31; .scl 3; .type 32; .endef .p2align 5 __smulq_256_n_shift_by_31: .byte 0xf3,0x0f,0x1e,0xfa movq %rdx,0(%rdi) movq %rcx,8(%rdi) movq %rdx,%rbp movq 0+0(%rsi),%r8 movq 0+8(%rsi),%r9 movq 0+16(%rsi),%r10 movq 0+24(%rsi),%r11 movq %rbp,%rbx sarq $63,%rbp xorq %rax,%rax subq %rbp,%rax xorq %rbp,%rbx addq %rax,%rbx xorq %rbp,%r8 xorq %rbp,%r9 xorq %rbp,%r10 xorq %rbp,%r11 addq %r8,%rax adcq $0,%r9 adcq $0,%r10 adcq $0,%r11 mulq %rbx movq %rax,%r8 movq %r9,%rax andq %rbx,%rbp negq %rbp movq %rdx,%r9 mulq %rbx addq %rax,%r9 movq %r10,%rax adcq $0,%rdx movq %rdx,%r10 mulq %rbx addq %rax,%r10 movq %r11,%rax adcq $0,%rdx movq %rdx,%r11 mulq %rbx addq %rax,%r11 adcq %rdx,%rbp movq 32+0(%rsi),%r12 movq 32+8(%rsi),%r13 movq 32+16(%rsi),%r14 movq 32+24(%rsi),%r15 movq %rcx,%rbx sarq $63,%rcx xorq %rax,%rax subq %rcx,%rax xorq %rcx,%rbx addq %rax,%rbx xorq 
%rcx,%r12 xorq %rcx,%r13 xorq %rcx,%r14 xorq %rcx,%r15 addq %r12,%rax adcq $0,%r13 adcq $0,%r14 adcq $0,%r15 mulq %rbx movq %rax,%r12 movq %r13,%rax andq %rbx,%rcx negq %rcx movq %rdx,%r13 mulq %rbx addq %rax,%r13 movq %r14,%rax adcq $0,%rdx movq %rdx,%r14 mulq %rbx addq %rax,%r14 movq %r15,%rax adcq $0,%rdx movq %rdx,%r15 mulq %rbx addq %rax,%r15 adcq %rdx,%rcx addq %r12,%r8 adcq %r13,%r9 adcq %r14,%r10 adcq %r15,%r11 adcq %rcx,%rbp movq 0(%rdi),%rdx movq 8(%rdi),%rcx shrdq $31,%r9,%r8 shrdq $31,%r10,%r9 shrdq $31,%r11,%r10 shrdq $31,%rbp,%r11 sarq $63,%rbp xorq %rax,%rax subq %rbp,%rax xorq %rbp,%r8 xorq %rbp,%r9 xorq %rbp,%r10 xorq %rbp,%r11 addq %rax,%r8 adcq $0,%r9 adcq $0,%r10 adcq $0,%r11 movq %r8,0(%rdi) movq %r9,8(%rdi) movq %r10,16(%rdi) movq %r11,24(%rdi) xorq %rbp,%rdx xorq %rbp,%rcx addq %rax,%rdx addq %rax,%rcx #ifdef __SGX_LVI_HARDENING__ popq %r8 lfence jmpq *%r8 ud2 #else .byte 0xf3,0xc3 #endif .def __ab_approximation_31_256; .scl 3; .type 32; .endef .p2align 5 __ab_approximation_31_256: .byte 0xf3,0x0f,0x1e,0xfa movq 24(%rsi),%r9 movq 56(%rsi),%r11 movq 16(%rsi),%rbx movq 48(%rsi),%rbp movq 8(%rsi),%r8 movq 40(%rsi),%r10 movq %r9,%rax orq %r11,%rax cmovzq %rbx,%r9 cmovzq %rbp,%r11 cmovzq %r8,%rbx movq 0(%rsi),%r8 cmovzq %r10,%rbp movq 32(%rsi),%r10 movq %r9,%rax orq %r11,%rax cmovzq %rbx,%r9 cmovzq %rbp,%r11 cmovzq %r8,%rbx cmovzq %r10,%rbp movq %r9,%rax orq %r11,%rax bsrq %rax,%rcx leaq 1(%rcx),%rcx cmovzq %r8,%r9 cmovzq %r10,%r11 cmovzq %rax,%rcx negq %rcx shldq %cl,%rbx,%r9 shldq %cl,%rbp,%r11 movl $0x7FFFFFFF,%eax andq %rax,%r8 andq %rax,%r10 notq %rax andq %rax,%r9 andq %rax,%r11 orq %r9,%r8 orq %r11,%r10 jmp __inner_loop_31_256 #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .def __inner_loop_31_256; .scl 3; .type 32; .endef .p2align 5 __inner_loop_31_256: .byte 0xf3,0x0f,0x1e,0xfa movq $0x7FFFFFFF80000000,%rcx movq $0x800000007FFFFFFF,%r13 movq $0x7FFFFFFF7FFFFFFF,%r15 .Loop_31_256: cmpq %r10,%r8 movq %r8,%rax movq %r10,%rbx movq %rcx,%rbp movq %r13,%r14 cmovbq %r10,%r8 cmovbq %rax,%r10 cmovbq %r13,%rcx cmovbq %rbp,%r13 subq %r10,%r8 subq %r13,%rcx addq %r15,%rcx testq $1,%rax cmovzq %rax,%r8 cmovzq %rbx,%r10 cmovzq %rbp,%rcx cmovzq %r14,%r13 shrq $1,%r8 addq %r13,%r13 subq %r15,%r13 subl $1,%edx jnz .Loop_31_256 shrq $32,%r15 movl %ecx,%edx movl %r13d,%r12d shrq $32,%rcx shrq $32,%r13 subq %r15,%rdx subq %r15,%rcx subq %r15,%r12 subq %r15,%r13 #ifdef __SGX_LVI_HARDENING__ popq %r8 lfence jmpq *%r8 ud2 #else .byte 0xf3,0xc3 #endif .def __inner_loop_62_256; .scl 3; .type 32; .endef .p2align 5 __inner_loop_62_256: .byte 0xf3,0x0f,0x1e,0xfa movl %edx,%r15d movq $1,%rdx xorq %rcx,%rcx xorq %r12,%r12 movq %rdx,%r13 movq %rdx,%r14 .Loop_62_256: xorq %rax,%rax testq %r14,%r8 movq %r10,%rbx cmovnzq %r10,%rax subq %r8,%rbx movq %r8,%rbp subq %rax,%r8 cmovcq %rbx,%r8 cmovcq %rbp,%r10 movq %rdx,%rax cmovcq %r12,%rdx cmovcq %rax,%r12 movq %rcx,%rbx cmovcq %r13,%rcx cmovcq %rbx,%r13 xorq %rax,%rax xorq %rbx,%rbx shrq $1,%r8 testq %r14,%rbp cmovnzq %r12,%rax cmovnzq %r13,%rbx addq %r12,%r12 addq %r13,%r13 subq %rax,%rdx subq %rbx,%rcx subl $1,%r15d jnz .Loop_62_256 #ifdef __SGX_LVI_HARDENING__ popq %r8 lfence jmpq *%r8 ud2 #else .byte 0xf3,0xc3 #endif .section .pdata .p2align 2 .rva .LSEH_begin_ct_inverse_mod_256 .rva .LSEH_body_ct_inverse_mod_256 .rva .LSEH_info_ct_inverse_mod_256_prologue .rva .LSEH_body_ct_inverse_mod_256 .rva .LSEH_epilogue_ct_inverse_mod_256 .rva .LSEH_info_ct_inverse_mod_256_body .rva 
.LSEH_epilogue_ct_inverse_mod_256 .rva .LSEH_end_ct_inverse_mod_256 .rva .LSEH_info_ct_inverse_mod_256_epilogue .section .xdata .p2align 3 .LSEH_info_ct_inverse_mod_256_prologue: .byte 1,0,5,0x0b .byte 0,0x74,1,0 .byte 0,0x64,2,0 .byte 0,0xb3 .byte 0,0 .long 0,0 .LSEH_info_ct_inverse_mod_256_body: .byte 1,0,18,0 .byte 0x00,0xf4,0x86,0x00 .byte 0x00,0xe4,0x87,0x00 .byte 0x00,0xd4,0x88,0x00 .byte 0x00,0xc4,0x89,0x00 .byte 0x00,0x34,0x8a,0x00 .byte 0x00,0x54,0x8b,0x00 .byte 0x00,0x74,0x8d,0x00 .byte 0x00,0x64,0x8e,0x00 .byte 0x00,0x01,0x8c,0x00 .byte 0x00,0x00,0x00,0x00 .byte 0x00,0x00,0x00,0x00 .LSEH_info_ct_inverse_mod_256_epilogue: .byte 1,0,4,0 .byte 0x00,0x74,0x01,0x00 .byte 0x00,0x64,0x02,0x00 .byte 0x00,0x00,0x00,0x00 ================================================ FILE: build/coff/ct_inverse_mod_384-armv8.S ================================================ .text .globl ct_inverse_mod_384 .def ct_inverse_mod_384; .type 32; .endef .p2align 5 ct_inverse_mod_384: hint #25 stp x29, x30, [sp,#-16*__SIZEOF_POINTER__]! add x29, sp, #0 stp x19, x20, [sp,#2*__SIZEOF_POINTER__] stp x21, x22, [sp,#4*__SIZEOF_POINTER__] stp x23, x24, [sp,#6*__SIZEOF_POINTER__] stp x25, x26, [sp,#8*__SIZEOF_POINTER__] stp x27, x28, [sp,#10*__SIZEOF_POINTER__] sub sp, sp, #1056 ldp x22, x4, [x1,#8*0] ldp x5, x6, [x1,#8*2] ldp x7, x8, [x1,#8*4] #ifdef __CHERI_PURE_CAPABILITY__ add x1,sp,#32+511 alignd c1,c1,#9 scbnds c1,c1,#512 #else add x1, sp, #32+511 // find closest 512-byte-aligned spot and x1, x1, #-512 // in the frame... #endif stp x0, x3, [sp] // offload out_ptr, nx_ptr ldp x9, x10, [x2,#8*0] ldp x11, x12, [x2,#8*2] ldp x13, x14, [x2,#8*4] stp x22, x4, [x1,#8*0] // copy input to |a| stp x5, x6, [x1,#8*2] stp x7, x8, [x1,#8*4] stp x9, x10, [x1,#8*6] // copy modulus to |b| stp x11, x12, [x1,#8*8] stp x13, x14, [x1,#8*10] ////////////////////////////////////////// first iteration mov x2, #62 bl .Lab_approximation_62_loaded eor x0, x1, #256 // pointer to dst |a|b|u|v| #ifdef __CHERI_PURE_CAPABILITY__ scvalue c0,c1,x0 #endif bl __smul_384_n_shift_by_62 str x15,[x0,#8*12] // initialize |u| with |f0| mov x15, x17 // |f1| mov x16, x19 // |g1| add x0,x0,#8*6 bl __smul_384_n_shift_by_62 str x15, [x0,#8*14] // initialize |v| with |f1| ////////////////////////////////////////// second iteration eor x1, x1, #256 // flip-flop src |a|b|u|v| #ifdef __CHERI_PURE_CAPABILITY__ scvalue c1,c0,x1 #endif mov x2, #62 bl __ab_approximation_62 eor x0, x1, #256 // pointer to dst |a|b|u|v| #ifdef __CHERI_PURE_CAPABILITY__ scvalue c0,c1,x0 #endif bl __smul_384_n_shift_by_62 mov x20, x15 // corrected |f0| mov x21, x16 // corrected |g0| mov x15, x17 // |f1| mov x16, x19 // |g1| add x0,x0,#8*6 bl __smul_384_n_shift_by_62 ldr x7, [x1,#8*12] // |u| ldr x8, [x1,#8*20] // |v| mul x3, x20, x7 // |u|*|f0| smulh x4, x20, x7 mul x5, x21, x8 // |v|*|g0| smulh x6, x21, x8 adds x3, x3, x5 adc x4, x4, x6 stp x3, x4, [x0,#8*6] asr x5, x4, #63 // sign extension stp x5, x5, [x0,#8*8] stp x5, x5, [x0,#8*10] mul x3, x15, x7 // |u|*|f1| smulh x4, x15, x7 mul x5, x16, x8 // |v|*|g1| smulh x6, x16, x8 adds x3, x3, x5 adc x4, x4, x6 stp x3, x4, [x0,#8*14] asr x5, x4, #63 // sign extension stp x5, x5, [x0,#8*16] stp x5, x5, [x0,#8*18] eor x1, x1, #256 // flip-flop src |a|b|u|v| #ifdef __CHERI_PURE_CAPABILITY__ scvalue c1,c0,x1 #endif mov x2, #62 bl __ab_approximation_62 eor x0, x1, #256 // pointer to dst |a|b|u|v| #ifdef __CHERI_PURE_CAPABILITY__ scvalue c0,c1,x0 #endif bl __smul_384_n_shift_by_62 mov x20, x15 // corrected |f0| mov x21, x16 // corrected |g0| mov 
x15, x17 // |f1| mov x16, x19 // |g1| add x0,x0,#8*6 bl __smul_384_n_shift_by_62 add x0,x0,#8*6 bl __smul_384x63 mov x20, x15 // corrected |f1| mov x21, x16 // corrected |g1| add x0,x0,#8*8 bl __smul_384x63 eor x1, x1, #256 // flip-flop src |a|b|u|v| #ifdef __CHERI_PURE_CAPABILITY__ scvalue c1,c0,x1 #endif mov x2, #62 bl __ab_approximation_62 eor x0, x1, #256 // pointer to dst |a|b|u|v| #ifdef __CHERI_PURE_CAPABILITY__ scvalue c0,c1,x0 #endif bl __smul_384_n_shift_by_62 mov x20, x15 // corrected |f0| mov x21, x16 // corrected |g0| mov x15, x17 // |f1| mov x16, x19 // |g1| add x0,x0,#8*6 bl __smul_384_n_shift_by_62 add x0,x0,#8*6 bl __smul_384x63 mov x20, x15 // corrected |f1| mov x21, x16 // corrected |g1| add x0,x0,#8*8 bl __smul_384x63 eor x1, x1, #256 // flip-flop src |a|b|u|v| #ifdef __CHERI_PURE_CAPABILITY__ scvalue c1,c0,x1 #endif mov x2, #62 bl __ab_approximation_62 eor x0, x1, #256 // pointer to dst |a|b|u|v| #ifdef __CHERI_PURE_CAPABILITY__ scvalue c0,c1,x0 #endif bl __smul_384_n_shift_by_62 mov x20, x15 // corrected |f0| mov x21, x16 // corrected |g0| mov x15, x17 // |f1| mov x16, x19 // |g1| add x0,x0,#8*6 bl __smul_384_n_shift_by_62 add x0,x0,#8*6 bl __smul_384x63 mov x20, x15 // corrected |f1| mov x21, x16 // corrected |g1| add x0,x0,#8*8 bl __smul_384x63 eor x1, x1, #256 // flip-flop src |a|b|u|v| #ifdef __CHERI_PURE_CAPABILITY__ scvalue c1,c0,x1 #endif mov x2, #62 bl __ab_approximation_62 eor x0, x1, #256 // pointer to dst |a|b|u|v| #ifdef __CHERI_PURE_CAPABILITY__ scvalue c0,c1,x0 #endif bl __smul_384_n_shift_by_62 mov x20, x15 // corrected |f0| mov x21, x16 // corrected |g0| mov x15, x17 // |f1| mov x16, x19 // |g1| add x0,x0,#8*6 bl __smul_384_n_shift_by_62 add x0,x0,#8*6 bl __smul_384x63 asr x27, x27, #63 str x27, [x0,#8*6] mov x20, x15 // corrected |f1| mov x21, x16 // corrected |g1| add x0,x0,#8*8 bl __smul_384x63 asr x27, x27, #63 // sign extension stp x27, x27, [x0,#8*6] stp x27, x27, [x0,#8*8] stp x27, x27, [x0,#8*10] eor x1, x1, #256 // flip-flop src |a|b|u|v| #ifdef __CHERI_PURE_CAPABILITY__ scvalue c1,c0,x1 #endif mov x2, #62 bl __ab_approximation_62 eor x0, x1, #256 // pointer to dst |a|b|u|v| #ifdef __CHERI_PURE_CAPABILITY__ scvalue c0,c1,x0 #endif bl __smul_384_n_shift_by_62 mov x20, x15 // corrected |f0| mov x21, x16 // corrected |g0| mov x15, x17 // |f1| mov x16, x19 // |g1| add x0,x0,#8*6 bl __smul_384_n_shift_by_62 add x0,x0,#8*6 bl __smul_384x63 adc x25, x25, x26 str x25, [x0,#8*6] mov x20, x15 // corrected |f1| mov x21, x16 // corrected |g1| add x0,x0,#8*8 bl __smul_384x63 bl __smul_768x63_tail eor x1, x1, #256 // flip-flop src |a|b|u|v| #ifdef __CHERI_PURE_CAPABILITY__ scvalue c1,c0,x1 #endif mov x2, #62 bl __ab_approximation_62 eor x0, x1, #256 // pointer to dst |a|b|u|v| #ifdef __CHERI_PURE_CAPABILITY__ scvalue c0,c1,x0 #endif bl __smul_384_n_shift_by_62 mov x20, x15 // corrected |f0| mov x21, x16 // corrected |g0| mov x15, x17 // |f1| mov x16, x19 // |g1| add x0,x0,#8*6 bl __smul_384_n_shift_by_62 add x0,x0,#8*6 bl __smul_384x63 adc x25, x25, x26 str x25, [x0,#8*6] mov x20, x15 // corrected |f1| mov x21, x16 // corrected |g1| add x0,x0,#8*8 bl __smul_384x63 bl __smul_768x63_tail eor x1, x1, #256 // flip-flop src |a|b|u|v| #ifdef __CHERI_PURE_CAPABILITY__ scvalue c1,c0,x1 #endif mov x2, #62 bl __ab_approximation_62 eor x0, x1, #256 // pointer to dst |a|b|u|v| #ifdef __CHERI_PURE_CAPABILITY__ scvalue c0,c1,x0 #endif bl __smul_384_n_shift_by_62 mov x20, x15 // corrected |f0| mov x21, x16 // corrected |g0| mov x15, x17 // |f1| mov x16, x19 // |g1| add 
x0,x0,#8*6 bl __smul_384_n_shift_by_62 add x0,x0,#8*6 bl __smul_384x63 adc x25, x25, x26 str x25, [x0,#8*6] mov x20, x15 // corrected |f1| mov x21, x16 // corrected |g1| add x0,x0,#8*8 bl __smul_384x63 bl __smul_768x63_tail eor x1, x1, #256 // flip-flop src |a|b|u|v| #ifdef __CHERI_PURE_CAPABILITY__ scvalue c1,c0,x1 #endif mov x2, #62 bl __ab_approximation_62 eor x0, x1, #256 // pointer to dst |a|b|u|v| #ifdef __CHERI_PURE_CAPABILITY__ scvalue c0,c1,x0 #endif bl __smul_384_n_shift_by_62 mov x20, x15 // corrected |f0| mov x21, x16 // corrected |g0| mov x15, x17 // |f1| mov x16, x19 // |g1| add x0,x0,#8*6 bl __smul_384_n_shift_by_62 add x0,x0,#8*6 bl __smul_384x63 adc x25, x25, x26 str x25, [x0,#8*6] mov x20, x15 // corrected |f1| mov x21, x16 // corrected |g1| add x0,x0,#8*8 bl __smul_384x63 bl __smul_768x63_tail eor x1, x1, #256 // flip-flop src |a|b|u|v| #ifdef __CHERI_PURE_CAPABILITY__ scvalue c1,c0,x1 #endif mov x2, #62 bl __ab_approximation_62 eor x0, x1, #256 // pointer to dst |a|b|u|v| #ifdef __CHERI_PURE_CAPABILITY__ scvalue c0,c1,x0 #endif bl __smul_384_n_shift_by_62 mov x20, x15 // corrected |f0| mov x21, x16 // corrected |g0| mov x15, x17 // |f1| mov x16, x19 // |g1| add x0,x0,#8*6 bl __smul_384_n_shift_by_62 add x0,x0,#8*6 bl __smul_384x63 adc x25, x25, x26 str x25, [x0,#8*6] mov x20, x15 // corrected |f1| mov x21, x16 // corrected |g1| add x0,x0,#8*8 bl __smul_384x63 bl __smul_768x63_tail ////////////////////////////////////////// iteration before last eor x1, x1, #256 // flip-flop src |a|b|u|v| #ifdef __CHERI_PURE_CAPABILITY__ scvalue c1,c0,x1 #endif mov x2, #62 //bl __ab_approximation_62 // |a| and |b| are exact, ldp x3, x8, [x1,#8*0] // just load ldp x9, x14, [x1,#8*6] bl __inner_loop_62 eor x0, x1, #256 // pointer to dst |a|b|u|v| #ifdef __CHERI_PURE_CAPABILITY__ scvalue c0,c1,x0 #endif str x3, [x0,#8*0] str x9, [x0,#8*6] mov x20, x15 // exact |f0| mov x21, x16 // exact |g0| mov x15, x17 mov x16, x19 add x0,x0,#8*12 bl __smul_384x63 adc x25, x25, x26 str x25, [x0,#8*6] mov x20, x15 // exact |f1| mov x21, x16 // exact |g1| add x0,x0,#8*8 bl __smul_384x63 bl __smul_768x63_tail ////////////////////////////////////////// last iteration eor x1, x1, #256 // flip-flop src |a|b|u|v| #ifdef __CHERI_PURE_CAPABILITY__ scvalue c1,c0,x1 #endif mov x2, #24 // 768 % 62 //bl __ab_approximation_62 // |a| and |b| are exact, ldr x3, [x1,#8*0] // just load eor x8, x8, x8 ldr x9, [x1,#8*6] eor x14, x14, x14 bl __inner_loop_62 mov x20, x17 mov x21, x19 ldp x0, x15, [sp] // original out_ptr and n_ptr bl __smul_384x63 bl __smul_768x63_tail ldr x30, [x29,#__SIZEOF_POINTER__] smulh x23, x8, x21 // figure out top-most limb adc x26, x26, x28 ldp x9, x10, [x15,#8*0] // load |mod| add x23, x23, x26 // x23 is 1, 0 or -1 ldp x11, x12, [x15,#8*2] asr x22, x23, #63 // sign as mask ldp x13, x14, [x15,#8*4] and x26, x9, x22 // add mod<<384 conditionally and x27, x10, x22 adds x3, x3, x26 and x28, x11, x22 adcs x4, x4, x27 and x2, x12, x22 adcs x5, x5, x28 and x26, x13, x22 adcs x6, x6, x2 and x27, x14, x22 adcs x7, x7, x26 adcs x8, x25, x27 adc x23, x23, xzr // x23 is 1, 0 or -1 neg x22, x23 orr x23, x23, x22 // excess bit or sign as mask asr x22, x22, #63 // excess bit as mask and x9, x9, x23 // mask |mod| and x10, x10, x23 and x11, x11, x23 and x12, x12, x23 and x13, x13, x23 and x14, x14, x23 eor x9, x9, x22 // conditionally negate |mod| eor x10, x10, x22 adds x9, x9, x22, lsr#63 eor x11, x11, x22 adcs x10, x10, xzr eor x12, x12, x22 adcs x11, x11, xzr eor x13, x13, x22 adcs x12, x12, xzr eor x14, x14, x22 
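// the adcs/adc chain below finishes the conditional negation of
// |mod| before the final |mod|<<384 adjustment of the result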
adcs x13, x13, xzr adc x14, x14, xzr adds x3, x3, x9 // final adjustment for |mod|<<384 adcs x4, x4, x10 adcs x5, x5, x11 adcs x6, x6, x12 stp x3, x4, [x0,#8*6] adcs x7, x7, x13 stp x5, x6, [x0,#8*8] adc x8, x8, x14 stp x7, x8, [x0,#8*10] add sp, sp, #1056 ldp x19, x20, [x29,#2*__SIZEOF_POINTER__] ldp x21, x22, [x29,#4*__SIZEOF_POINTER__] ldp x23, x24, [x29,#6*__SIZEOF_POINTER__] ldp x25, x26, [x29,#8*__SIZEOF_POINTER__] ldp x27, x28, [x29,#10*__SIZEOF_POINTER__] ldr x29, [sp],#16*__SIZEOF_POINTER__ hint #29 ret //////////////////////////////////////////////////////////////////////// // see corresponding commentary in ctx_inverse_mod_384-x86_64... .def __smul_384x63; .type 32; .endef .p2align 5 __smul_384x63: ldp x3, x4, [x1,#8*0+96] // load |u| (or |v|) asr x17, x20, #63 // |f_|'s sign as mask (or |g_|'s) ldp x5, x6, [x1,#8*2+96] eor x20, x20, x17 // conditionally negate |f_| (or |g_|) ldp x7, x8, [x1,#8*4+96] eor x3, x3, x17 // conditionally negate |u| (or |v|) ldr x25, [x1,#8*6+96] sub x20, x20, x17 eor x4, x4, x17 adds x3, x3, x17, lsr#63 eor x5, x5, x17 adcs x4, x4, xzr eor x6, x6, x17 adcs x5, x5, xzr eor x7, x7, x17 adcs x6, x6, xzr umulh x22, x3, x20 eor x8, x8, x17 umulh x23, x4, x20 adcs x7, x7, xzr umulh x24, x5, x20 eor x25, x25, x17 mul x3, x3, x20 adcs x8, x8, xzr mul x4, x4, x20 adcs x25, x25, xzr cmp x20, #0 mul x5, x5, x20 csel x25, x25, xzr, ne adds x4, x4, x22 umulh x22, x6, x20 adcs x5, x5, x23 umulh x23, x7, x20 mul x6, x6, x20 mul x7, x7, x20 adcs x6, x6, x24 mul x27,x8, x20 adcs x7, x7, x22 adcs x27,x27,x23 adc x2, xzr, xzr ldp x9, x10, [x1,#8*0+160] // load |u| (or |v|) asr x17, x21, #63 // |f_|'s sign as mask (or |g_|'s) ldp x11, x12, [x1,#8*2+160] eor x21, x21, x17 // conditionally negate |f_| (or |g_|) ldp x13, x14, [x1,#8*4+160] eor x9, x9, x17 // conditionally negate |u| (or |v|) ldr x26, [x1,#8*6+160] sub x21, x21, x17 eor x10, x10, x17 adds x9, x9, x17, lsr#63 eor x11, x11, x17 adcs x10, x10, xzr eor x12, x12, x17 adcs x11, x11, xzr eor x13, x13, x17 adcs x12, x12, xzr umulh x22, x9, x21 eor x14, x14, x17 umulh x23, x10, x21 adcs x13, x13, xzr umulh x24, x11, x21 eor x26, x26, x17 mul x9, x9, x21 adcs x14, x14, xzr mul x10, x10, x21 adcs x26, x26, xzr adc x19, xzr, xzr // used in __smul_768x63_tail cmp x21, #0 mul x11, x11, x21 csel x26, x26, xzr, ne adds x10, x10, x22 umulh x22, x12, x21 adcs x11, x11, x23 umulh x23, x13, x21 mul x12, x12, x21 mul x13, x13, x21 adcs x12, x12, x24 mul x28,x14, x21 adcs x13, x13, x22 adcs x28,x28,x23 adc x2, x2, xzr adds x3, x3, x9 adcs x4, x4, x10 adcs x5, x5, x11 adcs x6, x6, x12 stp x3, x4, [x0,#8*0] adcs x7, x7, x13 stp x5, x6, [x0,#8*2] adcs x27, x27, x28 stp x7, x27, [x0,#8*4] ret .def __smul_768x63_tail; .type 32; .endef .p2align 5 __smul_768x63_tail: umulh x27, x8, x20 ldr x4, [x1,#8*27]// load rest of |v| adc x2, x2, xzr ldp x5, x6, [x1,#8*28] and x25, x25, x20 ldp x7, x8, [x1,#8*30] sub x27, x27, x25 // tie up |u|*|f1| chain umulh x14, x14, x21 // resume |v|*|g1| chain eor x4, x4, x17 // conditionally negate rest of |v| eor x5, x5, x17 eor x6, x6, x17 adds x4, x4, x19 eor x7, x7, x17 adcs x5, x5, xzr eor x8, x8, x17 adcs x6, x6, xzr umulh x22, x26, x21 adcs x7, x7, xzr umulh x23, x4, x21 adc x8, x8, xzr umulh x24, x5, x21 add x14, x14, x2 umulh x25, x6, x21 asr x28, x27, #63 umulh x2, x7, x21 mul x3, x26, x21 mul x4, x4, x21 mul x5, x5, x21 adds x3, x3, x14 mul x6, x6, x21 adcs x4, x4, x22 mul x7, x7, x21 adcs x5, x5, x23 mul x22, x8, x21 adcs x6, x6, x24 adcs x7, x7, x25 adcs x25, x22, x2 adc x26, xzr, xzr // used 
in the final step adds x3, x3, x27 adcs x4, x4, x28 adcs x5, x5, x28 adcs x6, x6, x28 stp x3, x4, [x0,#8*6] adcs x7, x7, x28 stp x5, x6, [x0,#8*8] adcs x25, x25, x28 // carry is used in the final step stp x7, x25, [x0,#8*10] ret .def __smul_384_n_shift_by_62; .type 32; .endef .p2align 5 __smul_384_n_shift_by_62: ldp x3, x4, [x1,#8*0+0] // load |a| (or |b|) asr x28, x15, #63 // |f0|'s sign as mask (or |g0|'s) ldp x5, x6, [x1,#8*2+0] eor x2, x15, x28 // conditionally negate |f0| (or |g0|) ldp x7, x8, [x1,#8*4+0] eor x3, x3, x28 // conditionally negate |a| (or |b|) sub x2, x2, x28 eor x4, x4, x28 adds x3, x3, x28, lsr#63 eor x5, x5, x28 adcs x4, x4, xzr eor x6, x6, x28 adcs x5, x5, xzr eor x7, x7, x28 umulh x22, x3, x2 adcs x6, x6, xzr umulh x23, x4, x2 eor x8, x8, x28 mul x3, x3, x2 adcs x7, x7, xzr mul x4, x4, x2 adc x8, x8, xzr umulh x24, x5, x2 and x28, x28, x2 umulh x25, x6, x2 adds x4, x4, x22 mul x5, x5, x2 umulh x22, x7, x2 neg x28, x28 mul x6, x6, x2 adcs x5, x5, x23 umulh x23, x8, x2 mul x7, x7, x2 adcs x6, x6, x24 mul x8, x8, x2 adcs x7, x7, x25 adcs x8, x8, x22 adc x27, x23, x28 ldp x9, x10, [x1,#8*0+48] // load |a| (or |b|) asr x28, x16, #63 // |f0|'s sign as mask (or |g0|'s) ldp x11, x12, [x1,#8*2+48] eor x2, x16, x28 // conditionally negate |f0| (or |g0|) ldp x13, x14, [x1,#8*4+48] eor x9, x9, x28 // conditionally negate |a| (or |b|) sub x2, x2, x28 eor x10, x10, x28 adds x9, x9, x28, lsr#63 eor x11, x11, x28 adcs x10, x10, xzr eor x12, x12, x28 adcs x11, x11, xzr eor x13, x13, x28 umulh x22, x9, x2 adcs x12, x12, xzr umulh x23, x10, x2 eor x14, x14, x28 mul x9, x9, x2 adcs x13, x13, xzr mul x10, x10, x2 adc x14, x14, xzr umulh x24, x11, x2 and x28, x28, x2 umulh x25, x12, x2 adds x10, x10, x22 mul x11, x11, x2 umulh x22, x13, x2 neg x28, x28 mul x12, x12, x2 adcs x11, x11, x23 umulh x23, x14, x2 mul x13, x13, x2 adcs x12, x12, x24 mul x14, x14, x2 adcs x13, x13, x25 adcs x14, x14, x22 adc x28, x23, x28 adds x3, x3, x9 adcs x4, x4, x10 adcs x5, x5, x11 adcs x6, x6, x12 adcs x7, x7, x13 adcs x8, x8, x14 adc x9, x27, x28 extr x3, x4, x3, #62 extr x4, x5, x4, #62 extr x5, x6, x5, #62 asr x28, x9, #63 extr x6, x7, x6, #62 extr x7, x8, x7, #62 extr x8, x9, x8, #62 eor x3, x3, x28 eor x4, x4, x28 adds x3, x3, x28, lsr#63 eor x5, x5, x28 adcs x4, x4, xzr eor x6, x6, x28 adcs x5, x5, xzr eor x7, x7, x28 adcs x6, x6, xzr eor x8, x8, x28 stp x3, x4, [x0,#8*0] adcs x7, x7, xzr stp x5, x6, [x0,#8*2] adc x8, x8, xzr stp x7, x8, [x0,#8*4] eor x15, x15, x28 eor x16, x16, x28 sub x15, x15, x28 sub x16, x16, x28 ret .def __ab_approximation_62; .type 32; .endef .p2align 4 __ab_approximation_62: ldp x7, x8, [x1,#8*4] ldp x13, x14, [x1,#8*10] ldp x5, x6, [x1,#8*2] ldp x11, x12, [x1,#8*8] .Lab_approximation_62_loaded: orr x22, x8, x14 // check top-most limbs, ... cmp x22, #0 csel x8, x8, x7, ne csel x14, x14, x13, ne csel x7, x7, x6, ne orr x22, x8, x14 // ... ones before top-most, ... csel x13, x13, x12, ne ldp x3, x4, [x1,#8*0] ldp x9, x10, [x1,#8*6] cmp x22, #0 csel x8, x8, x7, ne csel x14, x14, x13, ne csel x7, x7, x5, ne orr x22, x8, x14 // ... and ones before that ... 
    csel x13, x13, x11, ne
    cmp x22, #0
    csel x8, x8, x7, ne
    csel x14, x14, x13, ne
    csel x7, x7, x4, ne
    orr x22, x8, x14
    csel x13, x13, x10, ne
    clz x22, x22
    cmp x22, #64
    csel x22, x22, xzr, ne
    csel x8, x8, x7, ne
    csel x14, x14, x13, ne
    neg x23, x22
    lslv x8, x8, x22   // align high limbs to the left
    lslv x14, x14, x22
    lsrv x7, x7, x23
    lsrv x13, x13, x23
    and x7, x7, x23, asr#6
    and x13, x13, x23, asr#6
    orr x8, x8, x7
    orr x14, x14, x13
    b __inner_loop_62
    ret
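// What follows is the scalar divstep kernel that the unrolled iterations
// above funnel into. For orientation, a minimal C sketch of the 62 steps
// it performs on the 64-bit approximations (names are illustrative, not
// part of the ABI; the real code also tracks a second limb of |a_|/|b_|
// and runs branch-free):
//
//     uint64_t f0 = 1, g0 = 0, f1 = 0, g1 = 1, t;
//     for (int i = 0; i < 62; i++) {
//         if (a & 1) {
//             if (a < b) {        /* swap the pair and its factors */
//                 t = a;  a = b;   b = t;
//                 t = f0; f0 = f1; f1 = t;
//                 t = g0; g0 = g1; g1 = t;
//             }
//             a -= b; f0 -= f1; g0 -= g1;
//         }
//         a >>= 1; f1 <<= 1; g1 <<= 1;
//     }
//
// Both outcomes of every step are computed and committed with csel, so the
// instruction and data traces do not depend on the secret inputs. The
// resulting |f0|g0|f1|g1| transition factors are what __smul_384x63 and
// __smul_384_n_shift_by_62 apply to the full-width |a|b|u|v| vectors.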
.def __inner_loop_62; .type 32; .endef
.p2align 4
__inner_loop_62:
    mov x15, #1   // |f0|=1
    mov x16, #0   // |g0|=0
    mov x17, #0   // |f1|=0
    mov x19, #1   // |g1|=1

.Loop_62:
    sbfx x28, x3, #0, #1    // if |a_| is odd, then we'll be subtracting
    sub x2, x2, #1
    subs x24, x9, x3        // |b_|-|a_|
    and x22, x9, x28
    sbc x25, x14, x8
    and x23, x14, x28
    subs x26, x3, x22       // |a_|-|b_| (or |a_|-0 if |a_| was even)
    mov x22, x15
    sbcs x27, x8, x23
    mov x23, x16
    csel x9, x9, x3, hs     // |b_| = |a_|
    csel x14, x14, x8, hs
    csel x3, x26, x24, hs   // borrow means |a_|<|b_|, replace with |b_|-|a_|
    csel x8, x27, x25, hs
    csel x15, x15, x17, hs  // exchange |f0| and |f1|
    csel x17, x17, x22, hs
    csel x16, x16, x19, hs  // exchange |g0| and |g1|
    csel x19, x19, x23, hs
    extr x3, x8, x3, #1
    lsr x8, x8, #1
    and x22, x17, x28
    and x23, x19, x28
    add x17, x17, x17       // |f1|<<=1
    add x19, x19, x19       // |g1|<<=1
    sub x15, x15, x22       // |f0|-=|f1| (or |f0-=0| if |a_| was even)
    sub x16, x16, x23       // |g0|-=|g1| (or |g0-=0| ...)
    cbnz x2, .Loop_62

    ret

================================================
FILE: build/coff/ct_is_square_mod_384-armv8.S
================================================
.text

.globl ct_is_square_mod_384
.def ct_is_square_mod_384; .type 32; .endef
.p2align 5
ct_is_square_mod_384:
    hint #25
    stp x29, x30, [sp,#-16*__SIZEOF_POINTER__]!
    add x29, sp, #0
    stp x19, x20, [sp,#2*__SIZEOF_POINTER__]
    stp x21, x22, [sp,#4*__SIZEOF_POINTER__]
    stp x23, x24, [sp,#6*__SIZEOF_POINTER__]
    stp x25, x26, [sp,#8*__SIZEOF_POINTER__]
    stp x27, x28, [sp,#10*__SIZEOF_POINTER__]
    sub sp, sp, #512
    ldp x3, x4, [x0,#8*0]   // load input
    ldp x5, x6, [x0,#8*2]
    ldp x7, x8, [x0,#8*4]
    add x0, sp, #255        // find closest 256-byte-aligned spot
    and x0, x0, #-256       // in the frame...
#ifdef __CHERI_PURE_CAPABILITY__
    scvalue c0,csp,x0
#endif
    ldp x9, x10, [x1,#8*0]  // load modulus
    ldp x11, x12, [x1,#8*2]
    ldp x13, x14, [x1,#8*4]
    stp x3, x4, [x0,#8*6]   // copy input to |a|
    stp x5, x6, [x0,#8*8]
    stp x7, x8, [x0,#8*10]
    stp x9, x10, [x0,#8*0]  // copy modulus to |b|
    stp x11, x12, [x0,#8*2]
    stp x13, x14, [x0,#8*4]
    eor x2, x2, x2          // init the Legendre symbol
    mov x15, #24            // 24 is 768/30-1
    b .Loop_is_square

.p2align 4
.Loop_is_square:
    bl __ab_approximation_30
    sub x15, x15, #1
    eor x1, x0, #128        // pointer to dst |b|
#ifdef __CHERI_PURE_CAPABILITY__
    scvalue c1,csp,x1
#endif
    bl __smul_384_n_shift_by_30
    mov x19, x16            // |f0|
    mov x20, x17            // |g0|
    add x1,x1,#8*6
    bl __smul_384_n_shift_by_30
    ldp x9, x10, [x1,#-8*6]
    eor x0, x0, #128        // flip-flop src |a|b|
#ifdef __CHERI_PURE_CAPABILITY__
    scvalue c0,csp,x0
#endif
    and x27, x27, x9        // if |a| was negative,
    add x2, x2, x27, lsr#1  // adjust |L|
    cbnz x15, .Loop_is_square

    ////////////////////////////////////////// last iteration
    //bl __ab_approximation_30 // |a| and |b| are exact,
    //ldr x8, [x0,#8*6]        // and loaded
    //ldr x14, [x0,#8*0]
    mov x15, #48            // 48 is 768%30 + 30
    bl __inner_loop_48
    ldr x30, [x29,#__SIZEOF_POINTER__]

    and x0, x2, #1
    eor x0, x0, #1

    add sp, sp, #512
    ldp x19, x20, [x29,#2*__SIZEOF_POINTER__]
    ldp x21, x22, [x29,#4*__SIZEOF_POINTER__]
    ldp x23, x24, [x29,#6*__SIZEOF_POINTER__]
    ldp x25, x26, [x29,#8*__SIZEOF_POINTER__]
    ldp x27, x28, [x29,#10*__SIZEOF_POINTER__]
    ldr x29, [sp],#16*__SIZEOF_POINTER__
    hint #29
    ret
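// The helpers below serve the Jacobi-symbol loop above: |L| (kept in x2)
// accumulates the exponent of -1 and only its low bit is ever used. A
// rough C sketch of the reduction rules the inner loops encode (names are
// illustrative):
//
//     while (n--) {
//         if (a & 1) {
//             if (a < b) {          /* reciprocity: swap, and...        */
//                 t = a; a = b; b = t;
//                 L += (a & b) >> 1; /* ...flip sign if a%4 == b%4 == 3 */
//             }
//             a -= b;
//         }
//         a >>= 1;
//         L += (b + 2) >> 2;        /* halving flips the sign when
//                                      b%8 is 3 or 5                    */
//     }
//     /* the input is a square iff L ends up even */
//
// __inner_loop_30 additionally packs |f0|g0| and |f1|g1| into single
// registers, biased by 0x7FFFFFFF per 32-bit half so either half can go
// negative without borrowing into the other; the bias is stripped on exit.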
.def __smul_384_n_shift_by_30; .type 32; .endef
.p2align 5
__smul_384_n_shift_by_30:
    ldp x3, x4, [x0,#8*0+0]   // load |b| (or |a|)
    asr x27, x20, #63         // |g1|'s sign as mask (or |f1|'s)
    ldp x5, x6, [x0,#8*2+0]
    eor x20, x20, x27         // conditionally negate |g1| (or |f1|)
    ldp x7, x8, [x0,#8*4+0]
    eor x3, x3, x27           // conditionally negate |b| (or |a|)
    sub x20, x20, x27
    eor x4, x4, x27
    adds x3, x3, x27, lsr#63
    eor x5, x5, x27
    adcs x4, x4, xzr
    eor x6, x6, x27
    adcs x5, x5, xzr
    eor x7, x7, x27
    umulh x21, x3, x20
    adcs x6, x6, xzr
    umulh x22, x4, x20
    eor x8, x8, x27
    umulh x23, x5, x20
    adcs x7, x7, xzr
    umulh x24, x6, x20
    adc x8, x8, xzr
    umulh x25, x7, x20
    and x28, x20, x27
    umulh x26, x8, x20
    neg x28, x28
    mul x3, x3, x20
    mul x4, x4, x20
    mul x5, x5, x20
    adds x4, x4, x21
    mul x6, x6, x20
    adcs x5, x5, x22
    mul x7, x7, x20
    adcs x6, x6, x23
    mul x8, x8, x20
    adcs x7, x7, x24
    adcs x8, x8, x25
    adc x26, x26, x28

    ldp x9, x10, [x0,#8*0+48] // load |b| (or |a|)
    asr x27, x19, #63         // |g1|'s sign as mask (or |f1|'s)
    ldp x11, x12, [x0,#8*2+48]
    eor x19, x19, x27         // conditionally negate |g1| (or |f1|)
    ldp x13, x14, [x0,#8*4+48]
    eor x9, x9, x27           // conditionally negate |b| (or |a|)
    sub x19, x19, x27
    eor x10, x10, x27
    adds x9, x9, x27, lsr#63
    eor x11, x11, x27
    adcs x10, x10, xzr
    eor x12, x12, x27
    adcs x11, x11, xzr
    eor x13, x13, x27
    umulh x21, x9, x19
    adcs x12, x12, xzr
    umulh x22, x10, x19
    eor x14, x14, x27
    umulh x23, x11, x19
    adcs x13, x13, xzr
    umulh x24, x12, x19
    adc x14, x14, xzr
    umulh x25, x13, x19
    and x28, x19, x27
    umulh x27, x14, x19
    neg x28, x28
    mul x9, x9, x19
    mul x10, x10, x19
    mul x11, x11, x19
    adds x10, x10, x21
    mul x12, x12, x19
    adcs x11, x11, x22
    mul x13, x13, x19
    adcs x12, x12, x23
    mul x14, x14, x19
    adcs x13, x13, x24
    adcs x14, x14, x25
    adc x27, x27, x28

    adds x3, x3, x9
    adcs x4, x4, x10
    adcs x5, x5, x11
    adcs x6, x6, x12
    adcs x7, x7, x13
    adcs x8, x8, x14
    adc x9, x26, x27

    extr x3, x4, x3, #30
    extr x4, x5, x4, #30
    extr x5, x6, x5, #30
    asr x27, x9, #63
    extr x6, x7, x6, #30
    extr x7, x8, x7, #30
    extr x8, x9, x8, #30

    eor x3, x3, x27
    eor x4, x4, x27
    adds x3, x3, x27, lsr#63
    eor x5, x5, x27
    adcs x4, x4, xzr
    eor x6, x6, x27
    adcs x5, x5, xzr
    eor x7, x7, x27
    adcs x6, x6, xzr
    eor x8, x8, x27
    stp x3, x4, [x1,#8*0]
    adcs x7, x7, xzr
    stp x5, x6, [x1,#8*2]
    adc x8, x8, xzr
    stp x7, x8, [x1,#8*4]
    ret

.def __ab_approximation_30; .type 32; .endef
.p2align 4
__ab_approximation_30:
    ldp x13, x14, [x0,#8*4]   // |a| is still in registers
    ldp x11, x12, [x0,#8*2]
    orr x21, x8, x14          // check top-most limbs, ...
    cmp x21, #0
    csel x8, x8, x7, ne
    csel x14, x14, x13, ne
    csel x7, x7, x6, ne
    orr x21, x8, x14          // ... ones before top-most, ...
    csel x13, x13, x12, ne
    cmp x21, #0
    csel x8, x8, x7, ne
    csel x14, x14, x13, ne
    csel x7, x7, x5, ne
    orr x21, x8, x14          // ... and ones before that ...
    csel x13, x13, x11, ne
    cmp x21, #0
    csel x8, x8, x7, ne
    csel x14, x14, x13, ne
    csel x7, x7, x4, ne
    orr x21, x8, x14          // and one more, ...
    csel x13, x13, x10, ne
    cmp x21, #0
    csel x8, x8, x7, ne
    csel x14, x14, x13, ne
    csel x7, x7, x3, ne
    orr x21, x8, x14
    csel x13, x13, x9, ne
    clz x21, x21
    cmp x21, #64
    csel x21, x21, xzr, ne
    csel x8, x8, x7, ne
    csel x14, x14, x13, ne
    neg x22, x21
    lslv x8, x8, x21          // align high limbs to the left
    lslv x14, x14, x21
    lsrv x7, x7, x22
    lsrv x13, x13, x22
    and x7, x7, x22, asr#6
    and x13, x13, x22, asr#6
    orr x8, x8, x7
    orr x14, x14, x13
    bfxil x8, x3, #0, #32
    bfxil x14, x9, #0, #32
    b __inner_loop_30
    ret

.def __inner_loop_30; .type 32; .endef
.p2align 4
__inner_loop_30:
    mov x28, #30
    mov x17, #0x7FFFFFFF80000000 // |f0|=1, |g0|=0
    mov x20, #0x800000007FFFFFFF // |f1|=0, |g1|=1
    mov x27, #0x7FFFFFFF7FFFFFFF

.Loop_30:
    sbfx x24, x8, #0, #1     // if |a_| is odd, then we'll be subtracting
    and x25, x8, x14
    sub x28, x28, #1
    and x21, x14, x24
    sub x22, x14, x8         // |b_|-|a_|
    subs x23, x8, x21        // |a_|-|b_| (or |a_|-0 if |a_| was even)
    add x25, x2, x25, lsr#1  // L + (a_ & b_) >> 1
    mov x21, x20
    csel x14, x14, x8, hs    // |b_| = |a_|
    csel x8, x23, x22, hs    // borrow means |a_|<|b_|, replace with |b_|-|a_|
    csel x20, x20, x17, hs   // exchange |fg0| and |fg1|
    csel x17, x17, x21, hs
    csel x2, x2, x25, hs
    lsr x8, x8, #1
    and x21, x20, x24
    and x22, x27, x24
    add x23, x14, #2
    sub x17, x17, x21        // |f0|-=|f1| (or |f0-=0| if |a_| was even)
    add x20, x20, x20        // |f1|<<=1
    add x2, x2, x23, lsr#2   // "negate" |L| if |b|%8 is 3 or 5
    add x17, x17, x22
    sub x20, x20, x27
    cbnz x28, .Loop_30

    mov x27, #0x7FFFFFFF
    ubfx x16, x17, #0, #32
    ubfx x17, x17, #32, #32
    ubfx x19, x20, #0, #32
    ubfx x20, x20, #32, #32
    sub x16, x16, x27        // remove the bias
    sub x17, x17, x27
    sub x19, x19, x27
    sub x20, x20, x27
    ret

.def __inner_loop_48; .type 32; .endef
.p2align 4
__inner_loop_48:
.Loop_48:
    sbfx x24, x3, #0, #1     // if |a_| is odd, then we'll be subtracting
    and x25, x3, x9
    sub x15, x15, #1
    and x21, x9, x24
    sub x22, x9, x3          // |b_|-|a_|
    subs x23, x3, x21        // |a_|-|b_| (or |a_|-0 if |a_| was even)
    add x25, x2, x25, lsr#1
    csel x9, x9, x3, hs      // |b_| = |a_|
    csel x3, x23, x22, hs    // borrow means |a_|<|b_|, replace with |b_|-|a_|
    csel x2, x2, x25, hs
    add x23, x9, #2
    lsr x3, x3, #1
    add x2, x2, x23, lsr#2   // "negate" |L| if |b|%8 is 3 or 5
    cbnz x15, .Loop_48

    ret

================================================
FILE: build/coff/ct_is_square_mod_384-x86_64.s
================================================
.text

.globl ct_is_square_mod_384
.def ct_is_square_mod_384; .scl 2; .type 32; .endef
.p2align 5
ct_is_square_mod_384:
.byte 0xf3,0x0f,0x1e,0xfa
    movq %rdi,8(%rsp)
    movq %rsi,16(%rsp)
    movq %rsp,%r11
.LSEH_begin_ct_is_square_mod_384:
    pushq %rbp
    movq %rcx,%rdi
    movq %rdx,%rsi
    pushq %rbx
    pushq %r12
    pushq %r13
    pushq %r14
    pushq %r15
    subq $536,%rsp
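# x86_64 counterpart of the AArch64 version above: 24 outer rounds of
# __ab_approximation_30 plus two __smulq_384_n_shift_by_30 calls, then a
# 48-step exact tail in __inner_loop_48. %rbp carries the accumulator |L|,
# and the return value is (L & 1) ^ 1, i.e. 1 when the input is a square
# (for the intended prime modulus). The two |a|b| work areas live in the
# 256-byte-aligned scratch region and are flip-flopped by XORing the
# pointer with 128.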
.LSEH_body_ct_is_square_mod_384: leaq 24+255(%rsp),%rax andq $-256,%rax #ifdef __SGX_LVI_HARDENING__ lfence #endif movq 0(%rdi),%r8 movq 8(%rdi),%r9 movq 16(%rdi),%r10 movq 24(%rdi),%r11 movq 32(%rdi),%r12 movq 40(%rdi),%r13 movq 0(%rsi),%r14 movq 8(%rsi),%r15 movq 16(%rsi),%rbx movq 24(%rsi),%rcx movq 32(%rsi),%rdx movq 40(%rsi),%rdi movq %rax,%rsi movq %r8,0(%rax) movq %r9,8(%rax) movq %r10,16(%rax) movq %r11,24(%rax) movq %r12,32(%rax) movq %r13,40(%rax) movq %r14,48(%rax) movq %r15,56(%rax) movq %rbx,64(%rax) movq %rcx,72(%rax) movq %rdx,80(%rax) movq %rdi,88(%rax) xorq %rbp,%rbp movl $24,%ecx jmp .Loop_is_square .p2align 5 .Loop_is_square: movl %ecx,16(%rsp) call __ab_approximation_30 movq %rax,0(%rsp) movq %rbx,8(%rsp) movq $128+48,%rdi xorq %rsi,%rdi call __smulq_384_n_shift_by_30 movq 0(%rsp),%rdx movq 8(%rsp),%rcx leaq -48(%rdi),%rdi call __smulq_384_n_shift_by_30 movl 16(%rsp),%ecx xorq $128,%rsi andq 48(%rdi),%r14 shrq $1,%r14 addq %r14,%rbp subl $1,%ecx jnz .Loop_is_square movq 48(%rsi),%r9 call __inner_loop_48 movq $1,%rax andq %rbp,%rax xorq $1,%rax leaq 536(%rsp),%r8 movq 0(%r8),%r15 movq 8(%r8),%r14 movq 16(%r8),%r13 movq 24(%r8),%r12 movq 32(%r8),%rbx movq 40(%r8),%rbp leaq 48(%r8),%rsp .LSEH_epilogue_ct_is_square_mod_384: mov 8(%rsp),%rdi mov 16(%rsp),%rsi #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .LSEH_end_ct_is_square_mod_384: .def __smulq_384_n_shift_by_30; .scl 3; .type 32; .endef .p2align 5 __smulq_384_n_shift_by_30: .byte 0xf3,0x0f,0x1e,0xfa movq 0(%rsi),%r8 movq 8(%rsi),%r9 movq 16(%rsi),%r10 movq 24(%rsi),%r11 movq 32(%rsi),%r12 movq 40(%rsi),%r13 movq %rdx,%rbx sarq $63,%rdx xorq %rax,%rax subq %rdx,%rax xorq %rdx,%rbx addq %rax,%rbx xorq %rdx,%r8 xorq %rdx,%r9 xorq %rdx,%r10 xorq %rdx,%r11 xorq %rdx,%r12 xorq %rdx,%r13 addq %r8,%rax adcq $0,%r9 adcq $0,%r10 adcq $0,%r11 adcq $0,%r12 adcq $0,%r13 movq %rdx,%r14 andq %rbx,%r14 mulq %rbx movq %rax,%r8 movq %r9,%rax movq %rdx,%r9 mulq %rbx addq %rax,%r9 movq %r10,%rax adcq $0,%rdx movq %rdx,%r10 mulq %rbx addq %rax,%r10 movq %r11,%rax adcq $0,%rdx movq %rdx,%r11 mulq %rbx addq %rax,%r11 movq %r12,%rax adcq $0,%rdx movq %rdx,%r12 mulq %rbx addq %rax,%r12 movq %r13,%rax adcq $0,%rdx movq %rdx,%r13 negq %r14 mulq %rbx addq %rax,%r13 adcq %rdx,%r14 leaq 48(%rsi),%rsi movq %rcx,%rdx movq %r8,0(%rdi) movq %r9,8(%rdi) movq %r10,16(%rdi) movq %r11,24(%rdi) movq %r12,32(%rdi) movq %r13,40(%rdi) movq 0(%rsi),%r8 movq 8(%rsi),%r9 movq 16(%rsi),%r10 movq 24(%rsi),%r11 movq 32(%rsi),%r12 movq 40(%rsi),%r13 movq %rdx,%rbx sarq $63,%rdx xorq %rax,%rax subq %rdx,%rax xorq %rdx,%rbx addq %rax,%rbx xorq %rdx,%r8 xorq %rdx,%r9 xorq %rdx,%r10 xorq %rdx,%r11 xorq %rdx,%r12 xorq %rdx,%r13 addq %r8,%rax adcq $0,%r9 adcq $0,%r10 adcq $0,%r11 adcq $0,%r12 adcq $0,%r13 movq %rdx,%r15 andq %rbx,%r15 mulq %rbx movq %rax,%r8 movq %r9,%rax movq %rdx,%r9 mulq %rbx addq %rax,%r9 movq %r10,%rax adcq $0,%rdx movq %rdx,%r10 mulq %rbx addq %rax,%r10 movq %r11,%rax adcq $0,%rdx movq %rdx,%r11 mulq %rbx addq %rax,%r11 movq %r12,%rax adcq $0,%rdx movq %rdx,%r12 mulq %rbx addq %rax,%r12 movq %r13,%rax adcq $0,%rdx movq %rdx,%r13 negq %r15 mulq %rbx addq %rax,%r13 adcq %rdx,%r15 leaq -48(%rsi),%rsi addq 0(%rdi),%r8 adcq 8(%rdi),%r9 adcq 16(%rdi),%r10 adcq 24(%rdi),%r11 adcq 32(%rdi),%r12 adcq 40(%rdi),%r13 adcq %r15,%r14 shrdq $30,%r9,%r8 shrdq $30,%r10,%r9 shrdq $30,%r11,%r10 shrdq $30,%r12,%r11 shrdq $30,%r13,%r12 shrdq $30,%r14,%r13 sarq $63,%r14 xorq %rbx,%rbx subq %r14,%rbx xorq %r14,%r8 xorq %r14,%r9 
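# The xorq/addq run around this point is a branchless absolute value after
# the 30-bit right shift: sarq smeared the top limb's sign into %r14, each
# limb is XORed with that mask, and the mask's two's-complement (0 or 1) is
# added back through the limbs. The caller then folds the recorded sign
# into |L|, since negating |a| modulo |b| costs a factor of Jacobi(-1|b),
# i.e. a sign flip exactly when b%4 == 3.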
xorq %r14,%r10 xorq %r14,%r11 xorq %r14,%r12 xorq %r14,%r13 addq %rbx,%r8 adcq $0,%r9 adcq $0,%r10 adcq $0,%r11 adcq $0,%r12 adcq $0,%r13 movq %r8,0(%rdi) movq %r9,8(%rdi) movq %r10,16(%rdi) movq %r11,24(%rdi) movq %r12,32(%rdi) movq %r13,40(%rdi) #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .def __ab_approximation_30; .scl 3; .type 32; .endef .p2align 5 __ab_approximation_30: .byte 0xf3,0x0f,0x1e,0xfa movq 88(%rsi),%rbx movq 80(%rsi),%r15 movq 72(%rsi),%r14 movq %r13,%rax orq %rbx,%rax cmovzq %r12,%r13 cmovzq %r15,%rbx cmovzq %r11,%r12 movq 64(%rsi),%r11 cmovzq %r14,%r15 movq %r13,%rax orq %rbx,%rax cmovzq %r12,%r13 cmovzq %r15,%rbx cmovzq %r10,%r12 movq 56(%rsi),%r10 cmovzq %r11,%r15 movq %r13,%rax orq %rbx,%rax cmovzq %r12,%r13 cmovzq %r15,%rbx cmovzq %r9,%r12 movq 48(%rsi),%r9 cmovzq %r10,%r15 movq %r13,%rax orq %rbx,%rax cmovzq %r12,%r13 cmovzq %r15,%rbx cmovzq %r8,%r12 cmovzq %r9,%r15 movq %r13,%rax orq %rbx,%rax bsrq %rax,%rcx leaq 1(%rcx),%rcx cmovzq %r8,%r13 cmovzq %r9,%rbx cmovzq %rax,%rcx negq %rcx shldq %cl,%r12,%r13 shldq %cl,%r15,%rbx movq $0xFFFFFFFF00000000,%rax movl %r8d,%r8d movl %r9d,%r9d andq %rax,%r13 andq %rax,%rbx orq %r13,%r8 orq %rbx,%r9 jmp __inner_loop_30 #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .def __inner_loop_30; .scl 3; .type 32; .endef .p2align 5 __inner_loop_30: .byte 0xf3,0x0f,0x1e,0xfa movq $0x7FFFFFFF80000000,%rbx movq $0x800000007FFFFFFF,%rcx leaq -1(%rbx),%r15 movl $30,%edi .Loop_30: movq %r8,%rax andq %r9,%rax shrq $1,%rax cmpq %r9,%r8 movq %r8,%r10 movq %r9,%r11 leaq (%rax,%rbp,1),%rax movq %rbx,%r12 movq %rcx,%r13 movq %rbp,%r14 cmovbq %r9,%r8 cmovbq %r10,%r9 cmovbq %rcx,%rbx cmovbq %r12,%rcx cmovbq %rax,%rbp subq %r9,%r8 subq %rcx,%rbx addq %r15,%rbx testq $1,%r10 cmovzq %r10,%r8 cmovzq %r11,%r9 cmovzq %r12,%rbx cmovzq %r13,%rcx cmovzq %r14,%rbp leaq 2(%r9),%rax shrq $1,%r8 shrq $2,%rax addq %rcx,%rcx leaq (%rax,%rbp,1),%rbp subq %r15,%rcx subl $1,%edi jnz .Loop_30 shrq $32,%r15 movl %ebx,%eax shrq $32,%rbx movl %ecx,%edx shrq $32,%rcx subq %r15,%rax subq %r15,%rbx subq %r15,%rdx subq %r15,%rcx #ifdef __SGX_LVI_HARDENING__ popq %r8 lfence jmpq *%r8 ud2 #else .byte 0xf3,0xc3 #endif .def __inner_loop_48; .scl 3; .type 32; .endef .p2align 5 __inner_loop_48: .byte 0xf3,0x0f,0x1e,0xfa movl $48,%edi .Loop_48: movq %r8,%rax andq %r9,%rax shrq $1,%rax cmpq %r9,%r8 movq %r8,%r10 movq %r9,%r11 leaq (%rax,%rbp,1),%rax movq %rbp,%r12 cmovbq %r9,%r8 cmovbq %r10,%r9 cmovbq %rax,%rbp subq %r9,%r8 testq $1,%r10 cmovzq %r10,%r8 cmovzq %r11,%r9 cmovzq %r12,%rbp leaq 2(%r9),%rax shrq $1,%r8 shrq $2,%rax addq %rax,%rbp subl $1,%edi jnz .Loop_48 #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .section .pdata .p2align 2 .rva .LSEH_begin_ct_is_square_mod_384 .rva .LSEH_body_ct_is_square_mod_384 .rva .LSEH_info_ct_is_square_mod_384_prologue .rva .LSEH_body_ct_is_square_mod_384 .rva .LSEH_epilogue_ct_is_square_mod_384 .rva .LSEH_info_ct_is_square_mod_384_body .rva .LSEH_epilogue_ct_is_square_mod_384 .rva .LSEH_end_ct_is_square_mod_384 .rva .LSEH_info_ct_is_square_mod_384_epilogue .section .xdata .p2align 3 .LSEH_info_ct_is_square_mod_384_prologue: .byte 1,0,5,0x0b .byte 0,0x74,1,0 .byte 0,0x64,2,0 .byte 0,0xb3 .byte 0,0 .long 0,0 .LSEH_info_ct_is_square_mod_384_body: .byte 1,0,18,0 .byte 0x00,0xf4,0x43,0x00 .byte 0x00,0xe4,0x44,0x00 .byte 0x00,0xd4,0x45,0x00 .byte 0x00,0xc4,0x46,0x00 .byte 0x00,0x34,0x47,0x00 .byte 0x00,0x54,0x48,0x00 
.byte 0x00,0x74,0x4a,0x00 .byte 0x00,0x64,0x4b,0x00 .byte 0x00,0x01,0x49,0x00 .byte 0x00,0x00,0x00,0x00 .byte 0x00,0x00,0x00,0x00 .LSEH_info_ct_is_square_mod_384_epilogue: .byte 1,0,4,0 .byte 0x00,0x74,0x01,0x00 .byte 0x00,0x64,0x02,0x00 .byte 0x00,0x00,0x00,0x00 ================================================ FILE: build/coff/ctq_inverse_mod_384-x86_64.s ================================================ .comm __blst_platform_cap,4 .text .globl ct_inverse_mod_384 .def ct_inverse_mod_384; .scl 2; .type 32; .endef .p2align 5 ct_inverse_mod_384: .byte 0xf3,0x0f,0x1e,0xfa movq %rdi,8(%rsp) movq %rsi,16(%rsp) movq %rsp,%r11 .LSEH_begin_ct_inverse_mod_384: movq %rcx,%rdi movq %rdx,%rsi movq %r8,%rdx movq %r9,%rcx #ifdef __BLST_PORTABLE__ testl $1,__blst_platform_cap(%rip) jnz ct_inverse_mod_384$1 #endif pushq %rbp pushq %rbx pushq %r12 pushq %r13 pushq %r14 pushq %r15 subq $1112,%rsp .LSEH_body_ct_inverse_mod_384: leaq 88+511(%rsp),%rax andq $-512,%rax movq %rdi,32(%rsp) movq %rcx,40(%rsp) movq 0(%rsi),%r8 movq 8(%rsi),%r9 movq 16(%rsi),%r10 movq 24(%rsi),%r11 movq 32(%rsi),%r12 movq 40(%rsi),%r13 movq 0(%rdx),%r14 movq 8(%rdx),%r15 movq 16(%rdx),%rbx movq 24(%rdx),%rbp movq 32(%rdx),%rsi movq 40(%rdx),%rdi movq %r8,0(%rax) movq %r9,8(%rax) movq %r10,16(%rax) movq %r11,24(%rax) movq %r12,32(%rax) movq %r13,40(%rax) movq %r14,48(%rax) movq %r15,56(%rax) movq %rbx,64(%rax) movq %rbp,72(%rax) movq %rsi,80(%rax) movq %rax,%rsi movq %rdi,88(%rax) movl $62,%edi call __ab_approximation_62 movq %r12,72(%rsp) movq %r13,80(%rsp) movq $256,%rdi xorq %rsi,%rdi call __smulq_384_n_shift_by_62 movq %rdx,96(%rdi) movq 72(%rsp),%rdx movq 80(%rsp),%rcx leaq 48(%rdi),%rdi call __smulq_384_n_shift_by_62 movq %rdx,104(%rdi) xorq $256,%rsi movl $62,%edi call __ab_approximation_62 movq %r12,72(%rsp) movq %r13,80(%rsp) movq $256,%rdi xorq %rsi,%rdi call __smulq_384_n_shift_by_62 movq %rdx,56(%rsp) movq %rcx,64(%rsp) movq 72(%rsp),%rdx movq 80(%rsp),%rcx leaq 48(%rdi),%rdi call __smulq_384_n_shift_by_62 movq 96(%rsi),%rax movq 152(%rsi),%r11 movq %rdx,%rbx movq %rax,%r10 imulq 56(%rsp) movq %rax,%r8 movq %r11,%rax movq %rdx,%r9 imulq 64(%rsp) addq %rax,%r8 adcq %rdx,%r9 movq %r8,48(%rdi) movq %r9,56(%rdi) sarq $63,%r9 movq %r9,64(%rdi) movq %r9,72(%rdi) movq %r9,80(%rdi) movq %r9,88(%rdi) movq %r9,96(%rdi) leaq 96(%rsi),%rsi movq %r10,%rax imulq %rbx movq %rax,%r8 movq %r11,%rax movq %rdx,%r9 imulq %rcx addq %rax,%r8 adcq %rdx,%r9 movq %r8,104(%rdi) movq %r9,112(%rdi) sarq $63,%r9 movq %r9,120(%rdi) movq %r9,128(%rdi) movq %r9,136(%rdi) movq %r9,144(%rdi) movq %r9,152(%rdi) xorq $256+96,%rsi movl $62,%edi call __ab_approximation_62 movq %r12,72(%rsp) movq %r13,80(%rsp) movq $256,%rdi xorq %rsi,%rdi call __smulq_384_n_shift_by_62 movq %rdx,56(%rsp) movq %rcx,64(%rsp) movq 72(%rsp),%rdx movq 80(%rsp),%rcx leaq 48(%rdi),%rdi call __smulq_384_n_shift_by_62 movq %rdx,72(%rsp) movq %rcx,80(%rsp) movq 56(%rsp),%rdx movq 64(%rsp),%rcx leaq 96(%rsi),%rsi leaq 48(%rdi),%rdi call __smulq_384x63 movq 72(%rsp),%rdx movq 80(%rsp),%rcx leaq 56(%rdi),%rdi call __smulq_384x63 xorq $256+96,%rsi movl $62,%edi call __ab_approximation_62 movq %r12,72(%rsp) movq %r13,80(%rsp) movq $256,%rdi xorq %rsi,%rdi call __smulq_384_n_shift_by_62 movq %rdx,56(%rsp) movq %rcx,64(%rsp) movq 72(%rsp),%rdx movq 80(%rsp),%rcx leaq 48(%rdi),%rdi call __smulq_384_n_shift_by_62 movq %rdx,72(%rsp) movq %rcx,80(%rsp) movq 56(%rsp),%rdx movq 64(%rsp),%rcx leaq 96(%rsi),%rsi leaq 48(%rdi),%rdi call __smulq_384x63 movq 72(%rsp),%rdx movq 80(%rsp),%rcx leaq 
56(%rdi),%rdi call __smulq_384x63 xorq $256+96,%rsi movl $62,%edi call __ab_approximation_62 movq %r12,72(%rsp) movq %r13,80(%rsp) movq $256,%rdi xorq %rsi,%rdi call __smulq_384_n_shift_by_62 movq %rdx,56(%rsp) movq %rcx,64(%rsp) movq 72(%rsp),%rdx movq 80(%rsp),%rcx leaq 48(%rdi),%rdi call __smulq_384_n_shift_by_62 movq %rdx,72(%rsp) movq %rcx,80(%rsp) movq 56(%rsp),%rdx movq 64(%rsp),%rcx leaq 96(%rsi),%rsi leaq 48(%rdi),%rdi call __smulq_384x63 movq 72(%rsp),%rdx movq 80(%rsp),%rcx leaq 56(%rdi),%rdi call __smulq_384x63 xorq $256+96,%rsi movl $62,%edi call __ab_approximation_62 movq %r12,72(%rsp) movq %r13,80(%rsp) movq $256,%rdi xorq %rsi,%rdi call __smulq_384_n_shift_by_62 movq %rdx,56(%rsp) movq %rcx,64(%rsp) movq 72(%rsp),%rdx movq 80(%rsp),%rcx leaq 48(%rdi),%rdi call __smulq_384_n_shift_by_62 movq %rdx,72(%rsp) movq %rcx,80(%rsp) movq 56(%rsp),%rdx movq 64(%rsp),%rcx leaq 96(%rsi),%rsi leaq 48(%rdi),%rdi call __smulq_384x63 movq 72(%rsp),%rdx movq 80(%rsp),%rcx leaq 56(%rdi),%rdi call __smulq_384x63 movq %r14,56(%rdi) movq %r14,64(%rdi) movq %r14,72(%rdi) movq %r14,80(%rdi) movq %r14,88(%rdi) xorq $256+96,%rsi movl $62,%edi call __ab_approximation_62 movq %r12,72(%rsp) movq %r13,80(%rsp) movq $256,%rdi xorq %rsi,%rdi call __smulq_384_n_shift_by_62 movq %rdx,56(%rsp) movq %rcx,64(%rsp) movq 72(%rsp),%rdx movq 80(%rsp),%rcx leaq 48(%rdi),%rdi call __smulq_384_n_shift_by_62 movq %rdx,72(%rsp) movq %rcx,80(%rsp) movq 56(%rsp),%rdx movq 64(%rsp),%rcx leaq 96(%rsi),%rsi leaq 48(%rdi),%rdi call __smulq_384x63 movq 72(%rsp),%rdx movq 80(%rsp),%rcx leaq 56(%rdi),%rdi call __smulq_768x63 xorq $256+96,%rsi movl $62,%edi call __ab_approximation_62 movq %r12,72(%rsp) movq %r13,80(%rsp) movq $256,%rdi xorq %rsi,%rdi call __smulq_384_n_shift_by_62 movq %rdx,56(%rsp) movq %rcx,64(%rsp) movq 72(%rsp),%rdx movq 80(%rsp),%rcx leaq 48(%rdi),%rdi call __smulq_384_n_shift_by_62 movq %rdx,72(%rsp) movq %rcx,80(%rsp) movq 56(%rsp),%rdx movq 64(%rsp),%rcx leaq 96(%rsi),%rsi leaq 48(%rdi),%rdi call __smulq_384x63 movq 72(%rsp),%rdx movq 80(%rsp),%rcx leaq 56(%rdi),%rdi call __smulq_768x63 xorq $256+96,%rsi movl $62,%edi call __ab_approximation_62 movq %r12,72(%rsp) movq %r13,80(%rsp) movq $256,%rdi xorq %rsi,%rdi call __smulq_384_n_shift_by_62 movq %rdx,56(%rsp) movq %rcx,64(%rsp) movq 72(%rsp),%rdx movq 80(%rsp),%rcx leaq 48(%rdi),%rdi call __smulq_384_n_shift_by_62 movq %rdx,72(%rsp) movq %rcx,80(%rsp) movq 56(%rsp),%rdx movq 64(%rsp),%rcx leaq 96(%rsi),%rsi leaq 48(%rdi),%rdi call __smulq_384x63 movq 72(%rsp),%rdx movq 80(%rsp),%rcx leaq 56(%rdi),%rdi call __smulq_768x63 xorq $256+96,%rsi movl $62,%edi call __ab_approximation_62 movq %r12,72(%rsp) movq %r13,80(%rsp) movq $256,%rdi xorq %rsi,%rdi call __smulq_384_n_shift_by_62 movq %rdx,56(%rsp) movq %rcx,64(%rsp) movq 72(%rsp),%rdx movq 80(%rsp),%rcx leaq 48(%rdi),%rdi call __smulq_384_n_shift_by_62 movq %rdx,72(%rsp) movq %rcx,80(%rsp) movq 56(%rsp),%rdx movq 64(%rsp),%rcx leaq 96(%rsi),%rsi leaq 48(%rdi),%rdi call __smulq_384x63 movq 72(%rsp),%rdx movq 80(%rsp),%rcx leaq 56(%rdi),%rdi call __smulq_768x63 xorq $256+96,%rsi movl $62,%edi call __ab_approximation_62 movq %r12,72(%rsp) movq %r13,80(%rsp) movq $256,%rdi xorq %rsi,%rdi call __smulq_384_n_shift_by_62 movq %rdx,56(%rsp) movq %rcx,64(%rsp) movq 72(%rsp),%rdx movq 80(%rsp),%rcx leaq 48(%rdi),%rdi call __smulq_384_n_shift_by_62 movq %rdx,72(%rsp) movq %rcx,80(%rsp) movq 56(%rsp),%rdx movq 64(%rsp),%rcx leaq 96(%rsi),%rsi leaq 48(%rdi),%rdi call __smulq_384x63 movq 72(%rsp),%rdx movq 80(%rsp),%rcx 
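# In these later iterations |u| still fits in 384+63 bits, but |v| has
# outgrown that bound, so its multiply-accumulate switches from
# __smulq_384x63 to __smulq_768x63 below, which carries the signed product
# across the full twelve-limb (768-bit) layout.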
leaq 56(%rdi),%rdi call __smulq_768x63 xorq $256+96,%rsi movl $62,%edi movq 0(%rsi),%r8 movq 8(%rsi),%r9 movq 48(%rsi),%r10 movq 56(%rsi),%r11 call __inner_loop_62 movq %r12,72(%rsp) movq %r13,80(%rsp) movq $256,%rdi xorq %rsi,%rdi movq %r8,0(%rdi) movq %r10,48(%rdi) leaq 96(%rsi),%rsi leaq 96(%rdi),%rdi call __smulq_384x63 movq 72(%rsp),%rdx movq 80(%rsp),%rcx leaq 56(%rdi),%rdi call __smulq_768x63 xorq $256+96,%rsi movl $24,%edi movq 0(%rsi),%r8 xorq %r9,%r9 movq 48(%rsi),%r10 xorq %r11,%r11 call __inner_loop_62 leaq 96(%rsi),%rsi movq %r12,%rdx movq %r13,%rcx movq 32(%rsp),%rdi call __smulq_768x63 movq 40(%rsp),%rsi movq %rdx,%r13 sarq $63,%r13 movq %r13,%r8 movq %r13,%r9 movq %r13,%r10 andq 0(%rsi),%r8 andq 8(%rsi),%r9 movq %r13,%r11 andq 16(%rsi),%r10 andq 24(%rsi),%r11 movq %r13,%r12 andq 32(%rsi),%r12 andq 40(%rsi),%r13 addq %r8,%r14 adcq %r9,%r15 adcq %r10,%rbx adcq %r11,%rbp adcq %r12,%rcx adcq %r13,%rax adcq $0,%rdx movq %rdx,%r13 negq %rdx orq %rdx,%r13 sarq $63,%rdx movq %r13,%r8 movq %r13,%r9 movq %r13,%r10 andq 0(%rsi),%r8 andq 8(%rsi),%r9 movq %r13,%r11 andq 16(%rsi),%r10 andq 24(%rsi),%r11 movq %r13,%r12 andq 32(%rsi),%r12 andq 40(%rsi),%r13 xorq %rdx,%r8 xorq %rsi,%rsi xorq %rdx,%r9 subq %rdx,%rsi xorq %rdx,%r10 xorq %rdx,%r11 xorq %rdx,%r12 xorq %rdx,%r13 addq %rsi,%r8 adcq $0,%r9 adcq $0,%r10 adcq $0,%r11 adcq $0,%r12 adcq $0,%r13 addq %r8,%r14 adcq %r9,%r15 adcq %r10,%rbx adcq %r11,%rbp adcq %r12,%rcx adcq %r13,%rax movq %r14,48(%rdi) movq %r15,56(%rdi) movq %rbx,64(%rdi) movq %rbp,72(%rdi) movq %rcx,80(%rdi) movq %rax,88(%rdi) leaq 1112(%rsp),%r8 movq 0(%r8),%r15 movq 8(%r8),%r14 movq 16(%r8),%r13 movq 24(%r8),%r12 movq 32(%r8),%rbx movq 40(%r8),%rbp leaq 48(%r8),%rsp .LSEH_epilogue_ct_inverse_mod_384: mov 8(%rsp),%rdi mov 16(%rsp),%rsi #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .LSEH_end_ct_inverse_mod_384: .def __smulq_768x63; .scl 3; .type 32; .endef .p2align 5 __smulq_768x63: .byte 0xf3,0x0f,0x1e,0xfa movq 0(%rsi),%r8 movq 8(%rsi),%r9 movq 16(%rsi),%r10 movq 24(%rsi),%r11 movq 32(%rsi),%r12 movq 40(%rsi),%r13 movq 48(%rsi),%r14 movq %rdx,%rbp sarq $63,%rdx xorq %rax,%rax subq %rdx,%rax movq %rdi,8(%rsp) movq %rsi,16(%rsp) leaq 56(%rsi),%rsi xorq %rdx,%rbp addq %rax,%rbp xorq %rdx,%r8 xorq %rdx,%r9 xorq %rdx,%r10 xorq %rdx,%r11 xorq %rdx,%r12 xorq %rdx,%r13 xorq %rdx,%r14 addq %r8,%rax adcq $0,%r9 adcq $0,%r10 adcq $0,%r11 adcq $0,%r12 adcq $0,%r13 adcq $0,%r14 mulq %rbp movq %rax,0(%rdi) movq %r9,%rax andq %rbp,%r14 negq %r14 movq %rdx,%r9 mulq %rbp addq %rax,%r9 movq %r10,%rax adcq $0,%rdx movq %rdx,%r10 movq %r9,8(%rdi) mulq %rbp addq %rax,%r10 movq %r11,%rax adcq $0,%rdx movq %rdx,%r11 movq %r10,16(%rdi) mulq %rbp addq %rax,%r11 movq %r12,%rax adcq $0,%rdx movq %rdx,%r12 movq %r11,24(%rdi) mulq %rbp addq %rax,%r12 movq %r13,%rax adcq $0,%rdx movq %rdx,%r13 movq %r12,32(%rdi) mulq %rbp addq %rax,%r13 adcq %rdx,%r14 movq %r13,40(%rdi) movq %r14,48(%rdi) sarq $63,%r14 movq %r14,56(%rdi) movq %rcx,%rdx movq 0(%rsi),%r8 movq 8(%rsi),%r9 movq 16(%rsi),%r10 movq 24(%rsi),%r11 movq 32(%rsi),%r12 movq 40(%rsi),%r13 movq 48(%rsi),%r14 movq 56(%rsi),%r15 movq 64(%rsi),%rbx movq 72(%rsi),%rbp movq 80(%rsi),%rcx movq 88(%rsi),%rdi movq %rdx,%rsi sarq $63,%rdx xorq %rax,%rax subq %rdx,%rax xorq %rdx,%rsi addq %rax,%rsi xorq %rdx,%r8 xorq %rdx,%r9 xorq %rdx,%r10 xorq %rdx,%r11 xorq %rdx,%r12 xorq %rdx,%r13 xorq %rdx,%r14 xorq %rdx,%r15 xorq %rdx,%rbx xorq %rdx,%rbp xorq %rdx,%rcx xorq %rdx,%rdi addq %r8,%rax adcq $0,%r9 adcq 
$0,%r10 adcq $0,%r11 adcq $0,%r12 adcq $0,%r13 adcq $0,%r14 adcq $0,%r15 adcq $0,%rbx adcq $0,%rbp adcq $0,%rcx adcq $0,%rdi mulq %rsi movq %rax,%r8 movq %r9,%rax movq %rdx,%r9 mulq %rsi addq %rax,%r9 movq %r10,%rax adcq $0,%rdx movq %rdx,%r10 mulq %rsi addq %rax,%r10 movq %r11,%rax adcq $0,%rdx movq %rdx,%r11 mulq %rsi addq %rax,%r11 movq %r12,%rax adcq $0,%rdx movq %rdx,%r12 mulq %rsi addq %rax,%r12 movq %r13,%rax adcq $0,%rdx movq %rdx,%r13 mulq %rsi addq %rax,%r13 movq %r14,%rax adcq $0,%rdx movq %rdx,%r14 mulq %rsi addq %rax,%r14 movq %r15,%rax adcq $0,%rdx movq %rdx,%r15 mulq %rsi addq %rax,%r15 movq %rbx,%rax adcq $0,%rdx movq %rdx,%rbx mulq %rsi addq %rax,%rbx movq %rbp,%rax adcq $0,%rdx movq %rdx,%rbp mulq %rsi addq %rax,%rbp movq %rcx,%rax adcq $0,%rdx movq %rdx,%rcx mulq %rsi addq %rax,%rcx movq %rdi,%rax adcq $0,%rdx movq %rdx,%rdi imulq %rsi movq 8(%rsp),%rsi addq %rdi,%rax adcq $0,%rdx addq 0(%rsi),%r8 adcq 8(%rsi),%r9 adcq 16(%rsi),%r10 adcq 24(%rsi),%r11 adcq 32(%rsi),%r12 adcq 40(%rsi),%r13 adcq 48(%rsi),%r14 movq 56(%rsi),%rdi adcq %rdi,%r15 adcq %rdi,%rbx adcq %rdi,%rbp adcq %rdi,%rcx adcq %rdi,%rax adcq %rdi,%rdx leaq (%rsi),%rdi movq 16(%rsp),%rsi movq %r8,0(%rdi) movq %r9,8(%rdi) movq %r10,16(%rdi) movq %r11,24(%rdi) movq %r12,32(%rdi) movq %r13,40(%rdi) movq %r14,48(%rdi) movq %r15,56(%rdi) movq %rbx,64(%rdi) movq %rbp,72(%rdi) movq %rcx,80(%rdi) movq %rax,88(%rdi) #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .def __smulq_384x63; .scl 3; .type 32; .endef .p2align 5 __smulq_384x63: .byte 0xf3,0x0f,0x1e,0xfa movq 0(%rsi),%r8 movq 8(%rsi),%r9 movq 16(%rsi),%r10 movq 24(%rsi),%r11 movq 32(%rsi),%r12 movq 40(%rsi),%r13 movq 48(%rsi),%r14 movq %rdx,%rbp sarq $63,%rdx xorq %rax,%rax subq %rdx,%rax xorq %rdx,%rbp addq %rax,%rbp xorq %rdx,%r8 xorq %rdx,%r9 xorq %rdx,%r10 xorq %rdx,%r11 xorq %rdx,%r12 xorq %rdx,%r13 xorq %rdx,%r14 addq %r8,%rax adcq $0,%r9 adcq $0,%r10 adcq $0,%r11 adcq $0,%r12 adcq $0,%r13 adcq $0,%r14 mulq %rbp movq %rax,%r8 movq %r9,%rax andq %rbp,%r14 negq %r14 movq %rdx,%r9 mulq %rbp addq %rax,%r9 movq %r10,%rax adcq $0,%rdx movq %rdx,%r10 mulq %rbp addq %rax,%r10 movq %r11,%rax adcq $0,%rdx movq %rdx,%r11 mulq %rbp addq %rax,%r11 movq %r12,%rax adcq $0,%rdx movq %rdx,%r12 mulq %rbp addq %rax,%r12 movq %r13,%rax adcq $0,%rdx movq %rdx,%r13 mulq %rbp addq %rax,%r13 adcq %rdx,%r14 leaq 56(%rsi),%rsi movq %rcx,%rdx movq %r8,0(%rdi) movq %r9,8(%rdi) movq %r10,16(%rdi) movq %r11,24(%rdi) movq %r12,32(%rdi) movq %r13,%r15 movq %r14,%rbx movq 0(%rsi),%r8 movq 8(%rsi),%r9 movq 16(%rsi),%r10 movq 24(%rsi),%r11 movq 32(%rsi),%r12 movq 40(%rsi),%r13 movq 48(%rsi),%r14 movq %rdx,%rbp sarq $63,%rdx xorq %rax,%rax subq %rdx,%rax xorq %rdx,%rbp addq %rax,%rbp xorq %rdx,%r8 xorq %rdx,%r9 xorq %rdx,%r10 xorq %rdx,%r11 xorq %rdx,%r12 xorq %rdx,%r13 xorq %rdx,%r14 addq %r8,%rax adcq $0,%r9 adcq $0,%r10 adcq $0,%r11 adcq $0,%r12 adcq $0,%r13 adcq $0,%r14 mulq %rbp movq %rax,%r8 movq %r9,%rax andq %rbp,%r14 negq %r14 movq %rdx,%r9 mulq %rbp addq %rax,%r9 movq %r10,%rax adcq $0,%rdx movq %rdx,%r10 mulq %rbp addq %rax,%r10 movq %r11,%rax adcq $0,%rdx movq %rdx,%r11 mulq %rbp addq %rax,%r11 movq %r12,%rax adcq $0,%rdx movq %rdx,%r12 mulq %rbp addq %rax,%r12 movq %r13,%rax adcq $0,%rdx movq %rdx,%r13 mulq %rbp addq %rax,%r13 adcq %rdx,%r14 leaq -56(%rsi),%rsi addq 0(%rdi),%r8 adcq 8(%rdi),%r9 adcq 16(%rdi),%r10 adcq 24(%rdi),%r11 adcq 32(%rdi),%r12 adcq %r15,%r13 adcq %rbx,%r14 movq %r8,0(%rdi) movq %r9,8(%rdi) movq %r10,16(%rdi) movq 
%r11,24(%rdi) movq %r12,32(%rdi) movq %r13,40(%rdi) movq %r14,48(%rdi) #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .def __smulq_384_n_shift_by_62; .scl 3; .type 32; .endef .p2align 5 __smulq_384_n_shift_by_62: .byte 0xf3,0x0f,0x1e,0xfa movq %rdx,%rbx movq 0(%rsi),%r8 movq 8(%rsi),%r9 movq 16(%rsi),%r10 movq 24(%rsi),%r11 movq 32(%rsi),%r12 movq 40(%rsi),%r13 movq %rdx,%rbp sarq $63,%rdx xorq %rax,%rax subq %rdx,%rax xorq %rdx,%rbp addq %rax,%rbp xorq %rdx,%r8 xorq %rdx,%r9 xorq %rdx,%r10 xorq %rdx,%r11 xorq %rdx,%r12 xorq %rdx,%r13 movq %rdx,%r14 addq %r8,%rax adcq $0,%r9 adcq $0,%r10 adcq $0,%r11 adcq $0,%r12 adcq $0,%r13 mulq %rbp movq %rax,%r8 movq %r9,%rax andq %rbp,%r14 negq %r14 movq %rdx,%r9 mulq %rbp addq %rax,%r9 movq %r10,%rax adcq $0,%rdx movq %rdx,%r10 mulq %rbp addq %rax,%r10 movq %r11,%rax adcq $0,%rdx movq %rdx,%r11 mulq %rbp addq %rax,%r11 movq %r12,%rax adcq $0,%rdx movq %rdx,%r12 mulq %rbp addq %rax,%r12 movq %r13,%rax adcq $0,%rdx movq %rdx,%r13 mulq %rbp addq %rax,%r13 adcq %rdx,%r14 leaq 48(%rsi),%rsi movq %rcx,%rdx movq %r8,0(%rdi) movq %r9,8(%rdi) movq %r10,16(%rdi) movq %r11,24(%rdi) movq %r12,32(%rdi) movq %r13,40(%rdi) movq 0(%rsi),%r8 movq 8(%rsi),%r9 movq 16(%rsi),%r10 movq 24(%rsi),%r11 movq 32(%rsi),%r12 movq 40(%rsi),%r13 movq %rdx,%rbp sarq $63,%rdx xorq %rax,%rax subq %rdx,%rax xorq %rdx,%rbp addq %rax,%rbp xorq %rdx,%r8 xorq %rdx,%r9 xorq %rdx,%r10 xorq %rdx,%r11 xorq %rdx,%r12 xorq %rdx,%r13 movq %rdx,%r15 addq %r8,%rax adcq $0,%r9 adcq $0,%r10 adcq $0,%r11 adcq $0,%r12 adcq $0,%r13 mulq %rbp movq %rax,%r8 movq %r9,%rax andq %rbp,%r15 negq %r15 movq %rdx,%r9 mulq %rbp addq %rax,%r9 movq %r10,%rax adcq $0,%rdx movq %rdx,%r10 mulq %rbp addq %rax,%r10 movq %r11,%rax adcq $0,%rdx movq %rdx,%r11 mulq %rbp addq %rax,%r11 movq %r12,%rax adcq $0,%rdx movq %rdx,%r12 mulq %rbp addq %rax,%r12 movq %r13,%rax adcq $0,%rdx movq %rdx,%r13 mulq %rbp addq %rax,%r13 adcq %rdx,%r15 leaq -48(%rsi),%rsi movq %rbx,%rdx addq 0(%rdi),%r8 adcq 8(%rdi),%r9 adcq 16(%rdi),%r10 adcq 24(%rdi),%r11 adcq 32(%rdi),%r12 adcq 40(%rdi),%r13 adcq %r15,%r14 shrdq $62,%r9,%r8 shrdq $62,%r10,%r9 shrdq $62,%r11,%r10 shrdq $62,%r12,%r11 shrdq $62,%r13,%r12 shrdq $62,%r14,%r13 sarq $63,%r14 xorq %rbp,%rbp subq %r14,%rbp xorq %r14,%r8 xorq %r14,%r9 xorq %r14,%r10 xorq %r14,%r11 xorq %r14,%r12 xorq %r14,%r13 addq %rbp,%r8 adcq $0,%r9 adcq $0,%r10 adcq $0,%r11 adcq $0,%r12 adcq $0,%r13 movq %r8,0(%rdi) movq %r9,8(%rdi) movq %r10,16(%rdi) movq %r11,24(%rdi) movq %r12,32(%rdi) movq %r13,40(%rdi) xorq %r14,%rdx xorq %r14,%rcx addq %rbp,%rdx addq %rbp,%rcx #ifdef __SGX_LVI_HARDENING__ popq %r8 lfence jmpq *%r8 ud2 #else .byte 0xf3,0xc3 #endif .def __ab_approximation_62; .scl 3; .type 32; .endef .p2align 5 __ab_approximation_62: .byte 0xf3,0x0f,0x1e,0xfa movq 40(%rsi),%r9 movq 88(%rsi),%r11 movq 32(%rsi),%rbx movq 80(%rsi),%rbp movq 24(%rsi),%r8 movq 72(%rsi),%r10 movq %r9,%rax orq %r11,%rax cmovzq %rbx,%r9 cmovzq %rbp,%r11 cmovzq %r8,%rbx cmovzq %r10,%rbp movq 16(%rsi),%r8 movq 64(%rsi),%r10 movq %r9,%rax orq %r11,%rax cmovzq %rbx,%r9 cmovzq %rbp,%r11 cmovzq %r8,%rbx cmovzq %r10,%rbp movq 8(%rsi),%r8 movq 56(%rsi),%r10 movq %r9,%rax orq %r11,%rax cmovzq %rbx,%r9 cmovzq %rbp,%r11 cmovzq %r8,%rbx cmovzq %r10,%rbp movq 0(%rsi),%r8 movq 48(%rsi),%r10 movq %r9,%rax orq %r11,%rax bsrq %rax,%rcx leaq 1(%rcx),%rcx cmovzq %rbx,%r9 cmovzq %rbp,%r11 cmovzq %rax,%rcx negq %rcx shldq %cl,%rbx,%r9 shldq %cl,%rbp,%r11 jmp __inner_loop_62 #ifdef __SGX_LVI_HARDENING__ popq 
%rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .def __inner_loop_62; .scl 3; .type 32; .endef .p2align 3 .long 0 __inner_loop_62: .byte 0xf3,0x0f,0x1e,0xfa movq $1,%rdx xorq %rcx,%rcx xorq %r12,%r12 movq $1,%r13 movq %rsi,8(%rsp) .Loop_62: xorq %rax,%rax xorq %rbx,%rbx testq $1,%r8 movq %r10,%rbp movq %r11,%r14 cmovnzq %r10,%rax cmovnzq %r11,%rbx subq %r8,%rbp sbbq %r9,%r14 movq %r8,%r15 movq %r9,%rsi subq %rax,%r8 sbbq %rbx,%r9 cmovcq %rbp,%r8 cmovcq %r14,%r9 cmovcq %r15,%r10 cmovcq %rsi,%r11 movq %rdx,%rax cmovcq %r12,%rdx cmovcq %rax,%r12 movq %rcx,%rbx cmovcq %r13,%rcx cmovcq %rbx,%r13 xorq %rax,%rax xorq %rbx,%rbx shrdq $1,%r9,%r8 shrq $1,%r9 testq $1,%r15 cmovnzq %r12,%rax cmovnzq %r13,%rbx addq %r12,%r12 addq %r13,%r13 subq %rax,%rdx subq %rbx,%rcx subl $1,%edi jnz .Loop_62 movq 8(%rsp),%rsi #ifdef __SGX_LVI_HARDENING__ popq %rax lfence jmpq *%rax ud2 #else .byte 0xf3,0xc3 #endif .section .pdata .p2align 2 .rva .LSEH_begin_ct_inverse_mod_384 .rva .LSEH_body_ct_inverse_mod_384 .rva .LSEH_info_ct_inverse_mod_384_prologue .rva .LSEH_body_ct_inverse_mod_384 .rva .LSEH_epilogue_ct_inverse_mod_384 .rva .LSEH_info_ct_inverse_mod_384_body .rva .LSEH_epilogue_ct_inverse_mod_384 .rva .LSEH_end_ct_inverse_mod_384 .rva .LSEH_info_ct_inverse_mod_384_epilogue .section .xdata .p2align 3 .LSEH_info_ct_inverse_mod_384_prologue: .byte 1,0,5,0x0b .byte 0,0x74,1,0 .byte 0,0x64,2,0 .byte 0,0xb3 .byte 0,0 .long 0,0 .LSEH_info_ct_inverse_mod_384_body: .byte 1,0,18,0 .byte 0x00,0xf4,0x8b,0x00 .byte 0x00,0xe4,0x8c,0x00 .byte 0x00,0xd4,0x8d,0x00 .byte 0x00,0xc4,0x8e,0x00 .byte 0x00,0x34,0x8f,0x00 .byte 0x00,0x54,0x90,0x00 .byte 0x00,0x74,0x92,0x00 .byte 0x00,0x64,0x93,0x00 .byte 0x00,0x01,0x91,0x00 .byte 0x00,0x00,0x00,0x00 .byte 0x00,0x00,0x00,0x00 .LSEH_info_ct_inverse_mod_384_epilogue: .byte 1,0,4,0 .byte 0x00,0x74,0x01,0x00 .byte 0x00,0x64,0x02,0x00 .byte 0x00,0x00,0x00,0x00 ================================================ FILE: build/coff/ctx_inverse_mod_384-x86_64.s ================================================ .text .globl ctx_inverse_mod_384 .def ctx_inverse_mod_384; .scl 2; .type 32; .endef .p2align 5 ctx_inverse_mod_384: .byte 0xf3,0x0f,0x1e,0xfa movq %rdi,8(%rsp) movq %rsi,16(%rsp) movq %rsp,%r11 .LSEH_begin_ctx_inverse_mod_384: movq %rcx,%rdi movq %rdx,%rsi movq %r8,%rdx movq %r9,%rcx ct_inverse_mod_384$1: pushq %rbp pushq %rbx pushq %r12 pushq %r13 pushq %r14 pushq %r15 subq $1112,%rsp .LSEH_body_ctx_inverse_mod_384: leaq 88+511(%rsp),%rax andq $-512,%rax movq %rdi,32(%rsp) movq %rcx,40(%rsp) #ifdef __SGX_LVI_HARDENING__ lfence #endif movq 0(%rsi),%r8 movq 8(%rsi),%r9 movq 16(%rsi),%r10 movq 24(%rsi),%r11 movq 32(%rsi),%r12 movq 40(%rsi),%r13 movq 0(%rdx),%r14 movq 8(%rdx),%r15 movq 16(%rdx),%rbx movq 24(%rdx),%rbp movq 32(%rdx),%rsi movq 40(%rdx),%rdi movq %r8,0(%rax) movq %r9,8(%rax) movq %r10,16(%rax) movq %r11,24(%rax) movq %r12,32(%rax) movq %r13,40(%rax) movq %r14,48(%rax) movq %r15,56(%rax) movq %rbx,64(%rax) movq %rbp,72(%rax) movq %rsi,80(%rax) movq %rax,%rsi movq %rdi,88(%rax) movl $31,%edi call __ab_approximation_31 movq %r12,72(%rsp) movq %r13,80(%rsp) movq $256,%rdi xorq %rsi,%rdi call __smulx_384_n_shift_by_31 movq %rdx,96(%rdi) movq 72(%rsp),%rdx movq 80(%rsp),%rcx leaq 48(%rdi),%rdi call __smulx_384_n_shift_by_31 movq %rdx,104(%rdi) xorq $256,%rsi movl $31,%edi call __ab_approximation_31 movq %r12,72(%rsp) movq %r13,80(%rsp) movq $256,%rdi xorq %rsi,%rdi call __smulx_384_n_shift_by_31 movq %rdx,56(%rsp) movq %rcx,64(%rsp) movq 72(%rsp),%rdx movq 80(%rsp),%rcx leaq 
48(%rdi),%rdi call __smulx_384_n_shift_by_31 movq 96(%rsi),%rax movq 152(%rsi),%r11 movq %rdx,%rbx movq %rax,%r10 imulq 56(%rsp) movq %rax,%r8 movq %r11,%rax movq %rdx,%r9 imulq 64(%rsp) addq %rax,%r8 adcq %rdx,%r9 movq %r8,48(%rdi) movq %r9,56(%rdi) sarq $63,%r9 movq %r9,64(%rdi) movq %r9,72(%rdi) movq %r9,80(%rdi) movq %r9,88(%rdi) movq %r9,96(%rdi) leaq 96(%rsi),%rsi movq %r10,%rax imulq %rbx movq %rax,%r8 movq %r11,%rax movq %rdx,%r9 imulq %rcx addq %rax,%r8 adcq %rdx,%r9 movq %r8,104(%rdi) movq %r9,112(%rdi) sarq $63,%r9 movq %r9,120(%rdi) movq %r9,128(%rdi) movq %r9,136(%rdi) movq %r9,144(%rdi) movq %r9,152(%rdi) xorq $256+96,%rsi movl $31,%edi call __ab_approximation_31 movq %r12,72(%rsp) movq %r13,80(%rsp) movq $256,%rdi xorq %rsi,%rdi call __smulx_384_n_shift_by_31 movq %rdx,56(%rsp) movq %rcx,64(%rsp) movq 72(%rsp),%rdx movq 80(%rsp),%rcx leaq 48(%rdi),%rdi call __smulx_384_n_shift_by_31 movq %rdx,72(%rsp) movq %rcx,80(%rsp) movq 56(%rsp),%rdx movq 64(%rsp),%rcx leaq 96(%rsi),%rsi leaq 48(%rdi),%rdi call __smulx_384x63 movq 72(%rsp),%rdx movq 80(%rsp),%rcx leaq 56(%rdi),%rdi call __smulx_384x63 xorq $256+96,%rsi movl $31,%edi call __ab_approximation_31 movq %r12,72(%rsp) movq %r13,80(%rsp) movq $256,%rdi xorq %rsi,%rdi call __smulx_384_n_shift_by_31 movq %rdx,56(%rsp) movq %rcx,64(%rsp) movq 72(%rsp),%rdx movq 80(%rsp),%rcx leaq 48(%rdi),%rdi call __smulx_384_n_shift_by_31 movq %rdx,72(%rsp) movq %rcx,80(%rsp) movq 56(%rsp),%rdx movq 64(%rsp),%rcx leaq 96(%rsi),%rsi leaq 48(%rdi),%rdi call __smulx_384x63 movq 72(%rsp),%rdx movq 80(%rsp),%rcx leaq 56(%rdi),%rdi call __smulx_384x63 xorq $256+96,%rsi movl $31,%edi call __ab_approximation_31 movq %r12,72(%rsp) movq %r13,80(%rsp) movq $256,%rdi xorq %rsi,%rdi call __smulx_384_n_shift_by_31 movq %rdx,56(%rsp) movq %rcx,64(%rsp) movq 72(%rsp),%rdx movq 80(%rsp),%rcx leaq 48(%rdi),%rdi call __smulx_384_n_shift_by_31 movq %rdx,72(%rsp) movq %rcx,80(%rsp) movq 56(%rsp),%rdx movq 64(%rsp),%rcx leaq 96(%rsi),%rsi leaq 48(%rdi),%rdi call __smulx_384x63 movq 72(%rsp),%rdx movq 80(%rsp),%rcx leaq 56(%rdi),%rdi call __smulx_384x63 xorq $256+96,%rsi movl $31,%edi call __ab_approximation_31 movq %r12,72(%rsp) movq %r13,80(%rsp) movq $256,%rdi xorq %rsi,%rdi call __smulx_384_n_shift_by_31 movq %rdx,56(%rsp) movq %rcx,64(%rsp) movq 72(%rsp),%rdx movq 80(%rsp),%rcx leaq 48(%rdi),%rdi call __smulx_384_n_shift_by_31 movq %rdx,72(%rsp) movq %rcx,80(%rsp) movq 56(%rsp),%rdx movq 64(%rsp),%rcx leaq 96(%rsi),%rsi leaq 48(%rdi),%rdi call __smulx_384x63 movq 72(%rsp),%rdx movq 80(%rsp),%rcx leaq 56(%rdi),%rdi call __smulx_384x63 xorq $256+96,%rsi movl $31,%edi call __ab_approximation_31 movq %r12,72(%rsp) movq %r13,80(%rsp) movq $256,%rdi xorq %rsi,%rdi call __smulx_384_n_shift_by_31 movq %rdx,56(%rsp) movq %rcx,64(%rsp) movq 72(%rsp),%rdx movq 80(%rsp),%rcx leaq 48(%rdi),%rdi call __smulx_384_n_shift_by_31 movq %rdx,72(%rsp) movq %rcx,80(%rsp) movq 56(%rsp),%rdx movq 64(%rsp),%rcx leaq 96(%rsi),%rsi leaq 48(%rdi),%rdi call __smulx_384x63 movq 72(%rsp),%rdx movq 80(%rsp),%rcx leaq 56(%rdi),%rdi call __smulx_384x63 xorq $256+96,%rsi movl $31,%edi call __ab_approximation_31 movq %r12,72(%rsp) movq %r13,80(%rsp) movq $256,%rdi xorq %rsi,%rdi call __smulx_384_n_shift_by_31 movq %rdx,56(%rsp) movq %rcx,64(%rsp) movq 72(%rsp),%rdx movq 80(%rsp),%rcx leaq 48(%rdi),%rdi call __smulx_384_n_shift_by_31 movq %rdx,72(%rsp) movq %rcx,80(%rsp) movq 56(%rsp),%rdx movq 64(%rsp),%rcx leaq 96(%rsi),%rsi leaq 48(%rdi),%rdi call __smulx_384x63 movq 72(%rsp),%rdx movq 
80(%rsp),%rcx leaq 56(%rdi),%rdi call __smulx_384x63 xorq $256+96,%rsi movl $31,%edi call __ab_approximation_31 movq %r12,72(%rsp) movq %r13,80(%rsp) movq $256,%rdi xorq %rsi,%rdi call __smulx_384_n_shift_by_31 movq %rdx,56(%rsp) movq %rcx,64(%rsp) movq 72(%rsp),%rdx movq 80(%rsp),%rcx leaq 48(%rdi),%rdi call __smulx_384_n_shift_by_31 movq %rdx,72(%rsp) movq %rcx,80(%rsp) movq 56(%rsp),%rdx movq 64(%rsp),%rcx leaq 96(%rsi),%rsi leaq 48(%rdi),%rdi call __smulx_384x63 movq 72(%rsp),%rdx movq 80(%rsp),%rcx leaq 56(%rdi),%rdi call __smulx_384x63 xorq $256+96,%rsi movl $31,%edi call __ab_approximation_31 movq %r12,72(%rsp) movq %r13,80(%rsp) movq $256,%rdi xorq %rsi,%rdi call __smulx_384_n_shift_by_31 movq %rdx,56(%rsp) movq %rcx,64(%rsp) movq 72(%rsp),%rdx movq 80(%rsp),%rcx leaq 48(%rdi),%rdi call __smulx_384_n_shift_by_31 movq %rdx,72(%rsp) movq %rcx,80(%rsp) movq 56(%rsp),%rdx movq 64(%rsp),%rcx leaq 96(%rsi),%rsi leaq 48(%rdi),%rdi call __smulx_384x63 movq 72(%rsp),%rdx movq 80(%rsp),%rcx leaq 56(%rdi),%rdi call __smulx_384x63 xorq $256+96,%rsi movl $31,%edi call __ab_approximation_31 movq %r12,72(%rsp) movq %r13,80(%rsp) movq $256,%rdi xorq %rsi,%rdi call __smulx_384_n_shift_by_31 movq %rdx,56(%rsp) movq %rcx,64(%rsp) movq 72(%rsp),%rdx movq 80(%rsp),%rcx leaq 48(%rdi),%rdi call __smulx_384_n_shift_by_31 movq %rdx,72(%rsp) movq %rcx,80(%rsp) movq 56(%rsp),%rdx movq 64(%rsp),%rcx leaq 96(%rsi),%rsi leaq 48(%rdi),%rdi call __smulx_384x63 movq 72(%rsp),%rdx movq 80(%rsp),%rcx leaq 56(%rdi),%rdi call __smulx_384x63 xorq $256+96,%rsi movl $31,%edi call __ab_approximation_31 movq %r12,72(%rsp) movq %r13,80(%rsp) movq $256,%rdi xorq %rsi,%rdi call __smulx_384_n_shift_by_31 movq %rdx,56(%rsp) movq %rcx,64(%rsp) movq 72(%rsp),%rdx movq 80(%rsp),%rcx leaq 48(%rdi),%rdi call __smulx_384_n_shift_by_31 movq %rdx,72(%rsp) movq %rcx,80(%rsp) movq 56(%rsp),%rdx movq 64(%rsp),%rcx leaq 96(%rsi),%rsi leaq 48(%rdi),%rdi call __smulx_384x63 movq 72(%rsp),%rdx movq 80(%rsp),%rcx leaq 56(%rdi),%rdi call __smulx_384x63 movq %r14,56(%rdi) movq %r14,64(%rdi) movq %r14,72(%rdi) movq %r14,80(%rdi) movq %r14,88(%rdi) xorq $256+96,%rsi movl $31,%edi call __ab_approximation_31 movq %r12,72(%rsp) movq %r13,80(%rsp) movq $256,%rdi xorq %rsi,%rdi call __smulx_384_n_shift_by_31 movq %rdx,56(%rsp) movq %rcx,64(%rsp) movq 72(%rsp),%rdx movq 80(%rsp),%rcx leaq 48(%rdi),%rdi call __smulx_384_n_shift_by_31 movq %rdx,72(%rsp) movq %rcx,80(%rsp) movq 56(%rsp),%rdx movq 64(%rsp),%rcx leaq 96(%rsi),%rsi leaq 48(%rdi),%rdi call __smulx_384x63 movq 72(%rsp),%rdx movq 80(%rsp),%rcx leaq 56(%rdi),%rdi call __smulx_768x63 xorq $256+96,%rsi movl $31,%edi call __ab_approximation_31 movq %r12,72(%rsp) movq %r13,80(%rsp) movq $256,%rdi xorq %rsi,%rdi call __smulx_384_n_shift_by_31 movq %rdx,56(%rsp) movq %rcx,64(%rsp) movq 72(%rsp),%rdx movq 80(%rsp),%rcx leaq 48(%rdi),%rdi call __smulx_384_n_shift_by_31 movq %rdx,72(%rsp) movq %rcx,80(%rsp) movq 56(%rsp),%rdx movq 64(%rsp),%rcx leaq 96(%rsi),%rsi leaq 48(%rdi),%rdi call __smulx_384x63 movq 72(%rsp),%rdx movq 80(%rsp),%rcx leaq 56(%rdi),%rdi call __smulx_768x63 xorq $256+96,%rsi movl $31,%edi call __ab_approximation_31 movq %r12,72(%rsp) movq %r13,80(%rsp) movq $256,%rdi xorq %rsi,%rdi call __smulx_384_n_shift_by_31 movq %rdx,56(%rsp) movq %rcx,64(%rsp) movq 72(%rsp),%rdx movq 80(%rsp),%rcx leaq 48(%rdi),%rdi call __smulx_384_n_shift_by_31 movq %rdx,72(%rsp) movq %rcx,80(%rsp) movq 56(%rsp),%rdx movq 64(%rsp),%rcx leaq 96(%rsi),%rsi leaq 48(%rdi),%rdi call __smulx_384x63 movq 72(%rsp),%rdx 
movq 80(%rsp),%rcx leaq 56(%rdi),%rdi call __smulx_768x63 xorq $256+96,%rsi movl $31,%edi call __ab_approximation_31 movq %r12,72(%rsp) movq %r13,80(%rsp) movq $256,%rdi xorq %rsi,%rdi call __smulx_384_n_shift_by_31 movq %rdx,56(%rsp) movq %rcx,64(%rsp) movq 72(%rsp),%rdx movq 80(%rsp),%rcx leaq 48(%rdi),%rdi call __smulx_384_n_shift_by_31 movq %rdx,72(%rsp) movq %rcx,80(%rsp) movq 56(%rsp),%rdx movq 64(%rsp),%rcx leaq 96(%rsi),%rsi leaq 48(%rdi),%rdi call __smulx_384x63 movq 72(%rsp),%rdx movq 80(%rsp),%rcx leaq 56(%rdi),%rdi call __smulx_768x63 xorq $256+96,%rsi movl $31,%edi call __ab_approximation_31 movq %r12,72(%rsp) movq %r13,80(%rsp) movq $256,%rdi xorq %rsi,%rdi call __smulx_384_n_shift_by_31 movq %rdx,56(%rsp) movq %rcx,64(%rsp) movq 72(%rsp),%rdx movq 80(%rsp),%rcx leaq 48(%rdi),%rdi call __smulx_384_n_shift_by_31 movq %rdx,72(%rsp) movq %rcx,80(%rsp) movq 56(%rsp),%rdx movq 64(%rsp),%rcx leaq 96(%rsi),%rsi leaq 48(%rdi),%rdi call __smulx_384x63 movq 72(%rsp),%rdx movq 80(%rsp),%rcx leaq 56(%rdi),%rdi call __smulx_768x63 xorq $256+96,%rsi movl $31,%edi call __ab_approximation_31 movq %r12,72(%rsp) movq %r13,80(%rsp) movq $256,%rdi xorq %rsi,%rdi call __smulx_384_n_shift_by_31 movq %rdx,56(%rsp) movq %rcx,64(%rsp) movq 72(%rsp),%rdx movq 80(%rsp),%rcx leaq 48(%rdi),%rdi call __smulx_384_n_shift_by_31 movq %rdx,72(%rsp) movq %rcx,80(%rsp) movq 56(%rsp),%rdx movq 64(%rsp),%rcx leaq 96(%rsi),%rsi leaq 48(%rdi),%rdi call __smulx_384x63 movq 72(%rsp),%rdx movq 80(%rsp),%rcx leaq 56(%rdi),%rdi call __smulx_768x63 xorq $256+96,%rsi movl $31,%edi call __ab_approximation_31 movq %r12,72(%rsp) movq %r13,80(%rsp) movq $256,%rdi xorq %rsi,%rdi call __smulx_384_n_shift_by_31 movq %rdx,56(%rsp) movq %rcx,64(%rsp) movq 72(%rsp),%rdx movq 80(%rsp),%rcx leaq 48(%rdi),%rdi call __smulx_384_n_shift_by_31 movq %rdx,72(%rsp) movq %rcx,80(%rsp) movq 56(%rsp),%rdx movq 64(%rsp),%rcx leaq 96(%rsi),%rsi leaq 48(%rdi),%rdi call __smulx_384x63 movq 72(%rsp),%rdx movq 80(%rsp),%rcx leaq 56(%rdi),%rdi call __smulx_768x63 xorq $256+96,%rsi movl $31,%edi call __ab_approximation_31 movq %r12,72(%rsp) movq %r13,80(%rsp) movq $256,%rdi xorq %rsi,%rdi call __smulx_191_n_shift_by_31 movq %rdx,56(%rsp) movq %rcx,64(%rsp) movq 72(%rsp),%rdx movq 80(%rsp),%rcx leaq 48(%rdi),%rdi call __smulx_191_n_shift_by_31 movq %rdx,72(%rsp) movq %rcx,80(%rsp) movq 56(%rsp),%rdx movq 64(%rsp),%rcx leaq 96(%rsi),%rsi leaq 48(%rdi),%rdi call __smulx_384x63 movq 72(%rsp),%rdx movq 80(%rsp),%rcx leaq 56(%rdi),%rdi call __smulx_768x63 xorq $256+96,%rsi movl $31,%edi call __ab_approximation_31 movq %r12,72(%rsp) movq %r13,80(%rsp) movq $256,%rdi xorq %rsi,%rdi call __smulx_191_n_shift_by_31 movq %rdx,56(%rsp) movq %rcx,64(%rsp) movq 72(%rsp),%rdx movq 80(%rsp),%rcx leaq 48(%rdi),%rdi call __smulx_191_n_shift_by_31 movq %rdx,72(%rsp) movq %rcx,80(%rsp) movq 56(%rsp),%rdx movq 64(%rsp),%rcx leaq 96(%rsi),%rsi leaq 48(%rdi),%rdi call __smulx_384x63 movq 72(%rsp),%rdx movq 80(%rsp),%rcx leaq 56(%rdi),%rdi call __smulx_768x63 xorq $256+96,%rsi movl $31,%edi call __ab_approximation_31 movq %r12,72(%rsp) movq %r13,80(%rsp) movq $256,%rdi xorq %rsi,%rdi call __smulx_191_n_shift_by_31 movq %rdx,56(%rsp) movq %rcx,64(%rsp) movq 72(%rsp),%rdx movq 80(%rsp),%rcx leaq 48(%rdi),%rdi call __smulx_191_n_shift_by_31 movq %rdx,72(%rsp) movq %rcx,80(%rsp) movq 56(%rsp),%rdx movq 64(%rsp),%rcx leaq 96(%rsi),%rsi leaq 48(%rdi),%rdi call __smulx_384x63 movq 72(%rsp),%rdx movq 80(%rsp),%rcx leaq 56(%rdi),%rdi call __smulx_768x63 xorq $256+96,%rsi movl $31,%edi 
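# By this depth |a| and |b| have been shifted down far enough that only
# their low three limbs can be non-zero, so the shift-down step uses the
# narrower __smulx_191_n_shift_by_31 in place of the full-width
# __smulx_384_n_shift_by_31; each round still removes a further 31 bits.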
call __ab_approximation_31 movq %r12,72(%rsp) movq %r13,80(%rsp) movq $256,%rdi xorq %rsi,%rdi call __smulx_191_n_shift_by_31 movq %rdx,56(%rsp) movq %rcx,64(%rsp) movq 72(%rsp),%rdx movq 80(%rsp),%rcx leaq 48(%rdi),%rdi call __smulx_191_n_shift_by_31 movq %rdx,72(%rsp) movq %rcx,80(%rsp) movq 56(%rsp),%rdx movq 64(%rsp),%rcx leaq 96(%rsi),%rsi leaq 48(%rdi),%rdi call __smulx_384x63 movq 72(%rsp),%rdx movq 80(%rsp),%rcx leaq 56(%rdi),%rdi call __smulx_768x63 xorq $256+96,%rsi movl $55,%edi movq 0(%rsi),%r8 movq 48(%rsi),%r10 call __tail_loop_55 leaq 96(%rsi),%rsi movq %r12,%rdx movq %r13,%rcx movq 32(%rsp),%rdi call __smulx_768x63 movq 40(%rsp),%rsi movq %rdx,%r13 sarq $63,%r13 movq %r13,%r8 movq %r13,%r9 movq %r13,%r10 #ifdef __SGX_LVI_HARDENING__ lfence #endif andq 0(%rsi),%r8 andq 8(%rsi),%r9 movq %r13,%r11 andq 16(%rsi),%r10 andq 24(%rsi),%r11 movq %r13,%r12 andq 32(%rsi),%r12 andq 40(%rsi),%r13 addq %r8,%r14 adcq %r9,%r15 adcq %r10,%rbx adcq %r11,%rbp adcq %r12,%rcx adcq %r13,%rax adcq $0,%rdx movq %rdx,%r13 negq %rdx orq %rdx,%r13 sarq $63,%rdx movq %r13,%r8 movq %r13,%r9 movq %r13,%r10 andq 0(%rsi),%r8 andq 8(%rsi),%r9 movq %r13,%r11 andq 16(%rsi),%r10 andq 24(%rsi),%r11 movq %r13,%r12 andq 32(%rsi),%r12 andq 40(%rsi),%r13 xorq %rdx,%r8 xorq %rsi,%rsi xorq %rdx,%r9 subq %rdx,%rsi xorq %rdx,%r10 xorq %rdx,%r11 xorq %rdx,%r12 xorq %rdx,%r13 addq %rsi,%r8 adcq $0,%r9 adcq $0,%r10 adcq $0,%r11 adcq $0,%r12 adcq $0,%r13 addq %r8,%r14 adcq %r9,%r15 adcq %r10,%rbx adcq %r11,%rbp adcq %r12,%rcx adcq %r13,%rax movq %r14,48(%rdi) movq %r15,56(%rdi) movq %rbx,64(%rdi) movq %rbp,72(%rdi) movq %rcx,80(%rdi) movq %rax,88(%rdi) leaq 1112(%rsp),%r8 movq 0(%r8),%r15 movq 8(%r8),%r14 movq 16(%r8),%r13 movq 24(%r8),%r12 movq 32(%r8),%rbx movq 40(%r8),%rbp leaq 48(%r8),%rsp .LSEH_epilogue_ctx_inverse_mod_384: mov 8(%rsp),%rdi mov 16(%rsp),%rsi #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .LSEH_end_ctx_inverse_mod_384: .def __smulx_768x63; .scl 3; .type 32; .endef .p2align 5 __smulx_768x63: .byte 0xf3,0x0f,0x1e,0xfa movq 0(%rsi),%r8 movq 8(%rsi),%r9 movq 16(%rsi),%r10 movq 24(%rsi),%r11 movq 32(%rsi),%r12 movq 40(%rsi),%r13 movq 48(%rsi),%r14 movq %rdx,%rax sarq $63,%rax xorq %rbp,%rbp subq %rax,%rbp movq %rdi,8(%rsp) movq %rsi,16(%rsp) leaq 56(%rsi),%rsi xorq %rax,%rdx addq %rbp,%rdx xorq %rax,%r8 xorq %rax,%r9 xorq %rax,%r10 xorq %rax,%r11 xorq %rax,%r12 xorq %rax,%r13 xorq %rax,%r14 addq %rbp,%r8 adcq $0,%r9 adcq $0,%r10 adcq $0,%r11 adcq $0,%r12 adcq $0,%r13 adcq $0,%r14 andq %rdx,%r14 negq %r14 mulxq %r8,%r8,%rbp mulxq %r9,%r9,%rax addq %rbp,%r9 mulxq %r10,%r10,%rbp adcq %rax,%r10 mulxq %r11,%r11,%rax adcq %rbp,%r11 mulxq %r12,%r12,%rbp adcq %rax,%r12 mulxq %r13,%r13,%rax adcq %rbp,%r13 adcq %rax,%r14 movq %r8,0(%rdi) movq %r9,8(%rdi) movq %r10,16(%rdi) movq %r11,24(%rdi) movq %r12,32(%rdi) movq %r13,40(%rdi) movq %r14,48(%rdi) sarq $63,%r14 movq %r14,56(%rdi) movq %rcx,%rdx movq %rcx,%rax movq 0(%rsi),%r8 movq 8(%rsi),%r9 movq 16(%rsi),%r10 movq 24(%rsi),%r11 movq 32(%rsi),%r12 movq 40(%rsi),%r13 movq 48(%rsi),%r14 movq 56(%rsi),%r15 movq 64(%rsi),%rbx movq 72(%rsi),%rbp movq 80(%rsi),%rcx movq 88(%rsi),%rdi sarq $63,%rax xorq %rsi,%rsi subq %rax,%rsi xorq %rax,%rdx addq %rsi,%rdx xorq %rax,%r8 xorq %rax,%r9 xorq %rax,%r10 xorq %rax,%r11 xorq %rax,%r12 xorq %rax,%r13 xorq %rax,%r14 xorq %rax,%r15 xorq %rax,%rbx xorq %rax,%rbp xorq %rax,%rcx xorq %rdi,%rax addq %rsi,%r8 adcq $0,%r9 adcq $0,%r10 adcq $0,%r11 adcq $0,%r12 adcq $0,%r13 adcq $0,%r14 adcq 
$0,%r15 adcq $0,%rbx adcq $0,%rbp adcq $0,%rcx adcq $0,%rax mulxq %r8,%r8,%rsi mulxq %r9,%r9,%rdi addq %rsi,%r9 mulxq %r10,%r10,%rsi adcq %rdi,%r10 mulxq %r11,%r11,%rdi adcq %rsi,%r11 mulxq %r12,%r12,%rsi adcq %rdi,%r12 mulxq %r13,%r13,%rdi adcq %rsi,%r13 mulxq %r14,%r14,%rsi adcq %rdi,%r14 mulxq %r15,%r15,%rdi adcq %rsi,%r15 mulxq %rbx,%rbx,%rsi adcq %rdi,%rbx mulxq %rbp,%rbp,%rdi adcq %rsi,%rbp mulxq %rcx,%rcx,%rsi adcq %rdi,%rcx movq 8(%rsp),%rdi adcq $0,%rsi imulq %rdx addq %rsi,%rax adcq $0,%rdx addq 0(%rdi),%r8 adcq 8(%rdi),%r9 adcq 16(%rdi),%r10 adcq 24(%rdi),%r11 adcq 32(%rdi),%r12 adcq 40(%rdi),%r13 adcq 48(%rdi),%r14 movq 56(%rdi),%rsi adcq %rsi,%r15 adcq %rsi,%rbx adcq %rsi,%rbp adcq %rsi,%rcx adcq %rsi,%rax adcq %rsi,%rdx movq 16(%rsp),%rsi movq %r8,0(%rdi) movq %r9,8(%rdi) movq %r10,16(%rdi) movq %r11,24(%rdi) movq %r12,32(%rdi) movq %r13,40(%rdi) movq %r14,48(%rdi) movq %r15,56(%rdi) movq %rbx,64(%rdi) movq %rbp,72(%rdi) movq %rcx,80(%rdi) movq %rax,88(%rdi) #ifdef __SGX_LVI_HARDENING__ popq %r8 lfence jmpq *%r8 ud2 #else .byte 0xf3,0xc3 #endif .def __smulx_384x63; .scl 3; .type 32; .endef .p2align 5 __smulx_384x63: .byte 0xf3,0x0f,0x1e,0xfa movq 0+0(%rsi),%r8 movq 0+8(%rsi),%r9 movq 0+16(%rsi),%r10 movq 0+24(%rsi),%r11 movq 0+32(%rsi),%r12 movq 0+40(%rsi),%r13 movq 0+48(%rsi),%r14 movq %rdx,%rbp sarq $63,%rbp xorq %rax,%rax subq %rbp,%rax xorq %rbp,%rdx addq %rax,%rdx xorq %rbp,%r8 xorq %rbp,%r9 xorq %rbp,%r10 xorq %rbp,%r11 xorq %rbp,%r12 xorq %rbp,%r13 xorq %rbp,%r14 addq %rax,%r8 adcq $0,%r9 adcq $0,%r10 adcq $0,%r11 adcq $0,%r12 adcq $0,%r13 adcq $0,%r14 andq %rdx,%r14 negq %r14 mulxq %r8,%r8,%rbp mulxq %r9,%r9,%rax addq %rbp,%r9 mulxq %r10,%r10,%rbp adcq %rax,%r10 mulxq %r11,%r11,%rax adcq %rbp,%r11 mulxq %r12,%r12,%rbp adcq %rax,%r12 mulxq %r13,%r13,%rax movq %rcx,%rdx adcq %rbp,%r13 adcq %rax,%r14 movq %r8,0(%rdi) movq %r9,8(%rdi) movq %r10,16(%rdi) movq %r11,24(%rdi) movq %r12,32(%rdi) movq %r13,%r15 movq %r14,%rbx movq 56+0(%rsi),%r8 movq 56+8(%rsi),%r9 movq 56+16(%rsi),%r10 movq 56+24(%rsi),%r11 movq 56+32(%rsi),%r12 movq 56+40(%rsi),%r13 movq 56+48(%rsi),%r14 movq %rdx,%rbp sarq $63,%rbp xorq %rax,%rax subq %rbp,%rax xorq %rbp,%rdx addq %rax,%rdx xorq %rbp,%r8 xorq %rbp,%r9 xorq %rbp,%r10 xorq %rbp,%r11 xorq %rbp,%r12 xorq %rbp,%r13 xorq %rbp,%r14 addq %rax,%r8 adcq $0,%r9 adcq $0,%r10 adcq $0,%r11 adcq $0,%r12 adcq $0,%r13 adcq $0,%r14 andq %rdx,%r14 negq %r14 mulxq %r8,%r8,%rbp mulxq %r9,%r9,%rax addq %rbp,%r9 mulxq %r10,%r10,%rbp adcq %rax,%r10 mulxq %r11,%r11,%rax adcq %rbp,%r11 mulxq %r12,%r12,%rbp adcq %rax,%r12 mulxq %r13,%r13,%rax adcq %rbp,%r13 adcq %rax,%r14 addq 0(%rdi),%r8 adcq 8(%rdi),%r9 adcq 16(%rdi),%r10 adcq 24(%rdi),%r11 adcq 32(%rdi),%r12 adcq %r15,%r13 adcq %rbx,%r14 movq %r8,0(%rdi) movq %r9,8(%rdi) movq %r10,16(%rdi) movq %r11,24(%rdi) movq %r12,32(%rdi) movq %r13,40(%rdi) movq %r14,48(%rdi) #ifdef __SGX_LVI_HARDENING__ popq %r8 lfence jmpq *%r8 ud2 #else .byte 0xf3,0xc3 #endif .def __smulx_384_n_shift_by_31; .scl 3; .type 32; .endef .p2align 5 __smulx_384_n_shift_by_31: .byte 0xf3,0x0f,0x1e,0xfa movq %rdx,%rbx movq 0+0(%rsi),%r8 movq 0+8(%rsi),%r9 movq 0+16(%rsi),%r10 movq 0+24(%rsi),%r11 movq 0+32(%rsi),%r12 movq 0+40(%rsi),%r13 movq %rdx,%rax sarq $63,%rax xorq %rbp,%rbp subq %rax,%rbp xorq %rax,%rdx addq %rbp,%rdx xorq %rax,%r8 xorq %rax,%r9 xorq %rax,%r10 xorq %rax,%r11 xorq %rax,%r12 xorq %rax,%r13 addq %rbp,%r8 adcq $0,%r9 adcq $0,%r10 adcq $0,%r11 adcq $0,%r12 adcq $0,%r13 andq %rdx,%rax negq %rax mulxq %r8,%r8,%rbp mulxq %r9,%r9,%r14 
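/*
 * __smulx_384_n_shift_by_31 (this routine) computes (f*a + g*b) >> 31 over
 * the 384-bit operands at 0(%rsi) and 48(%rsi): each operand is
 * conditionally negated according to its factor's sign (the xor/add-carry
 * sequences), multiplied with mulx, the two halves are summed, the result
 * is shifted down 31 bits with shrd and sign-normalized, and the possibly
 * flipped factors are returned in %rdx/%rcx.
 */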
addq %rbp,%r9 mulxq %r10,%r10,%rbp adcq %r14,%r10 mulxq %r11,%r11,%r14 adcq %rbp,%r11 mulxq %r12,%r12,%rbp adcq %r14,%r12 mulxq %r13,%r13,%r14 adcq %rbp,%r13 adcq %rax,%r14 movq %rcx,%rdx movq %r8,0(%rdi) movq %r9,8(%rdi) movq %r10,16(%rdi) movq %r11,24(%rdi) movq %r12,32(%rdi) movq %r13,40(%rdi) movq %r14,%r15 movq 48+0(%rsi),%r8 movq 48+8(%rsi),%r9 movq 48+16(%rsi),%r10 movq 48+24(%rsi),%r11 movq 48+32(%rsi),%r12 movq 48+40(%rsi),%r13 movq %rdx,%rax sarq $63,%rax xorq %rbp,%rbp subq %rax,%rbp xorq %rax,%rdx addq %rbp,%rdx xorq %rax,%r8 xorq %rax,%r9 xorq %rax,%r10 xorq %rax,%r11 xorq %rax,%r12 xorq %rax,%r13 addq %rbp,%r8 adcq $0,%r9 adcq $0,%r10 adcq $0,%r11 adcq $0,%r12 adcq $0,%r13 andq %rdx,%rax negq %rax mulxq %r8,%r8,%rbp mulxq %r9,%r9,%r14 addq %rbp,%r9 mulxq %r10,%r10,%rbp adcq %r14,%r10 mulxq %r11,%r11,%r14 adcq %rbp,%r11 mulxq %r12,%r12,%rbp adcq %r14,%r12 mulxq %r13,%r13,%r14 adcq %rbp,%r13 adcq %rax,%r14 addq 0(%rdi),%r8 adcq 8(%rdi),%r9 adcq 16(%rdi),%r10 adcq 24(%rdi),%r11 adcq 32(%rdi),%r12 adcq 40(%rdi),%r13 adcq %r15,%r14 movq %rbx,%rdx shrdq $31,%r9,%r8 shrdq $31,%r10,%r9 shrdq $31,%r11,%r10 shrdq $31,%r12,%r11 shrdq $31,%r13,%r12 shrdq $31,%r14,%r13 sarq $63,%r14 xorq %rbp,%rbp subq %r14,%rbp xorq %r14,%r8 xorq %r14,%r9 xorq %r14,%r10 xorq %r14,%r11 xorq %r14,%r12 xorq %r14,%r13 addq %rbp,%r8 adcq $0,%r9 adcq $0,%r10 adcq $0,%r11 adcq $0,%r12 adcq $0,%r13 movq %r8,0(%rdi) movq %r9,8(%rdi) movq %r10,16(%rdi) movq %r11,24(%rdi) movq %r12,32(%rdi) movq %r13,40(%rdi) xorq %r14,%rdx xorq %r14,%rcx addq %rbp,%rdx addq %rbp,%rcx #ifdef __SGX_LVI_HARDENING__ popq %r8 lfence jmpq *%r8 ud2 #else .byte 0xf3,0xc3 #endif .def __smulx_191_n_shift_by_31; .scl 3; .type 32; .endef .p2align 5 __smulx_191_n_shift_by_31: .byte 0xf3,0x0f,0x1e,0xfa movq %rdx,%rbx movq 0+0(%rsi),%r8 movq 0+8(%rsi),%r9 movq 0+16(%rsi),%r10 movq %rdx,%rax sarq $63,%rax xorq %rbp,%rbp subq %rax,%rbp xorq %rax,%rdx addq %rbp,%rdx xorq %rax,%r8 xorq %rax,%r9 xorq %r10,%rax addq %rbp,%r8 adcq $0,%r9 adcq $0,%rax mulxq %r8,%r8,%rbp mulxq %r9,%r9,%r10 addq %rbp,%r9 adcq $0,%r10 imulq %rdx addq %rax,%r10 adcq $0,%rdx movq %rdx,%r14 movq %rcx,%rdx movq 48+0(%rsi),%r11 movq 48+8(%rsi),%r12 movq 48+16(%rsi),%r13 movq %rdx,%rax sarq $63,%rax xorq %rbp,%rbp subq %rax,%rbp xorq %rax,%rdx addq %rbp,%rdx xorq %rax,%r11 xorq %rax,%r12 xorq %r13,%rax addq %rbp,%r11 adcq $0,%r12 adcq $0,%rax mulxq %r11,%r11,%rbp mulxq %r12,%r12,%r13 addq %rbp,%r12 adcq $0,%r13 imulq %rdx addq %rax,%r13 adcq $0,%rdx addq %r8,%r11 adcq %r9,%r12 adcq %r10,%r13 adcq %rdx,%r14 movq %rbx,%rdx shrdq $31,%r12,%r11 shrdq $31,%r13,%r12 shrdq $31,%r14,%r13 sarq $63,%r14 xorq %rbp,%rbp subq %r14,%rbp xorq %r14,%r11 xorq %r14,%r12 xorq %r14,%r13 addq %rbp,%r11 adcq $0,%r12 adcq $0,%r13 movq %r11,0(%rdi) movq %r12,8(%rdi) movq %r13,16(%rdi) xorq %r14,%rdx xorq %r14,%rcx addq %rbp,%rdx addq %rbp,%rcx #ifdef __SGX_LVI_HARDENING__ popq %r8 lfence jmpq *%r8 ud2 #else .byte 0xf3,0xc3 #endif .def __ab_approximation_31; .scl 3; .type 32; .endef .p2align 5 __ab_approximation_31: .byte 0xf3,0x0f,0x1e,0xfa movq 40(%rsi),%r9 movq 88(%rsi),%r11 movq 32(%rsi),%rbx movq 80(%rsi),%rbp movq 24(%rsi),%r8 movq 72(%rsi),%r10 movq %r9,%rax orq %r11,%rax cmovzq %rbx,%r9 cmovzq %rbp,%r11 cmovzq %r8,%rbx movq 16(%rsi),%r8 cmovzq %r10,%rbp movq 64(%rsi),%r10 movq %r9,%rax orq %r11,%rax cmovzq %rbx,%r9 cmovzq %rbp,%r11 cmovzq %r8,%rbx movq 8(%rsi),%r8 cmovzq %r10,%rbp movq 56(%rsi),%r10 movq %r9,%rax orq %r11,%rax cmovzq %rbx,%r9 cmovzq %rbp,%r11 cmovzq %r8,%rbx movq 0(%rsi),%r8 
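/*
 * __ab_approximation_31 (this routine): the cmovz cascade scans both
 * operands from their most significant limbs down to the first limb pair
 * where either is non-zero, aligns the significant top bits with shld,
 * and splices them (via andn) onto each operand's lowest 31 bits (masked
 * with 0x7FFFFFFF) before falling through to __inner_loop_31. Keeping
 * both ends of a and b is what lets 31 reduction steps be taken on the
 * 64-bit approximations without losing correctness.
 */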
cmovzq %r10,%rbp movq 48(%rsi),%r10 movq %r9,%rax orq %r11,%rax cmovzq %rbx,%r9 cmovzq %rbp,%r11 cmovzq %r8,%rbx cmovzq %r10,%rbp movq %r9,%rax orq %r11,%rax bsrq %rax,%rcx leaq 1(%rcx),%rcx cmovzq %r8,%r9 cmovzq %r10,%r11 cmovzq %rax,%rcx negq %rcx shldq %cl,%rbx,%r9 shldq %cl,%rbp,%r11 movl $0x7FFFFFFF,%eax andq %rax,%r8 andq %rax,%r10 andnq %r9,%rax,%r9 andnq %r11,%rax,%r11 orq %r9,%r8 orq %r11,%r10 jmp __inner_loop_31 #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .def __inner_loop_31; .scl 3; .type 32; .endef .p2align 5 __inner_loop_31: .byte 0xf3,0x0f,0x1e,0xfa movq $0x7FFFFFFF80000000,%rcx movq $0x800000007FFFFFFF,%r13 movq $0x7FFFFFFF7FFFFFFF,%r15 .Loop_31: cmpq %r10,%r8 movq %r8,%rax movq %r10,%rbx movq %rcx,%rbp movq %r13,%r14 cmovbq %r10,%r8 cmovbq %rax,%r10 cmovbq %r13,%rcx cmovbq %rbp,%r13 subq %r10,%r8 subq %r13,%rcx addq %r15,%rcx testq $1,%rax cmovzq %rax,%r8 cmovzq %rbx,%r10 cmovzq %rbp,%rcx cmovzq %r14,%r13 shrq $1,%r8 addq %r13,%r13 subq %r15,%r13 subl $1,%edi jnz .Loop_31 shrq $32,%r15 movl %ecx,%edx movl %r13d,%r12d shrq $32,%rcx shrq $32,%r13 subq %r15,%rdx subq %r15,%rcx subq %r15,%r12 subq %r15,%r13 #ifdef __SGX_LVI_HARDENING__ popq %r8 lfence jmpq *%r8 ud2 #else .byte 0xf3,0xc3 #endif .def __tail_loop_55; .scl 3; .type 32; .endef .p2align 5 __tail_loop_55: .byte 0xf3,0x0f,0x1e,0xfa movq $1,%rdx xorq %rcx,%rcx xorq %r12,%r12 movq $1,%r13 .Loop_55: xorq %rax,%rax testq $1,%r8 movq %r10,%rbx cmovnzq %r10,%rax subq %r8,%rbx movq %r8,%rbp subq %rax,%r8 cmovcq %rbx,%r8 cmovcq %rbp,%r10 movq %rdx,%rax cmovcq %r12,%rdx cmovcq %rax,%r12 movq %rcx,%rbx cmovcq %r13,%rcx cmovcq %rbx,%r13 xorq %rax,%rax xorq %rbx,%rbx shrq $1,%r8 testq $1,%rbp cmovnzq %r12,%rax cmovnzq %r13,%rbx addq %r12,%r12 addq %r13,%r13 subq %rax,%rdx subq %rbx,%rcx subl $1,%edi jnz .Loop_55 #ifdef __SGX_LVI_HARDENING__ popq %r8 lfence jmpq *%r8 ud2 #else .byte 0xf3,0xc3 #endif .section .pdata .p2align 2 .rva .LSEH_begin_ctx_inverse_mod_384 .rva .LSEH_body_ctx_inverse_mod_384 .rva .LSEH_info_ctx_inverse_mod_384_prologue .rva .LSEH_body_ctx_inverse_mod_384 .rva .LSEH_epilogue_ctx_inverse_mod_384 .rva .LSEH_info_ctx_inverse_mod_384_body .rva .LSEH_epilogue_ctx_inverse_mod_384 .rva .LSEH_end_ctx_inverse_mod_384 .rva .LSEH_info_ctx_inverse_mod_384_epilogue .section .xdata .p2align 3 .LSEH_info_ctx_inverse_mod_384_prologue: .byte 1,0,5,0x0b .byte 0,0x74,1,0 .byte 0,0x64,2,0 .byte 0,0xb3 .byte 0,0 .long 0,0 .LSEH_info_ctx_inverse_mod_384_body: .byte 1,0,18,0 .byte 0x00,0xf4,0x8b,0x00 .byte 0x00,0xe4,0x8c,0x00 .byte 0x00,0xd4,0x8d,0x00 .byte 0x00,0xc4,0x8e,0x00 .byte 0x00,0x34,0x8f,0x00 .byte 0x00,0x54,0x90,0x00 .byte 0x00,0x74,0x92,0x00 .byte 0x00,0x64,0x93,0x00 .byte 0x00,0x01,0x91,0x00 .byte 0x00,0x00,0x00,0x00 .byte 0x00,0x00,0x00,0x00 .LSEH_info_ctx_inverse_mod_384_epilogue: .byte 1,0,4,0 .byte 0x00,0x74,0x01,0x00 .byte 0x00,0x64,0x02,0x00 .byte 0x00,0x00,0x00,0x00 ================================================ FILE: build/coff/div3w-armv8.S ================================================ .text .globl div_3_limbs .def div_3_limbs; .type 32; .endef .p2align 5 div_3_limbs: hint #34 ldp x4,x5,[x0] // load R eor x0,x0,x0 // Q = 0 mov x3,#64 // loop counter nop .Loop: subs x6,x4,x1 // R - D add x0,x0,x0 // Q <<= 1 sbcs x7,x5,x2 add x0,x0,#1 // Q + speculative bit csel x4,x4,x6,lo // select between R and R - D extr x1,x2,x1,#1 // D >>= 1 csel x5,x5,x7,lo lsr x2,x2,#1 sbc x0,x0,xzr // subtract speculative bit sub x3,x3,#1 cbnz x3,.Loop asr x3,x0,#63 // top bit -> mask 
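// div_3_limbs (this routine) is a branchless restoring division: each of
// the 64 iterations above shifts a speculative bit into the quotient,
// computes R - D, and either keeps or undoes both updates based on the
// borrow (csel/sbc), halving D every step. The tail below processes the
// final bit, and the mask from "asr x3,x0,#63" saturates the quotient to
// all-ones if it would not fit in 64 bits. It appears to estimate one
// 64-bit quotient digit of a 3-limb by 2-limb division; quot_rem_128
// further down then computes dividend - divisor*quotient exactly and, if
// that estimate was one too large, decrements the quotient and adds the
// divisor back (see its "if borrowed, adjust the quotient" comments).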
add x0,x0,x0 // Q <<= 1 subs x6,x4,x1 // R - D add x0,x0,#1 // Q + speculative bit sbcs x7,x5,x2 sbc x0,x0,xzr // subtract speculative bit orr x0,x0,x3 // all ones if overflow ret .globl quot_rem_128 .def quot_rem_128; .type 32; .endef .p2align 5 quot_rem_128: hint #34 ldp x3,x4,[x1] mul x5,x3,x2 // divisor[0:1} * quotient umulh x6,x3,x2 mul x11, x4,x2 umulh x7,x4,x2 ldp x8,x9,[x0] // load 3 limbs of the dividend ldr x10,[x0,#16] adds x6,x6,x11 adc x7,x7,xzr subs x8,x8,x5 // dividend - divisor * quotient sbcs x9,x9,x6 sbcs x10,x10,x7 sbc x5,xzr,xzr // borrow -> mask add x2,x2,x5 // if borrowed, adjust the quotient ... and x3,x3,x5 and x4,x4,x5 adds x8,x8,x3 // ... and add divisor adc x9,x9,x4 stp x8,x9,[x0] // save 2 limbs of the remainder str x2,[x0,#16] // and one limb of the quotient mov x0,x2 // return adjusted quotient ret .globl quot_rem_64 .def quot_rem_64; .type 32; .endef .p2align 5 quot_rem_64: hint #34 ldr x3,[x1] ldr x8,[x0] // load 1 limb of the dividend mul x5,x3,x2 // divisor * quotient sub x8,x8,x5 // dividend - divisor * quotient stp x8,x2,[x0] // save remainder and quotient mov x0,x2 // return quotient ret ================================================ FILE: build/coff/div3w-x86_64.s ================================================ .text .globl div_3_limbs .def div_3_limbs; .scl 2; .type 32; .endef .p2align 5 div_3_limbs: .byte 0xf3,0x0f,0x1e,0xfa movq %rdi,8(%rsp) movq %rsi,16(%rsp) movq %rsp,%r11 .LSEH_begin_div_3_limbs: movq %rcx,%rdi movq %rdx,%rsi movq %r8,%rdx .LSEH_body_div_3_limbs: #ifdef __SGX_LVI_HARDENING__ lfence #endif movq (%rdi),%r8 movq 8(%rdi),%r9 xorq %rax,%rax movl $64,%ecx .Loop: movq %r8,%r10 subq %rsi,%r8 movq %r9,%r11 sbbq %rdx,%r9 leaq 1(%rax,%rax,1),%rax movq %rdx,%rdi cmovcq %r10,%r8 cmovcq %r11,%r9 sbbq $0,%rax shlq $63,%rdi shrq $1,%rsi shrq $1,%rdx orq %rdi,%rsi subl $1,%ecx jnz .Loop leaq 1(%rax,%rax,1),%rcx sarq $63,%rax subq %rsi,%r8 sbbq %rdx,%r9 sbbq $0,%rcx orq %rcx,%rax .LSEH_epilogue_div_3_limbs: mov 8(%rsp),%rdi mov 16(%rsp),%rsi #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .LSEH_end_div_3_limbs: .globl quot_rem_128 .def quot_rem_128; .scl 2; .type 32; .endef .p2align 5 quot_rem_128: .byte 0xf3,0x0f,0x1e,0xfa movq %rdi,8(%rsp) movq %rsi,16(%rsp) movq %rsp,%r11 .LSEH_begin_quot_rem_128: movq %rcx,%rdi movq %rdx,%rsi movq %r8,%rdx .LSEH_body_quot_rem_128: #ifdef __SGX_LVI_HARDENING__ lfence #endif movq %rdx,%rax movq %rdx,%rcx mulq 0(%rsi) movq %rax,%r8 movq %rcx,%rax movq %rdx,%r9 mulq 8(%rsi) addq %rax,%r9 adcq $0,%rdx movq 0(%rdi),%r10 movq 8(%rdi),%r11 movq 16(%rdi),%rax subq %r8,%r10 sbbq %r9,%r11 sbbq %rdx,%rax sbbq %r8,%r8 addq %r8,%rcx movq %r8,%r9 andq 0(%rsi),%r8 andq 8(%rsi),%r9 addq %r8,%r10 adcq %r9,%r11 movq %r10,0(%rdi) movq %r11,8(%rdi) movq %rcx,16(%rdi) movq %rcx,%rax .LSEH_epilogue_quot_rem_128: mov 8(%rsp),%rdi mov 16(%rsp),%rsi #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .LSEH_end_quot_rem_128: .globl quot_rem_64 .def quot_rem_64; .scl 2; .type 32; .endef .p2align 5 quot_rem_64: .byte 0xf3,0x0f,0x1e,0xfa movq %rdi,8(%rsp) movq %rsi,16(%rsp) movq %rsp,%r11 .LSEH_begin_quot_rem_64: movq %rcx,%rdi movq %rdx,%rsi movq %r8,%rdx .LSEH_body_quot_rem_64: #ifdef __SGX_LVI_HARDENING__ lfence #endif movq %rdx,%rax imulq 0(%rsi),%rdx movq 0(%rdi),%r10 subq %rdx,%r10 movq %r10,0(%rdi) movq %rax,8(%rdi) .LSEH_epilogue_quot_rem_64: mov 8(%rsp),%rdi mov 16(%rsp),%rsi #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 
0xf3,0xc3 #endif .LSEH_end_quot_rem_64: .section .pdata .p2align 2 .rva .LSEH_begin_div_3_limbs .rva .LSEH_body_div_3_limbs .rva .LSEH_info_div_3_limbs_prologue .rva .LSEH_body_div_3_limbs .rva .LSEH_epilogue_div_3_limbs .rva .LSEH_info_div_3_limbs_body .rva .LSEH_epilogue_div_3_limbs .rva .LSEH_end_div_3_limbs .rva .LSEH_info_div_3_limbs_epilogue .rva .LSEH_begin_quot_rem_128 .rva .LSEH_body_quot_rem_128 .rva .LSEH_info_quot_rem_128_prologue .rva .LSEH_body_quot_rem_128 .rva .LSEH_epilogue_quot_rem_128 .rva .LSEH_info_quot_rem_128_body .rva .LSEH_epilogue_quot_rem_128 .rva .LSEH_end_quot_rem_128 .rva .LSEH_info_quot_rem_128_epilogue .rva .LSEH_begin_quot_rem_64 .rva .LSEH_body_quot_rem_64 .rva .LSEH_info_quot_rem_64_prologue .rva .LSEH_body_quot_rem_64 .rva .LSEH_epilogue_quot_rem_64 .rva .LSEH_info_quot_rem_64_body .rva .LSEH_epilogue_quot_rem_64 .rva .LSEH_end_quot_rem_64 .rva .LSEH_info_quot_rem_64_epilogue .section .xdata .p2align 3 .LSEH_info_div_3_limbs_prologue: .byte 1,0,5,0x0b .byte 0,0x74,1,0 .byte 0,0x64,2,0 .byte 0,0xb3 .byte 0,0 .long 0,0 .LSEH_info_div_3_limbs_body: .byte 1,0,4,0 .byte 0x00,0x74,0x01,0x00 .byte 0x00,0x64,0x02,0x00 .byte 0x00,0x00,0x00,0x00 .LSEH_info_div_3_limbs_epilogue: .byte 1,0,4,0 .byte 0x00,0x74,0x01,0x00 .byte 0x00,0x64,0x02,0x00 .byte 0x00,0x00,0x00,0x00 .LSEH_info_quot_rem_128_prologue: .byte 1,0,5,0x0b .byte 0,0x74,1,0 .byte 0,0x64,2,0 .byte 0,0xb3 .byte 0,0 .long 0,0 .LSEH_info_quot_rem_128_body: .byte 1,0,4,0 .byte 0x00,0x74,0x01,0x00 .byte 0x00,0x64,0x02,0x00 .byte 0x00,0x00,0x00,0x00 .LSEH_info_quot_rem_128_epilogue: .byte 1,0,4,0 .byte 0x00,0x74,0x01,0x00 .byte 0x00,0x64,0x02,0x00 .byte 0x00,0x00,0x00,0x00 .LSEH_info_quot_rem_64_prologue: .byte 1,0,5,0x0b .byte 0,0x74,1,0 .byte 0,0x64,2,0 .byte 0,0xb3 .byte 0,0 .long 0,0 .LSEH_info_quot_rem_64_body: .byte 1,0,4,0 .byte 0x00,0x74,0x01,0x00 .byte 0x00,0x64,0x02,0x00 .byte 0x00,0x00,0x00,0x00 .LSEH_info_quot_rem_64_epilogue: .byte 1,0,4,0 .byte 0x00,0x74,0x01,0x00 .byte 0x00,0x64,0x02,0x00 .byte 0x00,0x00,0x00,0x00 ================================================ FILE: build/coff/mul_mont_256-armv8.S ================================================ .text .globl mul_mont_sparse_256 .def mul_mont_sparse_256; .type 32; .endef .p2align 5 mul_mont_sparse_256: hint #34 stp x29,x30,[sp,#-8*__SIZEOF_POINTER__]! 
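// mul_mont_sparse_256 (this routine): word-by-word Montgomery
// multiplication for 256-bit moduli, with x4 holding n0 = -n[0]^-1 mod
// 2^64. The commented-out "//adds x19,x19,x14" lines mark an elided
// low-limb addition: with m = n0*x19, the low limb of x19 + m*n[0] is
// zero by construction, and the addition carries out exactly when
// x19 != 0 -- which is what "subs xzr,x19,#1" reproduces in the carry
// flag. "Sparse" presumably refers to the headroom in the moduli this is
// used for (blst applies it to the BLS12-381 scalar field).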
add x29,sp,#0 stp x19,x20,[sp,#2*__SIZEOF_POINTER__] stp x21,x22,[sp,#4*__SIZEOF_POINTER__] stp x23,x24,[sp,#6*__SIZEOF_POINTER__] ldp x10,x11,[x1] ldr x9, [x2] ldp x12,x13,[x1,#16] mul x19,x10,x9 ldp x5,x6,[x3] mul x20,x11,x9 ldp x7,x8,[x3,#16] mul x21,x12,x9 mul x22,x13,x9 umulh x14,x10,x9 umulh x15,x11,x9 mul x3,x4,x19 umulh x16,x12,x9 umulh x17,x13,x9 adds x20,x20,x14 //mul x14,x5,x3 adcs x21,x21,x15 mul x15,x6,x3 adcs x22,x22,x16 mul x16,x7,x3 adc x23,xzr, x17 mul x17,x8,x3 ldr x9,[x2,8*1] subs xzr,x19,#1 //adds x19,x19,x14 umulh x14,x5,x3 adcs x20,x20,x15 umulh x15,x6,x3 adcs x21,x21,x16 umulh x16,x7,x3 adcs x22,x22,x17 umulh x17,x8,x3 adc x23,x23,xzr adds x19,x20,x14 mul x14,x10,x9 adcs x20,x21,x15 mul x15,x11,x9 adcs x21,x22,x16 mul x16,x12,x9 adcs x22,x23,x17 mul x17,x13,x9 adc x23,xzr,xzr adds x19,x19,x14 umulh x14,x10,x9 adcs x20,x20,x15 umulh x15,x11,x9 adcs x21,x21,x16 mul x3,x4,x19 umulh x16,x12,x9 adcs x22,x22,x17 umulh x17,x13,x9 adc x23,x23,xzr adds x20,x20,x14 //mul x14,x5,x3 adcs x21,x21,x15 mul x15,x6,x3 adcs x22,x22,x16 mul x16,x7,x3 adc x23,x23,x17 mul x17,x8,x3 ldr x9,[x2,8*2] subs xzr,x19,#1 //adds x19,x19,x14 umulh x14,x5,x3 adcs x20,x20,x15 umulh x15,x6,x3 adcs x21,x21,x16 umulh x16,x7,x3 adcs x22,x22,x17 umulh x17,x8,x3 adc x23,x23,xzr adds x19,x20,x14 mul x14,x10,x9 adcs x20,x21,x15 mul x15,x11,x9 adcs x21,x22,x16 mul x16,x12,x9 adcs x22,x23,x17 mul x17,x13,x9 adc x23,xzr,xzr adds x19,x19,x14 umulh x14,x10,x9 adcs x20,x20,x15 umulh x15,x11,x9 adcs x21,x21,x16 mul x3,x4,x19 umulh x16,x12,x9 adcs x22,x22,x17 umulh x17,x13,x9 adc x23,x23,xzr adds x20,x20,x14 //mul x14,x5,x3 adcs x21,x21,x15 mul x15,x6,x3 adcs x22,x22,x16 mul x16,x7,x3 adc x23,x23,x17 mul x17,x8,x3 ldr x9,[x2,8*3] subs xzr,x19,#1 //adds x19,x19,x14 umulh x14,x5,x3 adcs x20,x20,x15 umulh x15,x6,x3 adcs x21,x21,x16 umulh x16,x7,x3 adcs x22,x22,x17 umulh x17,x8,x3 adc x23,x23,xzr adds x19,x20,x14 mul x14,x10,x9 adcs x20,x21,x15 mul x15,x11,x9 adcs x21,x22,x16 mul x16,x12,x9 adcs x22,x23,x17 mul x17,x13,x9 adc x23,xzr,xzr adds x19,x19,x14 umulh x14,x10,x9 adcs x20,x20,x15 umulh x15,x11,x9 adcs x21,x21,x16 mul x3,x4,x19 umulh x16,x12,x9 adcs x22,x22,x17 umulh x17,x13,x9 adc x23,x23,xzr adds x20,x20,x14 //mul x14,x5,x3 adcs x21,x21,x15 mul x15,x6,x3 adcs x22,x22,x16 mul x16,x7,x3 adc x23,x23,x17 mul x17,x8,x3 subs xzr,x19,#1 //adds x19,x19,x14 umulh x14,x5,x3 adcs x20,x20,x15 umulh x15,x6,x3 adcs x21,x21,x16 umulh x16,x7,x3 adcs x22,x22,x17 umulh x17,x8,x3 adc x23,x23,xzr adds x19,x20,x14 adcs x20,x21,x15 adcs x21,x22,x16 adcs x22,x23,x17 adc x23,xzr,xzr subs x14,x19,x5 sbcs x15,x20,x6 sbcs x16,x21,x7 sbcs x17,x22,x8 sbcs xzr, x23,xzr csel x19,x19,x14,lo csel x20,x20,x15,lo csel x21,x21,x16,lo csel x22,x22,x17,lo stp x19,x20,[x0] stp x21,x22,[x0,#16] ldp x19,x20,[x29,#2*__SIZEOF_POINTER__] ldp x21,x22,[x29,#4*__SIZEOF_POINTER__] ldp x23,x24,[x29,#6*__SIZEOF_POINTER__] ldr x29,[sp],#8*__SIZEOF_POINTER__ ret .globl sqr_mont_sparse_256 .def sqr_mont_sparse_256; .type 32; .endef .p2align 5 sqr_mont_sparse_256: hint #25 stp x29,x30,[sp,#-6*__SIZEOF_POINTER__]! 
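// sqr_mont_sparse_256 (this routine): squaring followed by Montgomery
// reduction. Per the diagram below, only the products above the diagonal
// are formed, the accumulator is doubled ("acc[1-6]*=2"), and the
// diagonal squares a[i]*a[i] are added in; __mul_by_1_mont_256 then folds
// four reduction steps into the low half, the upper half is accumulated,
// and the modulus is conditionally subtracted.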
add x29,sp,#0 stp x19,x20,[sp,#2*__SIZEOF_POINTER__] stp x21,x22,[sp,#4*__SIZEOF_POINTER__] ldp x5,x6,[x1] ldp x7,x8,[x1,#16] mov x4,x3 //////////////////////////////////////////////////////////////// // | | | | | |a1*a0| | // | | | | |a2*a0| | | // | |a3*a2|a3*a0| | | | // | | | |a2*a1| | | | // | | |a3*a1| | | | | // *| | | | | | | | 2| // +|a3*a3|a2*a2|a1*a1|a0*a0| // |--+--+--+--+--+--+--+--| // |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax is x10 // // "can't overflow" below mark carrying into high part of // multiplication result, which can't overflow, because it // can never be all ones. mul x11,x6,x5 // a[1]*a[0] umulh x15,x6,x5 mul x12,x7,x5 // a[2]*a[0] umulh x16,x7,x5 mul x13,x8,x5 // a[3]*a[0] umulh x19,x8,x5 adds x12,x12,x15 // accumulate high parts of multiplication mul x14,x7,x6 // a[2]*a[1] umulh x15,x7,x6 adcs x13,x13,x16 mul x16,x8,x6 // a[3]*a[1] umulh x17,x8,x6 adc x19,x19,xzr // can't overflow mul x20,x8,x7 // a[3]*a[2] umulh x21,x8,x7 adds x15,x15,x16 // accumulate high parts of multiplication mul x10,x5,x5 // a[0]*a[0] adc x16,x17,xzr // can't overflow adds x13,x13,x14 // accumulate low parts of multiplication umulh x5,x5,x5 adcs x19,x19,x15 mul x15,x6,x6 // a[1]*a[1] adcs x20,x20,x16 umulh x6,x6,x6 adc x21,x21,xzr // can't overflow adds x11,x11,x11 // acc[1-6]*=2 mul x16,x7,x7 // a[2]*a[2] adcs x12,x12,x12 umulh x7,x7,x7 adcs x13,x13,x13 mul x17,x8,x8 // a[3]*a[3] adcs x19,x19,x19 umulh x8,x8,x8 adcs x20,x20,x20 adcs x21,x21,x21 adc x22,xzr,xzr adds x11,x11,x5 // +a[i]*a[i] adcs x12,x12,x15 adcs x13,x13,x6 adcs x19,x19,x16 adcs x20,x20,x7 adcs x21,x21,x17 adc x22,x22,x8 bl __mul_by_1_mont_256 ldr x30,[x29,#__SIZEOF_POINTER__] adds x10,x10,x19 // accumulate upper half adcs x11,x11,x20 adcs x12,x12,x21 adcs x13,x13,x22 adc x19,xzr,xzr subs x14,x10,x5 sbcs x15,x11,x6 sbcs x16,x12,x7 sbcs x17,x13,x8 sbcs xzr, x19,xzr csel x10,x10,x14,lo csel x11,x11,x15,lo csel x12,x12,x16,lo csel x13,x13,x17,lo stp x10,x11,[x0] stp x12,x13,[x0,#16] ldp x19,x20,[x29,#2*__SIZEOF_POINTER__] ldp x21,x22,[x29,#4*__SIZEOF_POINTER__] ldr x29,[sp],#6*__SIZEOF_POINTER__ hint #29 ret .globl from_mont_256 .def from_mont_256; .type 32; .endef .p2align 5 from_mont_256: hint #25 stp x29,x30,[sp,#-2*__SIZEOF_POINTER__]! add x29,sp,#0 mov x4,x3 ldp x10,x11,[x1] ldp x12,x13,[x1,#16] bl __mul_by_1_mont_256 ldr x30,[x29,#__SIZEOF_POINTER__] subs x14,x10,x5 sbcs x15,x11,x6 sbcs x16,x12,x7 sbcs x17,x13,x8 csel x10,x10,x14,lo csel x11,x11,x15,lo csel x12,x12,x16,lo csel x13,x13,x17,lo stp x10,x11,[x0] stp x12,x13,[x0,#16] ldr x29,[sp],#2*__SIZEOF_POINTER__ hint #29 ret .globl redc_mont_256 .def redc_mont_256; .type 32; .endef .p2align 5 redc_mont_256: hint #25 stp x29,x30,[sp,#-2*__SIZEOF_POINTER__]! 
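// from_mont_256 above and redc_mont_256 (this routine) share
// __mul_by_1_mont_256, i.e. four Montgomery reduction steps: each step
// derives m = n0*acc[0], adds m*n (the zero low limb again elided via
// "subs xzr,x10,#1"), and drops one limb. from_mont_256 returns
// a*R^-1 mod n with R = 2^256; redc_mont_256 additionally folds in the
// upper 256 bits of a 512-bit input before the final conditional
// subtraction. A minimal C sketch of one such step (illustrative names,
// not this file's interface):
//
//   uint64_t m = n0 * acc[0];
//   unsigned __int128 c = (unsigned __int128)m * n[0] + acc[0];
//   for (size_t i = 1; i < 4; i++) {          /* c >> 64 carries over  */
//       c = (c >> 64) + (unsigned __int128)m * n[i] + acc[i];
//       acc[i-1] = (uint64_t)c;
//   }
//   acc[3] = (uint64_t)(c >> 64);             /* top-limb carry elided */
//
// repeated four times, dividing by 2^64 each round.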
add x29,sp,#0 mov x4,x3 ldp x10,x11,[x1] ldp x12,x13,[x1,#16] bl __mul_by_1_mont_256 ldr x30,[x29,#__SIZEOF_POINTER__] ldp x14,x15,[x1,#32] ldp x16,x17,[x1,#48] adds x10,x10,x14 adcs x11,x11,x15 adcs x12,x12,x16 adcs x13,x13,x17 adc x9,xzr,xzr subs x14,x10,x5 sbcs x15,x11,x6 sbcs x16,x12,x7 sbcs x17,x13,x8 sbcs xzr, x9,xzr csel x10,x10,x14,lo csel x11,x11,x15,lo csel x12,x12,x16,lo csel x13,x13,x17,lo stp x10,x11,[x0] stp x12,x13,[x0,#16] ldr x29,[sp],#2*__SIZEOF_POINTER__ hint #29 ret .def __mul_by_1_mont_256; .type 32; .endef .p2align 5 __mul_by_1_mont_256: mul x3,x4,x10 ldp x5,x6,[x2] ldp x7,x8,[x2,#16] //mul x14,x5,x3 mul x15,x6,x3 mul x16,x7,x3 mul x17,x8,x3 subs xzr,x10,#1 //adds x10,x10,x14 umulh x14,x5,x3 adcs x11,x11,x15 umulh x15,x6,x3 adcs x12,x12,x16 umulh x16,x7,x3 adcs x13,x13,x17 umulh x17,x8,x3 adc x9,xzr,xzr adds x10,x11,x14 adcs x11,x12,x15 adcs x12,x13,x16 mul x3,x4,x10 adc x13,x9,x17 //mul x14,x5,x3 mul x15,x6,x3 mul x16,x7,x3 mul x17,x8,x3 subs xzr,x10,#1 //adds x10,x10,x14 umulh x14,x5,x3 adcs x11,x11,x15 umulh x15,x6,x3 adcs x12,x12,x16 umulh x16,x7,x3 adcs x13,x13,x17 umulh x17,x8,x3 adc x9,xzr,xzr adds x10,x11,x14 adcs x11,x12,x15 adcs x12,x13,x16 mul x3,x4,x10 adc x13,x9,x17 //mul x14,x5,x3 mul x15,x6,x3 mul x16,x7,x3 mul x17,x8,x3 subs xzr,x10,#1 //adds x10,x10,x14 umulh x14,x5,x3 adcs x11,x11,x15 umulh x15,x6,x3 adcs x12,x12,x16 umulh x16,x7,x3 adcs x13,x13,x17 umulh x17,x8,x3 adc x9,xzr,xzr adds x10,x11,x14 adcs x11,x12,x15 adcs x12,x13,x16 mul x3,x4,x10 adc x13,x9,x17 //mul x14,x5,x3 mul x15,x6,x3 mul x16,x7,x3 mul x17,x8,x3 subs xzr,x10,#1 //adds x10,x10,x14 umulh x14,x5,x3 adcs x11,x11,x15 umulh x15,x6,x3 adcs x12,x12,x16 umulh x16,x7,x3 adcs x13,x13,x17 umulh x17,x8,x3 adc x9,xzr,xzr adds x10,x11,x14 adcs x11,x12,x15 adcs x12,x13,x16 adc x13,x9,x17 ret ================================================ FILE: build/coff/mul_mont_384-armv8.S ================================================ .text .globl add_mod_384x384 .def add_mod_384x384; .type 32; .endef .p2align 5 add_mod_384x384: hint #25 stp x29,x30,[sp,#-8*__SIZEOF_POINTER__]! 
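// The 384x384 helpers in this file work on 768-bit values whose upper
// half is kept reduced modulo the 384-bit modulus preloaded from x3:
// __add_mod_384x384 adds limb-wise and conditionally subtracts the
// modulus from the top six limbs; __sub_mod_384x384 subtracts and uses
// the borrow mask to add it back. Routines further down (mul_mont_384x,
// mul_382x) rely on them to combine unreduced double-width products
// before a single Montgomery reduction.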
add x29,sp,#0 stp x19,x20,[sp,#2*__SIZEOF_POINTER__] stp x21,x22,[sp,#4*__SIZEOF_POINTER__] stp x23,x24,[sp,#6*__SIZEOF_POINTER__] ldp x5,x6,[x3] ldp x7,x8,[x3,#16] ldp x9,x10,[x3,#32] bl __add_mod_384x384 ldr x30,[x29,#__SIZEOF_POINTER__] ldp x19,x20,[x29,#2*__SIZEOF_POINTER__] ldp x21,x22,[x29,#4*__SIZEOF_POINTER__] ldp x23,x24,[x29,#6*__SIZEOF_POINTER__] ldr x29,[sp],#8*__SIZEOF_POINTER__ hint #29 ret .def __add_mod_384x384; .type 32; .endef .p2align 5 __add_mod_384x384: ldp x11, x12, [x1] ldp x19,x20,[x2] ldp x13, x14, [x1,#16] adds x11,x11,x19 ldp x21,x22,[x2,#16] adcs x12,x12,x20 ldp x15, x16, [x1,#32] adcs x13,x13,x21 ldp x23,x24,[x2,#32] adcs x14,x14,x22 stp x11, x12, [x0] adcs x15,x15,x23 ldp x11, x12, [x1,#48] adcs x16,x16,x24 ldp x19,x20,[x2,#48] stp x13, x14, [x0,#16] ldp x13, x14, [x1,#64] ldp x21,x22,[x2,#64] adcs x11,x11,x19 stp x15, x16, [x0,#32] adcs x12,x12,x20 ldp x15, x16, [x1,#80] adcs x13,x13,x21 ldp x23,x24,[x2,#80] adcs x14,x14,x22 adcs x15,x15,x23 adcs x16,x16,x24 adc x17,xzr,xzr subs x19,x11,x5 sbcs x20,x12,x6 sbcs x21,x13,x7 sbcs x22,x14,x8 sbcs x23,x15,x9 sbcs x24,x16,x10 sbcs xzr,x17,xzr csel x11,x11,x19,lo csel x12,x12,x20,lo csel x13,x13,x21,lo csel x14,x14,x22,lo stp x11,x12,[x0,#48] csel x15,x15,x23,lo stp x13,x14,[x0,#64] csel x16,x16,x24,lo stp x15,x16,[x0,#80] ret .globl sub_mod_384x384 .def sub_mod_384x384; .type 32; .endef .p2align 5 sub_mod_384x384: hint #25 stp x29,x30,[sp,#-8*__SIZEOF_POINTER__]! add x29,sp,#0 stp x19,x20,[sp,#2*__SIZEOF_POINTER__] stp x21,x22,[sp,#4*__SIZEOF_POINTER__] stp x23,x24,[sp,#6*__SIZEOF_POINTER__] ldp x5,x6,[x3] ldp x7,x8,[x3,#16] ldp x9,x10,[x3,#32] bl __sub_mod_384x384 ldr x30,[x29,#__SIZEOF_POINTER__] ldp x19,x20,[x29,#2*__SIZEOF_POINTER__] ldp x21,x22,[x29,#4*__SIZEOF_POINTER__] ldp x23,x24,[x29,#6*__SIZEOF_POINTER__] ldr x29,[sp],#8*__SIZEOF_POINTER__ hint #29 ret .def __sub_mod_384x384; .type 32; .endef .p2align 5 __sub_mod_384x384: ldp x11, x12, [x1] ldp x19,x20,[x2] ldp x13, x14, [x1,#16] subs x11,x11,x19 ldp x21,x22,[x2,#16] sbcs x12,x12,x20 ldp x15, x16, [x1,#32] sbcs x13,x13,x21 ldp x23,x24,[x2,#32] sbcs x14,x14,x22 stp x11, x12, [x0] sbcs x15,x15,x23 ldp x11, x12, [x1,#48] sbcs x16,x16,x24 ldp x19,x20,[x2,#48] stp x13, x14, [x0,#16] ldp x13, x14, [x1,#64] ldp x21,x22,[x2,#64] sbcs x11,x11,x19 stp x15, x16, [x0,#32] sbcs x12,x12,x20 ldp x15, x16, [x1,#80] sbcs x13,x13,x21 ldp x23,x24,[x2,#80] sbcs x14,x14,x22 sbcs x15,x15,x23 sbcs x16,x16,x24 sbc x17,xzr,xzr and x19,x5,x17 and x20,x6,x17 adds x11,x11,x19 and x21,x7,x17 adcs x12,x12,x20 and x22,x8,x17 adcs x13,x13,x21 and x23,x9,x17 adcs x14,x14,x22 and x24,x10,x17 adcs x15,x15,x23 stp x11,x12,[x0,#48] adc x16,x16,x24 stp x13,x14,[x0,#64] stp x15,x16,[x0,#80] ret .def __add_mod_384; .type 32; .endef .p2align 5 __add_mod_384: ldp x11, x12, [x1] ldp x19,x20,[x2] ldp x13, x14, [x1,#16] adds x11,x11,x19 ldp x21,x22,[x2,#16] adcs x12,x12,x20 ldp x15, x16, [x1,#32] adcs x13,x13,x21 ldp x23,x24,[x2,#32] adcs x14,x14,x22 adcs x15,x15,x23 adcs x16,x16,x24 adc x17,xzr,xzr subs x19,x11,x5 sbcs x20,x12,x6 sbcs x21,x13,x7 sbcs x22,x14,x8 sbcs x23,x15,x9 sbcs x24,x16,x10 sbcs xzr,x17,xzr csel x11,x11,x19,lo csel x12,x12,x20,lo csel x13,x13,x21,lo csel x14,x14,x22,lo csel x15,x15,x23,lo stp x11,x12,[x0] csel x16,x16,x24,lo stp x13,x14,[x0,#16] stp x15,x16,[x0,#32] ret .def __sub_mod_384; .type 32; .endef .p2align 5 __sub_mod_384: ldp x11, x12, [x1] ldp x19,x20,[x2] ldp x13, x14, [x1,#16] subs x11,x11,x19 ldp x21,x22,[x2,#16] sbcs x12,x12,x20 ldp x15, x16, [x1,#32] sbcs 
x13,x13,x21 ldp x23,x24,[x2,#32] sbcs x14,x14,x22 sbcs x15,x15,x23 sbcs x16,x16,x24 sbc x17,xzr,xzr and x19,x5,x17 and x20,x6,x17 adds x11,x11,x19 and x21,x7,x17 adcs x12,x12,x20 and x22,x8,x17 adcs x13,x13,x21 and x23,x9,x17 adcs x14,x14,x22 and x24,x10,x17 adcs x15,x15,x23 stp x11,x12,[x0] adc x16,x16,x24 stp x13,x14,[x0,#16] stp x15,x16,[x0,#32] ret .globl mul_mont_384x .def mul_mont_384x; .type 32; .endef .p2align 5 mul_mont_384x: hint #25 stp x29,x30,[sp,#-16*__SIZEOF_POINTER__]! add x29,sp,#0 stp x19,x20,[sp,#2*__SIZEOF_POINTER__] stp x21,x22,[sp,#4*__SIZEOF_POINTER__] stp x23,x24,[sp,#6*__SIZEOF_POINTER__] stp x25,x26,[sp,#8*__SIZEOF_POINTER__] stp x27,x28,[sp,#10*__SIZEOF_POINTER__] sub sp,sp,#288 // space for 3 768-bit vectors mov x26,x0 // save r_ptr mov x27,x1 // save b_ptr mov x28,x2 // save b_ptr add x0,sp,#0 bl __mul_384 add x1,x1,#48 add x2,x2,#48 add x0,sp,#96 bl __mul_384 ldp x5,x6,[x3] ldp x7,x8,[x3,#16] ldp x9,x10,[x3,#32] sub x2,x1,#48 add x0,sp,#240 bl __add_mod_384 add x1,x28,#0 add x2,x28,#48 add x0,sp,#192 bl __add_mod_384 add x1,x0,#0 add x2,x0,#48 bl __mul_384 // mul_384(t2, a->re+a->im, b->re+b->im) ldp x5,x6,[x3] ldp x7,x8,[x3,#16] ldp x9,x10,[x3,#32] mov x1,x0 add x2,sp,#0 bl __sub_mod_384x384 add x2,sp,#96 bl __sub_mod_384x384 // t2 = t2-t0-t1 add x1,sp,#0 add x2,sp,#96 add x0,sp,#0 bl __sub_mod_384x384 // t0 = t0-t1 add x1,sp,#0 add x0,x26,#0 bl __mul_by_1_mont_384 bl __redc_tail_mont_384 add x1,sp,#192 add x0,x0,#48 bl __mul_by_1_mont_384 bl __redc_tail_mont_384 ldr x30,[x29,#__SIZEOF_POINTER__] add sp,sp,#288 ldp x19,x20,[x29,#2*__SIZEOF_POINTER__] ldp x21,x22,[x29,#4*__SIZEOF_POINTER__] ldp x23,x24,[x29,#6*__SIZEOF_POINTER__] ldp x25,x26,[x29,#8*__SIZEOF_POINTER__] ldp x27,x28,[x29,#10*__SIZEOF_POINTER__] ldr x29,[sp],#16*__SIZEOF_POINTER__ hint #29 ret .globl sqr_mont_384x .def sqr_mont_384x; .type 32; .endef .p2align 5 sqr_mont_384x: hint #25 stp x29,x30,[sp,#-16*__SIZEOF_POINTER__]! 
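// sqr_mont_384x (this routine): complex squaring with one multiplication
// saved. For a = re + im*i with i^2 = -1:
//
//   ret->re = (re + im) * (re - im)
//   ret->im = 2 * re * im
//
// t0/t1 on the stack hold the sum and difference, the first
// __mul_mont_384 yields re*im (then doubled, with a conditional
// subtraction of the modulus), and the second, over t0 and t1, yields the
// real part.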
add x29,sp,#0 stp x19,x20,[sp,#2*__SIZEOF_POINTER__] stp x21,x22,[sp,#4*__SIZEOF_POINTER__] stp x23,x24,[sp,#6*__SIZEOF_POINTER__] stp x25,x26,[sp,#8*__SIZEOF_POINTER__] stp x27,x28,[sp,#10*__SIZEOF_POINTER__] stp x3,x0,[sp,#12*__SIZEOF_POINTER__] // __mul_mont_384 wants them there sub sp,sp,#96 // space for 2 384-bit vectors mov x4,x3 // adjust for missing b_ptr ldp x5,x6,[x2] ldp x7,x8,[x2,#16] ldp x9,x10,[x2,#32] add x2,x1,#48 add x0,sp,#0 bl __add_mod_384 // t0 = a->re + a->im add x0,sp,#48 bl __sub_mod_384 // t1 = a->re - a->im ldp x11,x12,[x1] ldr x17, [x2] ldp x13,x14,[x1,#16] ldp x15,x16,[x1,#32] bl __mul_mont_384 // mul_mont_384(ret->im, a->re, a->im) adds x11,x11,x11 // add with itself adcs x12,x12,x12 adcs x13,x13,x13 adcs x14,x14,x14 adcs x15,x15,x15 adcs x16,x16,x16 adc x25,xzr,xzr subs x19,x11,x5 sbcs x20,x12,x6 sbcs x21,x13,x7 sbcs x22,x14,x8 sbcs x23,x15,x9 sbcs x24,x16,x10 sbcs xzr,x25,xzr csel x19,x11,x19,lo csel x20,x12,x20,lo csel x21,x13,x21,lo ldp x11,x12,[sp] csel x22,x14,x22,lo ldr x17, [sp,#48] csel x23,x15,x23,lo ldp x13,x14,[sp,#16] csel x24,x16,x24,lo ldp x15,x16,[sp,#32] stp x19,x20,[x2,#48] stp x21,x22,[x2,#64] stp x23,x24,[x2,#80] add x2,sp,#48 bl __mul_mont_384 // mul_mont_384(ret->re, t0, t1) ldr x30,[x29,#__SIZEOF_POINTER__] stp x11,x12,[x2] stp x13,x14,[x2,#16] stp x15,x16,[x2,#32] add sp,sp,#96 ldp x19,x20,[x29,#2*__SIZEOF_POINTER__] ldp x21,x22,[x29,#4*__SIZEOF_POINTER__] ldp x23,x24,[x29,#6*__SIZEOF_POINTER__] ldp x25,x26,[x29,#8*__SIZEOF_POINTER__] ldp x27,x28,[x29,#10*__SIZEOF_POINTER__] ldr x29,[sp],#16*__SIZEOF_POINTER__ hint #29 ret .globl mul_mont_384 .def mul_mont_384; .type 32; .endef .p2align 5 mul_mont_384: hint #25 stp x29,x30,[sp,#-16*__SIZEOF_POINTER__]! add x29,sp,#0 stp x19,x20,[sp,#2*__SIZEOF_POINTER__] stp x21,x22,[sp,#4*__SIZEOF_POINTER__] stp x23,x24,[sp,#6*__SIZEOF_POINTER__] stp x25,x26,[sp,#8*__SIZEOF_POINTER__] stp x27,x28,[sp,#10*__SIZEOF_POINTER__] stp x4,x0,[sp,#12*__SIZEOF_POINTER__] // __mul_mont_384 wants them there ldp x11,x12,[x1] ldr x17, [x2] ldp x13,x14,[x1,#16] ldp x15,x16,[x1,#32] ldp x5,x6,[x3] ldp x7,x8,[x3,#16] ldp x9,x10,[x3,#32] bl __mul_mont_384 ldr x30,[x29,#__SIZEOF_POINTER__] stp x11,x12,[x2] stp x13,x14,[x2,#16] stp x15,x16,[x2,#32] ldp x19,x20,[x29,#2*__SIZEOF_POINTER__] ldp x21,x22,[x29,#4*__SIZEOF_POINTER__] ldp x23,x24,[x29,#6*__SIZEOF_POINTER__] ldp x25,x26,[x29,#8*__SIZEOF_POINTER__] ldp x27,x28,[x29,#10*__SIZEOF_POINTER__] ldr x29,[sp],#16*__SIZEOF_POINTER__ hint #29 ret .def __mul_mont_384; .type 32; .endef .p2align 5 __mul_mont_384: mul x19,x11,x17 mul x20,x12,x17 mul x21,x13,x17 mul x22,x14,x17 mul x23,x15,x17 mul x24,x16,x17 mul x4,x4,x19 umulh x26,x11,x17 umulh x27,x12,x17 umulh x28,x13,x17 umulh x0,x14,x17 umulh x1,x15,x17 umulh x3,x16,x17 adds x20,x20,x26 // mul x26,x5,x4 adcs x21,x21,x27 mul x27,x6,x4 adcs x22,x22,x28 mul x28,x7,x4 adcs x23,x23,x0 mul x0,x8,x4 adcs x24,x24,x1 mul x1,x9,x4 adc x25,xzr, x3 mul x3,x10,x4 mov x17,xzr subs xzr,x19,#1 // adds x19,x19,x26 umulh x26,x5,x4 adcs x20,x20,x27 umulh x27,x6,x4 adcs x21,x21,x28 umulh x28,x7,x4 adcs x22,x22,x0 umulh x0,x8,x4 adcs x23,x23,x1 umulh x1,x9,x4 adcs x24,x24,x3 umulh x3,x10,x4 adcs x25,x25,xzr adc x4,x17,xzr ldr x17,[x2,8*1] adds x19,x20,x26 mul x26,x11,x17 adcs x20,x21,x27 mul x27,x12,x17 adcs x21,x22,x28 mul x28,x13,x17 adcs x22,x23,x0 mul x0,x14,x17 adcs x23,x24,x1 mul x1,x15,x17 adcs x24,x25,x3 mul x3,x16,x17 adc x25,x4,xzr ldr x4,[x29,#12*__SIZEOF_POINTER__] adds x19,x19,x26 umulh x26,x11,x17 adcs x20,x20,x27 umulh x27,x12,x17 
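// __mul_mont_384 (this routine) interleaves multiplication and reduction,
// one iteration per limb of b: multiply a[] by the current b word (x17,
// fetched with "ldr x17,[x2,8*i]"), accumulate, then run a reduction step
// with m = n0*x19, n0 being reloaded from the frame at
// [x29,#12*__SIZEOF_POINTER__]. "subs xzr,x19,#1" again stands in for the
// elided low-limb addition that is zero by construction. The routine
// finishes with a conditional subtraction and returns the result in
// x11-x16.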
adcs x21,x21,x28 mul x4,x4,x19 umulh x28,x13,x17 adcs x22,x22,x0 umulh x0,x14,x17 adcs x23,x23,x1 umulh x1,x15,x17 adcs x24,x24,x3 umulh x3,x16,x17 adcs x25,x25,xzr adc x17,xzr,xzr adds x20,x20,x26 // mul x26,x5,x4 adcs x21,x21,x27 mul x27,x6,x4 adcs x22,x22,x28 mul x28,x7,x4 adcs x23,x23,x0 mul x0,x8,x4 adcs x24,x24,x1 mul x1,x9,x4 adcs x25,x25,x3 mul x3,x10,x4 adc x17,x17,xzr subs xzr,x19,#1 // adds x19,x19,x26 umulh x26,x5,x4 adcs x20,x20,x27 umulh x27,x6,x4 adcs x21,x21,x28 umulh x28,x7,x4 adcs x22,x22,x0 umulh x0,x8,x4 adcs x23,x23,x1 umulh x1,x9,x4 adcs x24,x24,x3 umulh x3,x10,x4 adcs x25,x25,xzr adc x4,x17,xzr ldr x17,[x2,8*2] adds x19,x20,x26 mul x26,x11,x17 adcs x20,x21,x27 mul x27,x12,x17 adcs x21,x22,x28 mul x28,x13,x17 adcs x22,x23,x0 mul x0,x14,x17 adcs x23,x24,x1 mul x1,x15,x17 adcs x24,x25,x3 mul x3,x16,x17 adc x25,x4,xzr ldr x4,[x29,#12*__SIZEOF_POINTER__] adds x19,x19,x26 umulh x26,x11,x17 adcs x20,x20,x27 umulh x27,x12,x17 adcs x21,x21,x28 mul x4,x4,x19 umulh x28,x13,x17 adcs x22,x22,x0 umulh x0,x14,x17 adcs x23,x23,x1 umulh x1,x15,x17 adcs x24,x24,x3 umulh x3,x16,x17 adcs x25,x25,xzr adc x17,xzr,xzr adds x20,x20,x26 // mul x26,x5,x4 adcs x21,x21,x27 mul x27,x6,x4 adcs x22,x22,x28 mul x28,x7,x4 adcs x23,x23,x0 mul x0,x8,x4 adcs x24,x24,x1 mul x1,x9,x4 adcs x25,x25,x3 mul x3,x10,x4 adc x17,x17,xzr subs xzr,x19,#1 // adds x19,x19,x26 umulh x26,x5,x4 adcs x20,x20,x27 umulh x27,x6,x4 adcs x21,x21,x28 umulh x28,x7,x4 adcs x22,x22,x0 umulh x0,x8,x4 adcs x23,x23,x1 umulh x1,x9,x4 adcs x24,x24,x3 umulh x3,x10,x4 adcs x25,x25,xzr adc x4,x17,xzr ldr x17,[x2,8*3] adds x19,x20,x26 mul x26,x11,x17 adcs x20,x21,x27 mul x27,x12,x17 adcs x21,x22,x28 mul x28,x13,x17 adcs x22,x23,x0 mul x0,x14,x17 adcs x23,x24,x1 mul x1,x15,x17 adcs x24,x25,x3 mul x3,x16,x17 adc x25,x4,xzr ldr x4,[x29,#12*__SIZEOF_POINTER__] adds x19,x19,x26 umulh x26,x11,x17 adcs x20,x20,x27 umulh x27,x12,x17 adcs x21,x21,x28 mul x4,x4,x19 umulh x28,x13,x17 adcs x22,x22,x0 umulh x0,x14,x17 adcs x23,x23,x1 umulh x1,x15,x17 adcs x24,x24,x3 umulh x3,x16,x17 adcs x25,x25,xzr adc x17,xzr,xzr adds x20,x20,x26 // mul x26,x5,x4 adcs x21,x21,x27 mul x27,x6,x4 adcs x22,x22,x28 mul x28,x7,x4 adcs x23,x23,x0 mul x0,x8,x4 adcs x24,x24,x1 mul x1,x9,x4 adcs x25,x25,x3 mul x3,x10,x4 adc x17,x17,xzr subs xzr,x19,#1 // adds x19,x19,x26 umulh x26,x5,x4 adcs x20,x20,x27 umulh x27,x6,x4 adcs x21,x21,x28 umulh x28,x7,x4 adcs x22,x22,x0 umulh x0,x8,x4 adcs x23,x23,x1 umulh x1,x9,x4 adcs x24,x24,x3 umulh x3,x10,x4 adcs x25,x25,xzr adc x4,x17,xzr ldr x17,[x2,8*4] adds x19,x20,x26 mul x26,x11,x17 adcs x20,x21,x27 mul x27,x12,x17 adcs x21,x22,x28 mul x28,x13,x17 adcs x22,x23,x0 mul x0,x14,x17 adcs x23,x24,x1 mul x1,x15,x17 adcs x24,x25,x3 mul x3,x16,x17 adc x25,x4,xzr ldr x4,[x29,#12*__SIZEOF_POINTER__] adds x19,x19,x26 umulh x26,x11,x17 adcs x20,x20,x27 umulh x27,x12,x17 adcs x21,x21,x28 mul x4,x4,x19 umulh x28,x13,x17 adcs x22,x22,x0 umulh x0,x14,x17 adcs x23,x23,x1 umulh x1,x15,x17 adcs x24,x24,x3 umulh x3,x16,x17 adcs x25,x25,xzr adc x17,xzr,xzr adds x20,x20,x26 // mul x26,x5,x4 adcs x21,x21,x27 mul x27,x6,x4 adcs x22,x22,x28 mul x28,x7,x4 adcs x23,x23,x0 mul x0,x8,x4 adcs x24,x24,x1 mul x1,x9,x4 adcs x25,x25,x3 mul x3,x10,x4 adc x17,x17,xzr subs xzr,x19,#1 // adds x19,x19,x26 umulh x26,x5,x4 adcs x20,x20,x27 umulh x27,x6,x4 adcs x21,x21,x28 umulh x28,x7,x4 adcs x22,x22,x0 umulh x0,x8,x4 adcs x23,x23,x1 umulh x1,x9,x4 adcs x24,x24,x3 umulh x3,x10,x4 adcs x25,x25,xzr adc x4,x17,xzr ldr x17,[x2,8*5] adds x19,x20,x26 mul x26,x11,x17 adcs 
x20,x21,x27 mul x27,x12,x17 adcs x21,x22,x28 mul x28,x13,x17 adcs x22,x23,x0 mul x0,x14,x17 adcs x23,x24,x1 mul x1,x15,x17 adcs x24,x25,x3 mul x3,x16,x17 adc x25,x4,xzr ldr x4,[x29,#12*__SIZEOF_POINTER__] adds x19,x19,x26 umulh x26,x11,x17 adcs x20,x20,x27 umulh x27,x12,x17 adcs x21,x21,x28 mul x4,x4,x19 umulh x28,x13,x17 adcs x22,x22,x0 umulh x0,x14,x17 adcs x23,x23,x1 umulh x1,x15,x17 adcs x24,x24,x3 umulh x3,x16,x17 adcs x25,x25,xzr adc x17,xzr,xzr adds x20,x20,x26 // mul x26,x5,x4 adcs x21,x21,x27 mul x27,x6,x4 adcs x22,x22,x28 mul x28,x7,x4 adcs x23,x23,x0 mul x0,x8,x4 adcs x24,x24,x1 mul x1,x9,x4 adcs x25,x25,x3 mul x3,x10,x4 adc x17,x17,xzr subs xzr,x19,#1 // adds x19,x19,x26 umulh x26,x5,x4 adcs x20,x20,x27 umulh x27,x6,x4 adcs x21,x21,x28 umulh x28,x7,x4 adcs x22,x22,x0 umulh x0,x8,x4 adcs x23,x23,x1 umulh x1,x9,x4 adcs x24,x24,x3 umulh x3,x10,x4 adcs x25,x25,xzr ldp x4,x2,[x29,#12*__SIZEOF_POINTER__] // pull r_ptr adc x17,x17,xzr adds x19,x20,x26 adcs x20,x21,x27 adcs x21,x22,x28 adcs x22,x23,x0 adcs x23,x24,x1 adcs x24,x25,x3 adc x25,x17,xzr subs x26,x19,x5 sbcs x27,x20,x6 sbcs x28,x21,x7 sbcs x0,x22,x8 sbcs x1,x23,x9 sbcs x3,x24,x10 sbcs xzr, x25,xzr csel x11,x19,x26,lo csel x12,x20,x27,lo csel x13,x21,x28,lo csel x14,x22,x0,lo csel x15,x23,x1,lo csel x16,x24,x3,lo ret .globl sqr_mont_384 .def sqr_mont_384; .type 32; .endef .p2align 5 sqr_mont_384: hint #25 stp x29,x30,[sp,#-16*__SIZEOF_POINTER__]! add x29,sp,#0 stp x19,x20,[sp,#2*__SIZEOF_POINTER__] stp x21,x22,[sp,#4*__SIZEOF_POINTER__] stp x23,x24,[sp,#6*__SIZEOF_POINTER__] stp x25,x26,[sp,#8*__SIZEOF_POINTER__] stp x27,x28,[sp,#10*__SIZEOF_POINTER__] sub sp,sp,#96 // space for 768-bit vector mov x4,x3 // adjust for missing b_ptr mov x3,x0 // save r_ptr mov x0,sp ldp x11,x12,[x1] ldp x13,x14,[x1,#16] ldp x15,x16,[x1,#32] bl __sqr_384 ldp x5,x6,[x2] ldp x7,x8,[x2,#16] ldp x9,x10,[x2,#32] mov x1,sp mov x0,x3 // restore r_ptr bl __mul_by_1_mont_384 bl __redc_tail_mont_384 ldr x30,[x29,#__SIZEOF_POINTER__] add sp,sp,#96 ldp x19,x20,[x29,#2*__SIZEOF_POINTER__] ldp x21,x22,[x29,#4*__SIZEOF_POINTER__] ldp x23,x24,[x29,#6*__SIZEOF_POINTER__] ldp x25,x26,[x29,#8*__SIZEOF_POINTER__] ldp x27,x28,[x29,#10*__SIZEOF_POINTER__] ldr x29,[sp],#16*__SIZEOF_POINTER__ hint #29 ret .globl sqr_n_mul_mont_383 .def sqr_n_mul_mont_383; .type 32; .endef .p2align 5 sqr_n_mul_mont_383: hint #25 stp x29,x30,[sp,#-16*__SIZEOF_POINTER__]! 
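// sqr_n_mul_mont_383 (this routine): x2 counts repeated Montgomery
// squarings (".Loop_sqr_383"), after which the running value is
// multiplied by the operand at x5 (saved in x17) via a full
// __mul_mont_384. Within the loop the upper half of each square is only
// accumulated onto the reduced lower half, with no conditional
// subtraction; the 383 in the name suggests this redundant form relies on
// the modulus being short enough to leave a spare top bit. Such
// square-n-times-then-multiply sequences are the usual building block of
// addition-chain exponentiation (e.g. Fermat inversion).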
add x29,sp,#0 stp x19,x20,[sp,#2*__SIZEOF_POINTER__] stp x21,x22,[sp,#4*__SIZEOF_POINTER__] stp x23,x24,[sp,#6*__SIZEOF_POINTER__] stp x25,x26,[sp,#8*__SIZEOF_POINTER__] stp x27,x28,[sp,#10*__SIZEOF_POINTER__] stp x4,x0,[sp,#12*__SIZEOF_POINTER__] // __mul_mont_384 wants them there sub sp,sp,#96 // space for 768-bit vector mov x17,x5 // save b_ptr ldp x11,x12,[x1] ldp x13,x14,[x1,#16] ldp x15,x16,[x1,#32] mov x0,sp .Loop_sqr_383: bl __sqr_384 sub x2,x2,#1 // counter ldp x5,x6,[x3] ldp x7,x8,[x3,#16] ldp x9,x10,[x3,#32] mov x1,sp bl __mul_by_1_mont_384 ldp x19,x20,[x1,#48] ldp x21,x22,[x1,#64] ldp x23,x24,[x1,#80] adds x11,x11,x19 // just accumulate upper half adcs x12,x12,x20 adcs x13,x13,x21 adcs x14,x14,x22 adcs x15,x15,x23 adc x16,x16,x24 cbnz x2,.Loop_sqr_383 mov x2,x17 ldr x17,[x17] bl __mul_mont_384 ldr x30,[x29,#__SIZEOF_POINTER__] stp x11,x12,[x2] stp x13,x14,[x2,#16] stp x15,x16,[x2,#32] add sp,sp,#96 ldp x19,x20,[x29,#2*__SIZEOF_POINTER__] ldp x21,x22,[x29,#4*__SIZEOF_POINTER__] ldp x23,x24,[x29,#6*__SIZEOF_POINTER__] ldp x25,x26,[x29,#8*__SIZEOF_POINTER__] ldp x27,x28,[x29,#10*__SIZEOF_POINTER__] ldr x29,[sp],#16*__SIZEOF_POINTER__ hint #29 ret .def __sqr_384; .type 32; .endef .p2align 5 __sqr_384: mul x19,x12,x11 mul x20,x13,x11 mul x21,x14,x11 mul x22,x15,x11 mul x23,x16,x11 umulh x6,x12,x11 umulh x7,x13,x11 umulh x8,x14,x11 umulh x9,x15,x11 adds x20,x20,x6 umulh x10,x16,x11 adcs x21,x21,x7 mul x7,x13,x12 adcs x22,x22,x8 mul x8,x14,x12 adcs x23,x23,x9 mul x9,x15,x12 adc x24,xzr, x10 mul x10,x16,x12 adds x21,x21,x7 umulh x7,x13,x12 adcs x22,x22,x8 umulh x8,x14,x12 adcs x23,x23,x9 umulh x9,x15,x12 adcs x24,x24,x10 umulh x10,x16,x12 adc x25,xzr,xzr mul x5,x11,x11 adds x22,x22,x7 umulh x11, x11,x11 adcs x23,x23,x8 mul x8,x14,x13 adcs x24,x24,x9 mul x9,x15,x13 adc x25,x25,x10 mul x10,x16,x13 adds x23,x23,x8 umulh x8,x14,x13 adcs x24,x24,x9 umulh x9,x15,x13 adcs x25,x25,x10 umulh x10,x16,x13 adc x26,xzr,xzr mul x6,x12,x12 adds x24,x24,x8 umulh x12, x12,x12 adcs x25,x25,x9 mul x9,x15,x14 adc x26,x26,x10 mul x10,x16,x14 adds x25,x25,x9 umulh x9,x15,x14 adcs x26,x26,x10 umulh x10,x16,x14 adc x27,xzr,xzr mul x7,x13,x13 adds x26,x26,x9 umulh x13, x13,x13 adc x27,x27,x10 mul x8,x14,x14 mul x10,x16,x15 umulh x14, x14,x14 adds x27,x27,x10 umulh x10,x16,x15 mul x9,x15,x15 adc x28,x10,xzr adds x19,x19,x19 adcs x20,x20,x20 adcs x21,x21,x21 adcs x22,x22,x22 adcs x23,x23,x23 adcs x24,x24,x24 adcs x25,x25,x25 adcs x26,x26,x26 umulh x15, x15,x15 adcs x27,x27,x27 mul x10,x16,x16 adcs x28,x28,x28 umulh x16, x16,x16 adc x1,xzr,xzr adds x19,x19,x11 adcs x20,x20,x6 adcs x21,x21,x12 adcs x22,x22,x7 adcs x23,x23,x13 adcs x24,x24,x8 adcs x25,x25,x14 stp x5,x19,[x0] adcs x26,x26,x9 stp x20,x21,[x0,#16] adcs x27,x27,x15 stp x22,x23,[x0,#32] adcs x28,x28,x10 stp x24,x25,[x0,#48] adc x16,x16,x1 stp x26,x27,[x0,#64] stp x28,x16,[x0,#80] ret .globl sqr_384 .def sqr_384; .type 32; .endef .p2align 5 sqr_384: hint #25 stp x29,x30,[sp,#-16*__SIZEOF_POINTER__]! 
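// __sqr_384 above uses the same strategy as the 256-bit squaring:
// off-diagonal products a[i]*a[j], i < j, are accumulated, doubled, and
// the diagonal squares added in, leaving the full 768-bit result at [x0].
// sqr_384 (this routine) is the bare wrapper exposing it with no
// reduction.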
add x29,sp,#0 stp x19,x20,[sp,#2*__SIZEOF_POINTER__] stp x21,x22,[sp,#4*__SIZEOF_POINTER__] stp x23,x24,[sp,#6*__SIZEOF_POINTER__] stp x25,x26,[sp,#8*__SIZEOF_POINTER__] stp x27,x28,[sp,#10*__SIZEOF_POINTER__] ldp x11,x12,[x1] ldp x13,x14,[x1,#16] ldp x15,x16,[x1,#32] bl __sqr_384 ldr x30,[x29,#__SIZEOF_POINTER__] ldp x19,x20,[x29,#2*__SIZEOF_POINTER__] ldp x21,x22,[x29,#4*__SIZEOF_POINTER__] ldp x23,x24,[x29,#6*__SIZEOF_POINTER__] ldp x25,x26,[x29,#8*__SIZEOF_POINTER__] ldp x27,x28,[x29,#10*__SIZEOF_POINTER__] ldr x29,[sp],#16*__SIZEOF_POINTER__ hint #29 ret .globl redc_mont_384 .def redc_mont_384; .type 32; .endef .p2align 5 redc_mont_384: hint #25 stp x29,x30,[sp,#-16*__SIZEOF_POINTER__]! add x29,sp,#0 stp x19,x20,[sp,#2*__SIZEOF_POINTER__] stp x21,x22,[sp,#4*__SIZEOF_POINTER__] stp x23,x24,[sp,#6*__SIZEOF_POINTER__] stp x25,x26,[sp,#8*__SIZEOF_POINTER__] stp x27,x28,[sp,#10*__SIZEOF_POINTER__] mov x4,x3 // adjust for missing b_ptr ldp x5,x6,[x2] ldp x7,x8,[x2,#16] ldp x9,x10,[x2,#32] bl __mul_by_1_mont_384 bl __redc_tail_mont_384 ldr x30,[x29,#__SIZEOF_POINTER__] ldp x19,x20,[x29,#2*__SIZEOF_POINTER__] ldp x21,x22,[x29,#4*__SIZEOF_POINTER__] ldp x23,x24,[x29,#6*__SIZEOF_POINTER__] ldp x25,x26,[x29,#8*__SIZEOF_POINTER__] ldp x27,x28,[x29,#10*__SIZEOF_POINTER__] ldr x29,[sp],#16*__SIZEOF_POINTER__ hint #29 ret .globl from_mont_384 .def from_mont_384; .type 32; .endef .p2align 5 from_mont_384: hint #25 stp x29,x30,[sp,#-16*__SIZEOF_POINTER__]! add x29,sp,#0 stp x19,x20,[sp,#2*__SIZEOF_POINTER__] stp x21,x22,[sp,#4*__SIZEOF_POINTER__] stp x23,x24,[sp,#6*__SIZEOF_POINTER__] stp x25,x26,[sp,#8*__SIZEOF_POINTER__] stp x27,x28,[sp,#10*__SIZEOF_POINTER__] mov x4,x3 // adjust for missing b_ptr ldp x5,x6,[x2] ldp x7,x8,[x2,#16] ldp x9,x10,[x2,#32] bl __mul_by_1_mont_384 ldr x30,[x29,#__SIZEOF_POINTER__] subs x19,x11,x5 sbcs x20,x12,x6 sbcs x21,x13,x7 sbcs x22,x14,x8 sbcs x23,x15,x9 sbcs x24,x16,x10 csel x11,x11,x19,lo csel x12,x12,x20,lo csel x13,x13,x21,lo csel x14,x14,x22,lo csel x15,x15,x23,lo csel x16,x16,x24,lo stp x11,x12,[x0] stp x13,x14,[x0,#16] stp x15,x16,[x0,#32] ldp x19,x20,[x29,#2*__SIZEOF_POINTER__] ldp x21,x22,[x29,#4*__SIZEOF_POINTER__] ldp x23,x24,[x29,#6*__SIZEOF_POINTER__] ldp x25,x26,[x29,#8*__SIZEOF_POINTER__] ldp x27,x28,[x29,#10*__SIZEOF_POINTER__] ldr x29,[sp],#16*__SIZEOF_POINTER__ hint #29 ret .def __mul_by_1_mont_384; .type 32; .endef .p2align 5 __mul_by_1_mont_384: ldp x11,x12,[x1] ldp x13,x14,[x1,#16] mul x26,x4,x11 ldp x15,x16,[x1,#32] // mul x19,x5,x26 mul x20,x6,x26 mul x21,x7,x26 mul x22,x8,x26 mul x23,x9,x26 mul x24,x10,x26 subs xzr,x11,#1 // adds x19,x19,x11 umulh x11,x5,x26 adcs x20,x20,x12 umulh x12,x6,x26 adcs x21,x21,x13 umulh x13,x7,x26 adcs x22,x22,x14 umulh x14,x8,x26 adcs x23,x23,x15 umulh x15,x9,x26 adcs x24,x24,x16 umulh x16,x10,x26 adc x25,xzr,xzr adds x11,x11,x20 adcs x12,x12,x21 adcs x13,x13,x22 mul x26,x4,x11 adcs x14,x14,x23 adcs x15,x15,x24 adc x16,x16,x25 // mul x19,x5,x26 mul x20,x6,x26 mul x21,x7,x26 mul x22,x8,x26 mul x23,x9,x26 mul x24,x10,x26 subs xzr,x11,#1 // adds x19,x19,x11 umulh x11,x5,x26 adcs x20,x20,x12 umulh x12,x6,x26 adcs x21,x21,x13 umulh x13,x7,x26 adcs x22,x22,x14 umulh x14,x8,x26 adcs x23,x23,x15 umulh x15,x9,x26 adcs x24,x24,x16 umulh x16,x10,x26 adc x25,xzr,xzr adds x11,x11,x20 adcs x12,x12,x21 adcs x13,x13,x22 mul x26,x4,x11 adcs x14,x14,x23 adcs x15,x15,x24 adc x16,x16,x25 // mul x19,x5,x26 mul x20,x6,x26 mul x21,x7,x26 mul x22,x8,x26 mul x23,x9,x26 mul x24,x10,x26 subs xzr,x11,#1 // adds x19,x19,x11 umulh x11,x5,x26 adcs 
x20,x20,x12 umulh x12,x6,x26 adcs x21,x21,x13 umulh x13,x7,x26 adcs x22,x22,x14 umulh x14,x8,x26 adcs x23,x23,x15 umulh x15,x9,x26 adcs x24,x24,x16 umulh x16,x10,x26 adc x25,xzr,xzr adds x11,x11,x20 adcs x12,x12,x21 adcs x13,x13,x22 mul x26,x4,x11 adcs x14,x14,x23 adcs x15,x15,x24 adc x16,x16,x25 // mul x19,x5,x26 mul x20,x6,x26 mul x21,x7,x26 mul x22,x8,x26 mul x23,x9,x26 mul x24,x10,x26 subs xzr,x11,#1 // adds x19,x19,x11 umulh x11,x5,x26 adcs x20,x20,x12 umulh x12,x6,x26 adcs x21,x21,x13 umulh x13,x7,x26 adcs x22,x22,x14 umulh x14,x8,x26 adcs x23,x23,x15 umulh x15,x9,x26 adcs x24,x24,x16 umulh x16,x10,x26 adc x25,xzr,xzr adds x11,x11,x20 adcs x12,x12,x21 adcs x13,x13,x22 mul x26,x4,x11 adcs x14,x14,x23 adcs x15,x15,x24 adc x16,x16,x25 // mul x19,x5,x26 mul x20,x6,x26 mul x21,x7,x26 mul x22,x8,x26 mul x23,x9,x26 mul x24,x10,x26 subs xzr,x11,#1 // adds x19,x19,x11 umulh x11,x5,x26 adcs x20,x20,x12 umulh x12,x6,x26 adcs x21,x21,x13 umulh x13,x7,x26 adcs x22,x22,x14 umulh x14,x8,x26 adcs x23,x23,x15 umulh x15,x9,x26 adcs x24,x24,x16 umulh x16,x10,x26 adc x25,xzr,xzr adds x11,x11,x20 adcs x12,x12,x21 adcs x13,x13,x22 mul x26,x4,x11 adcs x14,x14,x23 adcs x15,x15,x24 adc x16,x16,x25 // mul x19,x5,x26 mul x20,x6,x26 mul x21,x7,x26 mul x22,x8,x26 mul x23,x9,x26 mul x24,x10,x26 subs xzr,x11,#1 // adds x19,x19,x11 umulh x11,x5,x26 adcs x20,x20,x12 umulh x12,x6,x26 adcs x21,x21,x13 umulh x13,x7,x26 adcs x22,x22,x14 umulh x14,x8,x26 adcs x23,x23,x15 umulh x15,x9,x26 adcs x24,x24,x16 umulh x16,x10,x26 adc x25,xzr,xzr adds x11,x11,x20 adcs x12,x12,x21 adcs x13,x13,x22 adcs x14,x14,x23 adcs x15,x15,x24 adc x16,x16,x25 ret .def __redc_tail_mont_384; .type 32; .endef .p2align 5 __redc_tail_mont_384: ldp x19,x20,[x1,#48] ldp x21,x22,[x1,#64] ldp x23,x24,[x1,#80] adds x11,x11,x19 // accumulate upper half adcs x12,x12,x20 adcs x13,x13,x21 adcs x14,x14,x22 adcs x15,x15,x23 adcs x16,x16,x24 adc x25,xzr,xzr subs x19,x11,x5 sbcs x20,x12,x6 sbcs x21,x13,x7 sbcs x22,x14,x8 sbcs x23,x15,x9 sbcs x24,x16,x10 sbcs xzr,x25,xzr csel x11,x11,x19,lo csel x12,x12,x20,lo csel x13,x13,x21,lo csel x14,x14,x22,lo csel x15,x15,x23,lo csel x16,x16,x24,lo stp x11,x12,[x0] stp x13,x14,[x0,#16] stp x15,x16,[x0,#32] ret .globl mul_384 .def mul_384; .type 32; .endef .p2align 5 mul_384: hint #25 stp x29,x30,[sp,#-16*__SIZEOF_POINTER__]! 
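// mul_384 (this routine) exposes __mul_384: a plain 384x384 -> 768-bit
// schoolbook product, one pass per b limb, with no reduction. A minimal C
// sketch of the same product (illustrative, not this file's interface):
//
//   void mul_384_ref(uint64_t r[12], const uint64_t a[6],
//                    const uint64_t b[6])
//   {
//       for (size_t i = 0; i < 12; i++) r[i] = 0;
//       for (size_t i = 0; i < 6; i++) {      /* one pass per b limb   */
//           unsigned __int128 c = 0;
//           for (size_t j = 0; j < 6; j++) {
//               c += (unsigned __int128)a[j] * b[i] + r[i+j];
//               r[i+j] = (uint64_t)c;
//               c >>= 64;
//           }
//           r[i+6] = (uint64_t)c;             /* carry out of the row  */
//       }
//   }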
add x29,sp,#0 stp x19,x20,[sp,#2*__SIZEOF_POINTER__] stp x21,x22,[sp,#4*__SIZEOF_POINTER__] stp x23,x24,[sp,#6*__SIZEOF_POINTER__] stp x25,x26,[sp,#8*__SIZEOF_POINTER__] stp x27,x28,[sp,#10*__SIZEOF_POINTER__] bl __mul_384 ldr x30,[x29,#__SIZEOF_POINTER__] ldp x19,x20,[x29,#2*__SIZEOF_POINTER__] ldp x21,x22,[x29,#4*__SIZEOF_POINTER__] ldp x23,x24,[x29,#6*__SIZEOF_POINTER__] ldp x25,x26,[x29,#8*__SIZEOF_POINTER__] ldp x27,x28,[x29,#10*__SIZEOF_POINTER__] ldr x29,[sp],#16*__SIZEOF_POINTER__ hint #29 ret .def __mul_384; .type 32; .endef .p2align 5 __mul_384: ldp x11,x12,[x1] ldr x17, [x2] ldp x13,x14,[x1,#16] ldp x15,x16,[x1,#32] mul x19,x11,x17 mul x20,x12,x17 mul x21,x13,x17 mul x22,x14,x17 mul x23,x15,x17 mul x24,x16,x17 umulh x5,x11,x17 umulh x6,x12,x17 umulh x7,x13,x17 umulh x8,x14,x17 umulh x9,x15,x17 umulh x10,x16,x17 ldr x17,[x2,8*1] str x19,[x0] adds x19,x20,x5 mul x5,x11,x17 adcs x20,x21,x6 mul x6,x12,x17 adcs x21,x22,x7 mul x7,x13,x17 adcs x22,x23,x8 mul x8,x14,x17 adcs x23,x24,x9 mul x9,x15,x17 adc x24,xzr, x10 mul x10,x16,x17 adds x19,x19,x5 umulh x5,x11,x17 adcs x20,x20,x6 umulh x6,x12,x17 adcs x21,x21,x7 umulh x7,x13,x17 adcs x22,x22,x8 umulh x8,x14,x17 adcs x23,x23,x9 umulh x9,x15,x17 adcs x24,x24,x10 umulh x10,x16,x17 ldr x17,[x2,#8*(1+1)] adc x25,xzr,xzr str x19,[x0,8*1] adds x19,x20,x5 mul x5,x11,x17 adcs x20,x21,x6 mul x6,x12,x17 adcs x21,x22,x7 mul x7,x13,x17 adcs x22,x23,x8 mul x8,x14,x17 adcs x23,x24,x9 mul x9,x15,x17 adc x24,x25,x10 mul x10,x16,x17 adds x19,x19,x5 umulh x5,x11,x17 adcs x20,x20,x6 umulh x6,x12,x17 adcs x21,x21,x7 umulh x7,x13,x17 adcs x22,x22,x8 umulh x8,x14,x17 adcs x23,x23,x9 umulh x9,x15,x17 adcs x24,x24,x10 umulh x10,x16,x17 ldr x17,[x2,#8*(2+1)] adc x25,xzr,xzr str x19,[x0,8*2] adds x19,x20,x5 mul x5,x11,x17 adcs x20,x21,x6 mul x6,x12,x17 adcs x21,x22,x7 mul x7,x13,x17 adcs x22,x23,x8 mul x8,x14,x17 adcs x23,x24,x9 mul x9,x15,x17 adc x24,x25,x10 mul x10,x16,x17 adds x19,x19,x5 umulh x5,x11,x17 adcs x20,x20,x6 umulh x6,x12,x17 adcs x21,x21,x7 umulh x7,x13,x17 adcs x22,x22,x8 umulh x8,x14,x17 adcs x23,x23,x9 umulh x9,x15,x17 adcs x24,x24,x10 umulh x10,x16,x17 ldr x17,[x2,#8*(3+1)] adc x25,xzr,xzr str x19,[x0,8*3] adds x19,x20,x5 mul x5,x11,x17 adcs x20,x21,x6 mul x6,x12,x17 adcs x21,x22,x7 mul x7,x13,x17 adcs x22,x23,x8 mul x8,x14,x17 adcs x23,x24,x9 mul x9,x15,x17 adc x24,x25,x10 mul x10,x16,x17 adds x19,x19,x5 umulh x5,x11,x17 adcs x20,x20,x6 umulh x6,x12,x17 adcs x21,x21,x7 umulh x7,x13,x17 adcs x22,x22,x8 umulh x8,x14,x17 adcs x23,x23,x9 umulh x9,x15,x17 adcs x24,x24,x10 umulh x10,x16,x17 ldr x17,[x2,#8*(4+1)] adc x25,xzr,xzr str x19,[x0,8*4] adds x19,x20,x5 mul x5,x11,x17 adcs x20,x21,x6 mul x6,x12,x17 adcs x21,x22,x7 mul x7,x13,x17 adcs x22,x23,x8 mul x8,x14,x17 adcs x23,x24,x9 mul x9,x15,x17 adc x24,x25,x10 mul x10,x16,x17 adds x19,x19,x5 umulh x5,x11,x17 adcs x20,x20,x6 umulh x6,x12,x17 adcs x21,x21,x7 umulh x7,x13,x17 adcs x22,x22,x8 umulh x8,x14,x17 adcs x23,x23,x9 umulh x9,x15,x17 adcs x24,x24,x10 umulh x10,x16,x17 adc x25,xzr,xzr str x19,[x0,8*5] adds x19,x20,x5 adcs x20,x21,x6 adcs x21,x22,x7 adcs x22,x23,x8 adcs x23,x24,x9 adc x24,x25,x10 stp x19,x20,[x0,#48] stp x21,x22,[x0,#64] stp x23,x24,[x0,#80] ret .globl mul_382x .def mul_382x; .type 32; .endef .p2align 5 mul_382x: hint #25 stp x29,x30,[sp,#-16*__SIZEOF_POINTER__]! 
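// mul_382x (this routine): Karatsuba-style complex multiplication, three
// __mul_384 calls instead of four, on lazily-reduced 768-bit products:
//
//   t0 = a->re + a->im          t1 = b->re + b->im    (no reduction)
//   rr = a->re * b->re          ii = a->im * b->im    mid = t0 * t1
//   ret->im = mid - rr - ii     ret->re = rr - ii
//
// with the subtractions done by __sub_mod_384x384. The 382 in the name
// hints that the operands need spare top bits for the unreduced sums.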
add x29,sp,#0 stp x19,x20,[sp,#2*__SIZEOF_POINTER__] stp x21,x22,[sp,#4*__SIZEOF_POINTER__] stp x23,x24,[sp,#6*__SIZEOF_POINTER__] stp x25,x26,[sp,#8*__SIZEOF_POINTER__] stp x27,x28,[sp,#10*__SIZEOF_POINTER__] sub sp,sp,#96 // space for two 384-bit vectors ldp x11,x12,[x1] mov x26,x0 // save r_ptr ldp x19,x20,[x1,#48] mov x27,x1 // save a_ptr ldp x13,x14,[x1,#16] mov x28,x2 // save b_ptr ldp x21,x22,[x1,#64] ldp x15,x16,[x1,#32] adds x5,x11,x19 // t0 = a->re + a->im ldp x23,x24,[x1,#80] adcs x6,x12,x20 ldp x11,x12,[x2] adcs x7,x13,x21 ldp x19,x20,[x2,#48] adcs x8,x14,x22 ldp x13,x14,[x2,#16] adcs x9,x15,x23 ldp x21,x22,[x2,#64] adc x10,x16,x24 ldp x15,x16,[x2,#32] stp x5,x6,[sp] adds x5,x11,x19 // t1 = b->re + b->im ldp x23,x24,[x2,#80] adcs x6,x12,x20 stp x7,x8,[sp,#16] adcs x7,x13,x21 adcs x8,x14,x22 stp x9,x10,[sp,#32] adcs x9,x15,x23 stp x5,x6,[sp,#48] adc x10,x16,x24 stp x7,x8,[sp,#64] stp x9,x10,[sp,#80] bl __mul_384 // mul_384(ret->re, a->re, b->re) add x1,sp,#0 add x2,sp,#48 add x0,x26,#96 bl __mul_384 add x1,x27,#48 add x2,x28,#48 add x0,sp,#0 bl __mul_384 ldp x5,x6,[x3] ldp x7,x8,[x3,#16] ldp x9,x10,[x3,#32] add x1,x26,#96 add x2,sp,#0 add x0,x26,#96 bl __sub_mod_384x384 add x2,x26,#0 bl __sub_mod_384x384 add x1,x26,#0 add x2,sp,#0 add x0,x26,#0 bl __sub_mod_384x384 ldr x30,[x29,#__SIZEOF_POINTER__] add sp,sp,#96 ldp x19,x20,[x29,#2*__SIZEOF_POINTER__] ldp x21,x22,[x29,#4*__SIZEOF_POINTER__] ldp x23,x24,[x29,#6*__SIZEOF_POINTER__] ldp x25,x26,[x29,#8*__SIZEOF_POINTER__] ldp x27,x28,[x29,#10*__SIZEOF_POINTER__] ldr x29,[sp],#16*__SIZEOF_POINTER__ hint #29 ret .globl sqr_382x .def sqr_382x; .type 32; .endef .p2align 5 sqr_382x: hint #25 stp x29,x30,[sp,#-16*__SIZEOF_POINTER__]! add x29,sp,#0 stp x19,x20,[sp,#2*__SIZEOF_POINTER__] stp x21,x22,[sp,#4*__SIZEOF_POINTER__] stp x23,x24,[sp,#6*__SIZEOF_POINTER__] stp x25,x26,[sp,#8*__SIZEOF_POINTER__] stp x27,x28,[sp,#10*__SIZEOF_POINTER__] ldp x11,x12,[x1] ldp x19,x20,[x1,#48] ldp x13,x14,[x1,#16] adds x5,x11,x19 // t0 = a->re + a->im ldp x21,x22,[x1,#64] adcs x6,x12,x20 ldp x15,x16,[x1,#32] adcs x7,x13,x21 ldp x23,x24,[x1,#80] adcs x8,x14,x22 stp x5,x6,[x0] adcs x9,x15,x23 ldp x5,x6,[x2] adc x10,x16,x24 stp x7,x8,[x0,#16] subs x11,x11,x19 // t1 = a->re - a->im ldp x7,x8,[x2,#16] sbcs x12,x12,x20 stp x9,x10,[x0,#32] sbcs x13,x13,x21 ldp x9,x10,[x2,#32] sbcs x14,x14,x22 sbcs x15,x15,x23 sbcs x16,x16,x24 sbc x25,xzr,xzr and x19,x5,x25 and x20,x6,x25 adds x11,x11,x19 and x21,x7,x25 adcs x12,x12,x20 and x22,x8,x25 adcs x13,x13,x21 and x23,x9,x25 adcs x14,x14,x22 and x24,x10,x25 adcs x15,x15,x23 stp x11,x12,[x0,#48] adc x16,x16,x24 stp x13,x14,[x0,#64] stp x15,x16,[x0,#80] mov x4,x1 // save a_ptr add x1,x0,#0 add x2,x0,#48 bl __mul_384 add x1,x4,#0 add x2,x4,#48 add x0,x0,#96 bl __mul_384 ldr x30,[x29,#__SIZEOF_POINTER__] ldp x11,x12,[x0] ldp x13,x14,[x0,#16] adds x11,x11,x11 // add with itself ldp x15,x16,[x0,#32] adcs x12,x12,x12 adcs x13,x13,x13 adcs x14,x14,x14 adcs x15,x15,x15 adcs x16,x16,x16 adcs x19,x19,x19 adcs x20,x20,x20 stp x11,x12,[x0] adcs x21,x21,x21 stp x13,x14,[x0,#16] adcs x22,x22,x22 stp x15,x16,[x0,#32] adcs x23,x23,x23 stp x19,x20,[x0,#48] adc x24,x24,x24 stp x21,x22,[x0,#64] stp x23,x24,[x0,#80] ldp x19,x20,[x29,#2*__SIZEOF_POINTER__] ldp x21,x22,[x29,#4*__SIZEOF_POINTER__] ldp x23,x24,[x29,#6*__SIZEOF_POINTER__] ldp x25,x26,[x29,#8*__SIZEOF_POINTER__] ldp x27,x28,[x29,#10*__SIZEOF_POINTER__] ldr x29,[sp],#16*__SIZEOF_POINTER__ hint #29 ret .globl sqr_mont_382x .def sqr_mont_382x; .type 32; .endef .p2align 5 
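// sqr_mont_382x below combines the complex-squaring identity of
// sqr_mont_384x with __mul_mont_383_nonred, a variant of __mul_mont_384
// that omits the final conditional subtraction. Because a->re - a->im is
// formed without modular correction, its borrow mask is stashed at
// [sp,#96] and used afterwards to patch the product ("account for sign
// from a->re - a->im").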
sqr_mont_382x: hint #25 stp x29,x30,[sp,#-16*__SIZEOF_POINTER__]! add x29,sp,#0 stp x19,x20,[sp,#2*__SIZEOF_POINTER__] stp x21,x22,[sp,#4*__SIZEOF_POINTER__] stp x23,x24,[sp,#6*__SIZEOF_POINTER__] stp x25,x26,[sp,#8*__SIZEOF_POINTER__] stp x27,x28,[sp,#10*__SIZEOF_POINTER__] stp x3,x0,[sp,#12*__SIZEOF_POINTER__] // __mul_mont_384 wants them there sub sp,sp,#112 // space for two 384-bit vectors + word mov x4,x3 // adjust for missing b_ptr ldp x11,x12,[x1] ldp x13,x14,[x1,#16] ldp x15,x16,[x1,#32] ldp x17,x20,[x1,#48] ldp x21,x22,[x1,#64] ldp x23,x24,[x1,#80] adds x5,x11,x17 // t0 = a->re + a->im adcs x6,x12,x20 adcs x7,x13,x21 adcs x8,x14,x22 adcs x9,x15,x23 adc x10,x16,x24 subs x19,x11,x17 // t1 = a->re - a->im sbcs x20,x12,x20 sbcs x21,x13,x21 sbcs x22,x14,x22 sbcs x23,x15,x23 sbcs x24,x16,x24 sbc x25,xzr,xzr // borrow flag as mask stp x5,x6,[sp] stp x7,x8,[sp,#16] stp x9,x10,[sp,#32] stp x19,x20,[sp,#48] stp x21,x22,[sp,#64] stp x23,x24,[sp,#80] str x25,[sp,#96] ldp x5,x6,[x2] ldp x7,x8,[x2,#16] ldp x9,x10,[x2,#32] add x2,x1,#48 bl __mul_mont_383_nonred // mul_mont_384(ret->im, a->re, a->im) adds x19,x11,x11 // add with itself adcs x20,x12,x12 adcs x21,x13,x13 adcs x22,x14,x14 adcs x23,x15,x15 adc x24,x16,x16 stp x19,x20,[x2,#48] stp x21,x22,[x2,#64] stp x23,x24,[x2,#80] ldp x11,x12,[sp] ldr x17,[sp,#48] ldp x13,x14,[sp,#16] ldp x15,x16,[sp,#32] add x2,sp,#48 bl __mul_mont_383_nonred // mul_mont_384(ret->im, t0, t1) ldr x30,[x29,#__SIZEOF_POINTER__] ldr x25,[sp,#96] // account for sign from a->re - a->im ldp x19,x20,[sp] ldp x21,x22,[sp,#16] ldp x23,x24,[sp,#32] and x19,x19,x25 and x20,x20,x25 and x21,x21,x25 and x22,x22,x25 and x23,x23,x25 and x24,x24,x25 subs x11,x11,x19 sbcs x12,x12,x20 sbcs x13,x13,x21 sbcs x14,x14,x22 sbcs x15,x15,x23 sbcs x16,x16,x24 sbc x25,xzr,xzr and x19,x5,x25 and x20,x6,x25 and x21,x7,x25 and x22,x8,x25 and x23,x9,x25 and x24,x10,x25 adds x11,x11,x19 adcs x12,x12,x20 adcs x13,x13,x21 adcs x14,x14,x22 adcs x15,x15,x23 adc x16,x16,x24 stp x11,x12,[x2] stp x13,x14,[x2,#16] stp x15,x16,[x2,#32] add sp,sp,#112 ldp x19,x20,[x29,#2*__SIZEOF_POINTER__] ldp x21,x22,[x29,#4*__SIZEOF_POINTER__] ldp x23,x24,[x29,#6*__SIZEOF_POINTER__] ldp x25,x26,[x29,#8*__SIZEOF_POINTER__] ldp x27,x28,[x29,#10*__SIZEOF_POINTER__] ldr x29,[sp],#16*__SIZEOF_POINTER__ hint #29 ret .def __mul_mont_383_nonred; .type 32; .endef .p2align 5 __mul_mont_383_nonred: mul x19,x11,x17 mul x20,x12,x17 mul x21,x13,x17 mul x22,x14,x17 mul x23,x15,x17 mul x24,x16,x17 mul x4,x4,x19 umulh x26,x11,x17 umulh x27,x12,x17 umulh x28,x13,x17 umulh x0,x14,x17 umulh x1,x15,x17 umulh x3,x16,x17 adds x20,x20,x26 mul x26,x5,x4 adcs x21,x21,x27 mul x27,x6,x4 adcs x22,x22,x28 mul x28,x7,x4 adcs x23,x23,x0 mul x0,x8,x4 adcs x24,x24,x1 mul x1,x9,x4 adc x25,xzr, x3 mul x3,x10,x4 ldr x17,[x2,8*1] adds x19,x19,x26 umulh x26,x5,x4 adcs x20,x20,x27 umulh x27,x6,x4 adcs x21,x21,x28 umulh x28,x7,x4 adcs x22,x22,x0 umulh x0,x8,x4 adcs x23,x23,x1 umulh x1,x9,x4 adcs x24,x24,x3 umulh x3,x10,x4 adc x25,x25,xzr ldr x4,[x29,#12*__SIZEOF_POINTER__] adds x19,x20,x26 mul x26,x11,x17 adcs x20,x21,x27 mul x27,x12,x17 adcs x21,x22,x28 mul x28,x13,x17 adcs x22,x23,x0 mul x0,x14,x17 adcs x23,x24,x1 mul x1,x15,x17 adcs x24,x25,x3 mul x3,x16,x17 adc x25,xzr,xzr adds x19,x19,x26 umulh x26,x11,x17 adcs x20,x20,x27 umulh x27,x12,x17 adcs x21,x21,x28 mul x4,x4,x19 umulh x28,x13,x17 adcs x22,x22,x0 umulh x0,x14,x17 adcs x23,x23,x1 umulh x1,x15,x17 adcs x24,x24,x3 umulh x3,x16,x17 adc x25,x25,xzr adds x20,x20,x26 mul x26,x5,x4 adcs x21,x21,x27 mul 
x27,x6,x4 adcs x22,x22,x28 mul x28,x7,x4 adcs x23,x23,x0 mul x0,x8,x4 adcs x24,x24,x1 mul x1,x9,x4 adc x25,x25,x3 mul x3,x10,x4 ldr x17,[x2,8*2] adds x19,x19,x26 umulh x26,x5,x4 adcs x20,x20,x27 umulh x27,x6,x4 adcs x21,x21,x28 umulh x28,x7,x4 adcs x22,x22,x0 umulh x0,x8,x4 adcs x23,x23,x1 umulh x1,x9,x4 adcs x24,x24,x3 umulh x3,x10,x4 adc x25,x25,xzr ldr x4,[x29,#12*__SIZEOF_POINTER__] adds x19,x20,x26 mul x26,x11,x17 adcs x20,x21,x27 mul x27,x12,x17 adcs x21,x22,x28 mul x28,x13,x17 adcs x22,x23,x0 mul x0,x14,x17 adcs x23,x24,x1 mul x1,x15,x17 adcs x24,x25,x3 mul x3,x16,x17 adc x25,xzr,xzr adds x19,x19,x26 umulh x26,x11,x17 adcs x20,x20,x27 umulh x27,x12,x17 adcs x21,x21,x28 mul x4,x4,x19 umulh x28,x13,x17 adcs x22,x22,x0 umulh x0,x14,x17 adcs x23,x23,x1 umulh x1,x15,x17 adcs x24,x24,x3 umulh x3,x16,x17 adc x25,x25,xzr adds x20,x20,x26 mul x26,x5,x4 adcs x21,x21,x27 mul x27,x6,x4 adcs x22,x22,x28 mul x28,x7,x4 adcs x23,x23,x0 mul x0,x8,x4 adcs x24,x24,x1 mul x1,x9,x4 adc x25,x25,x3 mul x3,x10,x4 ldr x17,[x2,8*3] adds x19,x19,x26 umulh x26,x5,x4 adcs x20,x20,x27 umulh x27,x6,x4 adcs x21,x21,x28 umulh x28,x7,x4 adcs x22,x22,x0 umulh x0,x8,x4 adcs x23,x23,x1 umulh x1,x9,x4 adcs x24,x24,x3 umulh x3,x10,x4 adc x25,x25,xzr ldr x4,[x29,#12*__SIZEOF_POINTER__] adds x19,x20,x26 mul x26,x11,x17 adcs x20,x21,x27 mul x27,x12,x17 adcs x21,x22,x28 mul x28,x13,x17 adcs x22,x23,x0 mul x0,x14,x17 adcs x23,x24,x1 mul x1,x15,x17 adcs x24,x25,x3 mul x3,x16,x17 adc x25,xzr,xzr adds x19,x19,x26 umulh x26,x11,x17 adcs x20,x20,x27 umulh x27,x12,x17 adcs x21,x21,x28 mul x4,x4,x19 umulh x28,x13,x17 adcs x22,x22,x0 umulh x0,x14,x17 adcs x23,x23,x1 umulh x1,x15,x17 adcs x24,x24,x3 umulh x3,x16,x17 adc x25,x25,xzr adds x20,x20,x26 mul x26,x5,x4 adcs x21,x21,x27 mul x27,x6,x4 adcs x22,x22,x28 mul x28,x7,x4 adcs x23,x23,x0 mul x0,x8,x4 adcs x24,x24,x1 mul x1,x9,x4 adc x25,x25,x3 mul x3,x10,x4 ldr x17,[x2,8*4] adds x19,x19,x26 umulh x26,x5,x4 adcs x20,x20,x27 umulh x27,x6,x4 adcs x21,x21,x28 umulh x28,x7,x4 adcs x22,x22,x0 umulh x0,x8,x4 adcs x23,x23,x1 umulh x1,x9,x4 adcs x24,x24,x3 umulh x3,x10,x4 adc x25,x25,xzr ldr x4,[x29,#12*__SIZEOF_POINTER__] adds x19,x20,x26 mul x26,x11,x17 adcs x20,x21,x27 mul x27,x12,x17 adcs x21,x22,x28 mul x28,x13,x17 adcs x22,x23,x0 mul x0,x14,x17 adcs x23,x24,x1 mul x1,x15,x17 adcs x24,x25,x3 mul x3,x16,x17 adc x25,xzr,xzr adds x19,x19,x26 umulh x26,x11,x17 adcs x20,x20,x27 umulh x27,x12,x17 adcs x21,x21,x28 mul x4,x4,x19 umulh x28,x13,x17 adcs x22,x22,x0 umulh x0,x14,x17 adcs x23,x23,x1 umulh x1,x15,x17 adcs x24,x24,x3 umulh x3,x16,x17 adc x25,x25,xzr adds x20,x20,x26 mul x26,x5,x4 adcs x21,x21,x27 mul x27,x6,x4 adcs x22,x22,x28 mul x28,x7,x4 adcs x23,x23,x0 mul x0,x8,x4 adcs x24,x24,x1 mul x1,x9,x4 adc x25,x25,x3 mul x3,x10,x4 ldr x17,[x2,8*5] adds x19,x19,x26 umulh x26,x5,x4 adcs x20,x20,x27 umulh x27,x6,x4 adcs x21,x21,x28 umulh x28,x7,x4 adcs x22,x22,x0 umulh x0,x8,x4 adcs x23,x23,x1 umulh x1,x9,x4 adcs x24,x24,x3 umulh x3,x10,x4 adc x25,x25,xzr ldr x4,[x29,#12*__SIZEOF_POINTER__] adds x19,x20,x26 mul x26,x11,x17 adcs x20,x21,x27 mul x27,x12,x17 adcs x21,x22,x28 mul x28,x13,x17 adcs x22,x23,x0 mul x0,x14,x17 adcs x23,x24,x1 mul x1,x15,x17 adcs x24,x25,x3 mul x3,x16,x17 adc x25,xzr,xzr adds x19,x19,x26 umulh x26,x11,x17 adcs x20,x20,x27 umulh x27,x12,x17 adcs x21,x21,x28 mul x4,x4,x19 umulh x28,x13,x17 adcs x22,x22,x0 umulh x0,x14,x17 adcs x23,x23,x1 umulh x1,x15,x17 adcs x24,x24,x3 umulh x3,x16,x17 adc x25,x25,xzr adds x20,x20,x26 mul x26,x5,x4 adcs x21,x21,x27 mul x27,x6,x4 adcs 
x22,x22,x28 mul x28,x7,x4 adcs x23,x23,x0 mul x0,x8,x4 adcs x24,x24,x1 mul x1,x9,x4 adc x25,x25,x3 mul x3,x10,x4 adds x19,x19,x26 umulh x26,x5,x4 adcs x20,x20,x27 umulh x27,x6,x4 adcs x21,x21,x28 umulh x28,x7,x4 adcs x22,x22,x0 umulh x0,x8,x4 adcs x23,x23,x1 umulh x1,x9,x4 adcs x24,x24,x3 umulh x3,x10,x4 adc x25,x25,xzr ldp x4,x2,[x29,#12*__SIZEOF_POINTER__] // pull r_ptr adds x11,x20,x26 adcs x12,x21,x27 adcs x13,x22,x28 adcs x14,x23,x0 adcs x15,x24,x1 adcs x16,x25,x3 ret .globl sgn0_pty_mont_384 .def sgn0_pty_mont_384; .type 32; .endef .p2align 5 sgn0_pty_mont_384: hint #25 stp x29,x30,[sp,#-16*__SIZEOF_POINTER__]! add x29,sp,#0 stp x19,x20,[sp,#2*__SIZEOF_POINTER__] stp x21,x22,[sp,#4*__SIZEOF_POINTER__] stp x23,x24,[sp,#6*__SIZEOF_POINTER__] stp x25,x26,[sp,#8*__SIZEOF_POINTER__] stp x27,x28,[sp,#10*__SIZEOF_POINTER__] mov x4,x2 ldp x5,x6,[x1] ldp x7,x8,[x1,#16] ldp x9,x10,[x1,#32] mov x1,x0 bl __mul_by_1_mont_384 ldr x30,[x29,#__SIZEOF_POINTER__] and x0,x11,#1 adds x11,x11,x11 adcs x12,x12,x12 adcs x13,x13,x13 adcs x14,x14,x14 adcs x15,x15,x15 adcs x16,x16,x16 adc x17,xzr,xzr subs x11,x11,x5 sbcs x12,x12,x6 sbcs x13,x13,x7 sbcs x14,x14,x8 sbcs x15,x15,x9 sbcs x16,x16,x10 sbc x17,x17,xzr mvn x17,x17 and x17,x17,#2 orr x0,x0,x17 ldp x19,x20,[x29,#2*__SIZEOF_POINTER__] ldp x21,x22,[x29,#4*__SIZEOF_POINTER__] ldp x23,x24,[x29,#6*__SIZEOF_POINTER__] ldp x25,x26,[x29,#8*__SIZEOF_POINTER__] ldp x27,x28,[x29,#10*__SIZEOF_POINTER__] ldr x29,[sp],#16*__SIZEOF_POINTER__ hint #29 ret .globl sgn0_pty_mont_384x .def sgn0_pty_mont_384x; .type 32; .endef .p2align 5 sgn0_pty_mont_384x: hint #25 stp x29,x30,[sp,#-16*__SIZEOF_POINTER__]! add x29,sp,#0 stp x19,x20,[sp,#2*__SIZEOF_POINTER__] stp x21,x22,[sp,#4*__SIZEOF_POINTER__] stp x23,x24,[sp,#6*__SIZEOF_POINTER__] stp x25,x26,[sp,#8*__SIZEOF_POINTER__] stp x27,x28,[sp,#10*__SIZEOF_POINTER__] mov x4,x2 ldp x5,x6,[x1] ldp x7,x8,[x1,#16] ldp x9,x10,[x1,#32] mov x1,x0 bl __mul_by_1_mont_384 add x1,x1,#48 and x2,x11,#1 orr x3,x11,x12 adds x11,x11,x11 orr x3,x3,x13 adcs x12,x12,x12 orr x3,x3,x14 adcs x13,x13,x13 orr x3,x3,x15 adcs x14,x14,x14 orr x3,x3,x16 adcs x15,x15,x15 adcs x16,x16,x16 adc x17,xzr,xzr subs x11,x11,x5 sbcs x12,x12,x6 sbcs x13,x13,x7 sbcs x14,x14,x8 sbcs x15,x15,x9 sbcs x16,x16,x10 sbc x17,x17,xzr mvn x17,x17 and x17,x17,#2 orr x2,x2,x17 bl __mul_by_1_mont_384 ldr x30,[x29,#__SIZEOF_POINTER__] and x0,x11,#1 orr x1,x11,x12 adds x11,x11,x11 orr x1,x1,x13 adcs x12,x12,x12 orr x1,x1,x14 adcs x13,x13,x13 orr x1,x1,x15 adcs x14,x14,x14 orr x1,x1,x16 adcs x15,x15,x15 adcs x16,x16,x16 adc x17,xzr,xzr subs x11,x11,x5 sbcs x12,x12,x6 sbcs x13,x13,x7 sbcs x14,x14,x8 sbcs x15,x15,x9 sbcs x16,x16,x10 sbc x17,x17,xzr mvn x17,x17 and x17,x17,#2 orr x0,x0,x17 cmp x3,#0 csel x3,x0,x2,eq // a->re==0? prty(a->im) : prty(a->re) cmp x1,#0 csel x1,x0,x2,ne // a->im!=0? 
sgn0(a->im) : sgn0(a->re) and x3,x3,#1 and x1,x1,#2 orr x0,x1,x3 // pack sign and parity ldp x19,x20,[x29,#2*__SIZEOF_POINTER__] ldp x21,x22,[x29,#4*__SIZEOF_POINTER__] ldp x23,x24,[x29,#6*__SIZEOF_POINTER__] ldp x25,x26,[x29,#8*__SIZEOF_POINTER__] ldp x27,x28,[x29,#10*__SIZEOF_POINTER__] ldr x29,[sp],#16*__SIZEOF_POINTER__ hint #29 ret ================================================ FILE: build/coff/mulq_mont_256-x86_64.s ================================================ .comm __blst_platform_cap,4 .text .globl mul_mont_sparse_256 .def mul_mont_sparse_256; .scl 2; .type 32; .endef .p2align 5 mul_mont_sparse_256: .byte 0xf3,0x0f,0x1e,0xfa movq %rdi,8(%rsp) movq %rsi,16(%rsp) movq %rsp,%r11 .LSEH_begin_mul_mont_sparse_256: movq %rcx,%rdi movq %rdx,%rsi movq %r8,%rdx movq %r9,%rcx movq 40(%rsp),%r8 #ifdef __BLST_PORTABLE__ testl $1,__blst_platform_cap(%rip) jnz mul_mont_sparse_256$1 #endif pushq %rbp pushq %rbx pushq %r12 pushq %r13 pushq %r14 pushq %r15 pushq %rdi .LSEH_body_mul_mont_sparse_256: movq 0(%rdx),%rax movq 0(%rsi),%r13 movq 8(%rsi),%r14 movq 16(%rsi),%r12 movq 24(%rsi),%rbp movq %rdx,%rbx movq %rax,%r15 mulq %r13 movq %rax,%r9 movq %r15,%rax movq %rdx,%r10 call __mulq_mont_sparse_256 movq 8(%rsp),%r15 movq 16(%rsp),%r14 movq 24(%rsp),%r13 movq 32(%rsp),%r12 movq 40(%rsp),%rbx movq 48(%rsp),%rbp leaq 56(%rsp),%rsp .LSEH_epilogue_mul_mont_sparse_256: mov 8(%rsp),%rdi mov 16(%rsp),%rsi #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .LSEH_end_mul_mont_sparse_256: .globl sqr_mont_sparse_256 .def sqr_mont_sparse_256; .scl 2; .type 32; .endef .p2align 5 sqr_mont_sparse_256: .byte 0xf3,0x0f,0x1e,0xfa movq %rdi,8(%rsp) movq %rsi,16(%rsp) movq %rsp,%r11 .LSEH_begin_sqr_mont_sparse_256: movq %rcx,%rdi movq %rdx,%rsi movq %r8,%rdx movq %r9,%rcx #ifdef __BLST_PORTABLE__ testl $1,__blst_platform_cap(%rip) jnz sqr_mont_sparse_256$1 #endif pushq %rbp pushq %rbx pushq %r12 pushq %r13 pushq %r14 pushq %r15 pushq %rdi .LSEH_body_sqr_mont_sparse_256: movq 0(%rsi),%rax movq %rcx,%r8 movq 8(%rsi),%r14 movq %rdx,%rcx movq 16(%rsi),%r12 leaq (%rsi),%rbx movq 24(%rsi),%rbp movq %rax,%r15 mulq %rax movq %rax,%r9 movq %r15,%rax movq %rdx,%r10 call __mulq_mont_sparse_256 movq 8(%rsp),%r15 movq 16(%rsp),%r14 movq 24(%rsp),%r13 movq 32(%rsp),%r12 movq 40(%rsp),%rbx movq 48(%rsp),%rbp leaq 56(%rsp),%rsp .LSEH_epilogue_sqr_mont_sparse_256: mov 8(%rsp),%rdi mov 16(%rsp),%rsi #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .LSEH_end_sqr_mont_sparse_256: .def __mulq_mont_sparse_256; .scl 3; .type 32; .endef .p2align 5 __mulq_mont_sparse_256: .byte 0xf3,0x0f,0x1e,0xfa mulq %r14 addq %rax,%r10 movq %r15,%rax adcq $0,%rdx movq %rdx,%r11 mulq %r12 addq %rax,%r11 movq %r15,%rax adcq $0,%rdx movq %rdx,%r12 mulq %rbp addq %rax,%r12 movq 8(%rbx),%rax adcq $0,%rdx xorq %r14,%r14 movq %rdx,%r13 movq %r9,%rdi imulq %r8,%r9 movq %rax,%r15 mulq 0(%rsi) addq %rax,%r10 movq %r15,%rax adcq $0,%rdx movq %rdx,%rbp mulq 8(%rsi) addq %rax,%r11 movq %r15,%rax adcq $0,%rdx addq %rbp,%r11 adcq $0,%rdx movq %rdx,%rbp mulq 16(%rsi) addq %rax,%r12 movq %r15,%rax adcq $0,%rdx addq %rbp,%r12 adcq $0,%rdx movq %rdx,%rbp mulq 24(%rsi) addq %rax,%r13 movq %r9,%rax adcq $0,%rdx addq %rbp,%r13 adcq %rdx,%r14 xorq %r15,%r15 mulq 0(%rcx) addq %rax,%rdi movq %r9,%rax adcq %rdx,%rdi mulq 8(%rcx) addq %rax,%r10 movq %r9,%rax adcq $0,%rdx addq %rdi,%r10 adcq $0,%rdx movq %rdx,%rbp mulq 16(%rcx) addq %rax,%r11 movq %r9,%rax adcq $0,%rdx addq %rbp,%r11 adcq $0,%rdx movq 
%rdx,%rbp mulq 24(%rcx) addq %rax,%r12 movq 16(%rbx),%rax adcq $0,%rdx addq %rbp,%r12 adcq $0,%rdx addq %rdx,%r13 adcq $0,%r14 adcq $0,%r15 movq %r10,%rdi imulq %r8,%r10 movq %rax,%r9 mulq 0(%rsi) addq %rax,%r11 movq %r9,%rax adcq $0,%rdx movq %rdx,%rbp mulq 8(%rsi) addq %rax,%r12 movq %r9,%rax adcq $0,%rdx addq %rbp,%r12 adcq $0,%rdx movq %rdx,%rbp mulq 16(%rsi) addq %rax,%r13 movq %r9,%rax adcq $0,%rdx addq %rbp,%r13 adcq $0,%rdx movq %rdx,%rbp mulq 24(%rsi) addq %rax,%r14 movq %r10,%rax adcq $0,%rdx addq %rbp,%r14 adcq %rdx,%r15 xorq %r9,%r9 mulq 0(%rcx) addq %rax,%rdi movq %r10,%rax adcq %rdx,%rdi mulq 8(%rcx) addq %rax,%r11 movq %r10,%rax adcq $0,%rdx addq %rdi,%r11 adcq $0,%rdx movq %rdx,%rbp mulq 16(%rcx) addq %rax,%r12 movq %r10,%rax adcq $0,%rdx addq %rbp,%r12 adcq $0,%rdx movq %rdx,%rbp mulq 24(%rcx) addq %rax,%r13 movq 24(%rbx),%rax adcq $0,%rdx addq %rbp,%r13 adcq $0,%rdx addq %rdx,%r14 adcq $0,%r15 adcq $0,%r9 movq %r11,%rdi imulq %r8,%r11 movq %rax,%r10 mulq 0(%rsi) addq %rax,%r12 movq %r10,%rax adcq $0,%rdx movq %rdx,%rbp mulq 8(%rsi) addq %rax,%r13 movq %r10,%rax adcq $0,%rdx addq %rbp,%r13 adcq $0,%rdx movq %rdx,%rbp mulq 16(%rsi) addq %rax,%r14 movq %r10,%rax adcq $0,%rdx addq %rbp,%r14 adcq $0,%rdx movq %rdx,%rbp mulq 24(%rsi) addq %rax,%r15 movq %r11,%rax adcq $0,%rdx addq %rbp,%r15 adcq %rdx,%r9 xorq %r10,%r10 mulq 0(%rcx) addq %rax,%rdi movq %r11,%rax adcq %rdx,%rdi mulq 8(%rcx) addq %rax,%r12 movq %r11,%rax adcq $0,%rdx addq %rdi,%r12 adcq $0,%rdx movq %rdx,%rbp mulq 16(%rcx) addq %rax,%r13 movq %r11,%rax adcq $0,%rdx addq %rbp,%r13 adcq $0,%rdx movq %rdx,%rbp mulq 24(%rcx) addq %rax,%r14 movq %r12,%rax adcq $0,%rdx addq %rbp,%r14 adcq $0,%rdx addq %rdx,%r15 adcq $0,%r9 adcq $0,%r10 imulq %r8,%rax movq 8(%rsp),%rsi movq %rax,%r11 mulq 0(%rcx) addq %rax,%r12 movq %r11,%rax adcq %rdx,%r12 mulq 8(%rcx) addq %rax,%r13 movq %r11,%rax adcq $0,%rdx addq %r12,%r13 adcq $0,%rdx movq %rdx,%rbp mulq 16(%rcx) addq %rax,%r14 movq %r11,%rax adcq $0,%rdx addq %rbp,%r14 adcq $0,%rdx movq %rdx,%rbp mulq 24(%rcx) movq %r14,%rbx addq %rbp,%r15 adcq $0,%rdx addq %rax,%r15 movq %r13,%rax adcq $0,%rdx addq %rdx,%r9 adcq $0,%r10 movq %r15,%r12 subq 0(%rcx),%r13 sbbq 8(%rcx),%r14 sbbq 16(%rcx),%r15 movq %r9,%rbp sbbq 24(%rcx),%r9 sbbq $0,%r10 cmovcq %rax,%r13 cmovcq %rbx,%r14 cmovcq %r12,%r15 movq %r13,0(%rsi) cmovcq %rbp,%r9 movq %r14,8(%rsi) movq %r15,16(%rsi) movq %r9,24(%rsi) #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .globl from_mont_256 .def from_mont_256; .scl 2; .type 32; .endef .p2align 5 from_mont_256: .byte 0xf3,0x0f,0x1e,0xfa movq %rdi,8(%rsp) movq %rsi,16(%rsp) movq %rsp,%r11 .LSEH_begin_from_mont_256: movq %rcx,%rdi movq %rdx,%rsi movq %r8,%rdx movq %r9,%rcx #ifdef __BLST_PORTABLE__ testl $1,__blst_platform_cap(%rip) jnz from_mont_256$1 #endif pushq %rbp pushq %rbx pushq %r12 pushq %r13 pushq %r14 pushq %r15 subq $8,%rsp .LSEH_body_from_mont_256: movq %rdx,%rbx call __mulq_by_1_mont_256 movq %r14,%r10 movq %r15,%r11 movq %r9,%r12 subq 0(%rbx),%r13 sbbq 8(%rbx),%r14 sbbq 16(%rbx),%r15 sbbq 24(%rbx),%r9 cmovncq %r13,%rax cmovncq %r14,%r10 cmovncq %r15,%r11 movq %rax,0(%rdi) cmovncq %r9,%r12 movq %r10,8(%rdi) movq %r11,16(%rdi) movq %r12,24(%rdi) movq 8(%rsp),%r15 movq 16(%rsp),%r14 movq 24(%rsp),%r13 movq 32(%rsp),%r12 movq 40(%rsp),%rbx movq 48(%rsp),%rbp leaq 56(%rsp),%rsp .LSEH_epilogue_from_mont_256: mov 8(%rsp),%rdi mov 16(%rsp),%rsi #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif 
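/*
 * NOTE (added commentary, not part of the upstream source; a hedged
 * summary): from_mont_256 above converts out of the Montgomery domain,
 * i.e. it returns roughly a * R^-1 mod p, with R = 2^256 and the
 * precomputed n0 = -p^-1 mod 2^64 in %rcx. __mulq_by_1_mont_256 performs
 * the reduction one word per round: it multiplies the running low word by
 * n0 (imulq %rcx) and adds that multiple of the modulus (mulq
 * 0..24(%rbx)) so the low word cancels, effectively shifting the value
 * right by 64 bits each of the four rounds; the subq/sbbq/cmovncq
 * sequence above is the final conditional subtraction of p.
 */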
.LSEH_end_from_mont_256: .globl redc_mont_256 .def redc_mont_256; .scl 2; .type 32; .endef .p2align 5 redc_mont_256: .byte 0xf3,0x0f,0x1e,0xfa movq %rdi,8(%rsp) movq %rsi,16(%rsp) movq %rsp,%r11 .LSEH_begin_redc_mont_256: movq %rcx,%rdi movq %rdx,%rsi movq %r8,%rdx movq %r9,%rcx #ifdef __BLST_PORTABLE__ testl $1,__blst_platform_cap(%rip) jnz redc_mont_256$1 #endif pushq %rbp pushq %rbx pushq %r12 pushq %r13 pushq %r14 pushq %r15 subq $8,%rsp .LSEH_body_redc_mont_256: movq %rdx,%rbx call __mulq_by_1_mont_256 addq 32(%rsi),%r13 adcq 40(%rsi),%r14 movq %r13,%rax adcq 48(%rsi),%r15 movq %r14,%r10 adcq 56(%rsi),%r9 sbbq %rsi,%rsi movq %r15,%r11 subq 0(%rbx),%r13 sbbq 8(%rbx),%r14 sbbq 16(%rbx),%r15 movq %r9,%r12 sbbq 24(%rbx),%r9 sbbq $0,%rsi cmovncq %r13,%rax cmovncq %r14,%r10 cmovncq %r15,%r11 movq %rax,0(%rdi) cmovncq %r9,%r12 movq %r10,8(%rdi) movq %r11,16(%rdi) movq %r12,24(%rdi) movq 8(%rsp),%r15 movq 16(%rsp),%r14 movq 24(%rsp),%r13 movq 32(%rsp),%r12 movq 40(%rsp),%rbx movq 48(%rsp),%rbp leaq 56(%rsp),%rsp .LSEH_epilogue_redc_mont_256: mov 8(%rsp),%rdi mov 16(%rsp),%rsi #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .LSEH_end_redc_mont_256: .def __mulq_by_1_mont_256; .scl 3; .type 32; .endef .p2align 5 __mulq_by_1_mont_256: .byte 0xf3,0x0f,0x1e,0xfa movq 0(%rsi),%rax movq 8(%rsi),%r10 movq 16(%rsi),%r11 movq 24(%rsi),%r12 movq %rax,%r13 imulq %rcx,%rax movq %rax,%r9 mulq 0(%rbx) addq %rax,%r13 movq %r9,%rax adcq %rdx,%r13 mulq 8(%rbx) addq %rax,%r10 movq %r9,%rax adcq $0,%rdx addq %r13,%r10 adcq $0,%rdx movq %rdx,%r13 mulq 16(%rbx) movq %r10,%r14 imulq %rcx,%r10 addq %rax,%r11 movq %r9,%rax adcq $0,%rdx addq %r13,%r11 adcq $0,%rdx movq %rdx,%r13 mulq 24(%rbx) addq %rax,%r12 movq %r10,%rax adcq $0,%rdx addq %r13,%r12 adcq $0,%rdx movq %rdx,%r13 mulq 0(%rbx) addq %rax,%r14 movq %r10,%rax adcq %rdx,%r14 mulq 8(%rbx) addq %rax,%r11 movq %r10,%rax adcq $0,%rdx addq %r14,%r11 adcq $0,%rdx movq %rdx,%r14 mulq 16(%rbx) movq %r11,%r15 imulq %rcx,%r11 addq %rax,%r12 movq %r10,%rax adcq $0,%rdx addq %r14,%r12 adcq $0,%rdx movq %rdx,%r14 mulq 24(%rbx) addq %rax,%r13 movq %r11,%rax adcq $0,%rdx addq %r14,%r13 adcq $0,%rdx movq %rdx,%r14 mulq 0(%rbx) addq %rax,%r15 movq %r11,%rax adcq %rdx,%r15 mulq 8(%rbx) addq %rax,%r12 movq %r11,%rax adcq $0,%rdx addq %r15,%r12 adcq $0,%rdx movq %rdx,%r15 mulq 16(%rbx) movq %r12,%r9 imulq %rcx,%r12 addq %rax,%r13 movq %r11,%rax adcq $0,%rdx addq %r15,%r13 adcq $0,%rdx movq %rdx,%r15 mulq 24(%rbx) addq %rax,%r14 movq %r12,%rax adcq $0,%rdx addq %r15,%r14 adcq $0,%rdx movq %rdx,%r15 mulq 0(%rbx) addq %rax,%r9 movq %r12,%rax adcq %rdx,%r9 mulq 8(%rbx) addq %rax,%r13 movq %r12,%rax adcq $0,%rdx addq %r9,%r13 adcq $0,%rdx movq %rdx,%r9 mulq 16(%rbx) addq %rax,%r14 movq %r12,%rax adcq $0,%rdx addq %r9,%r14 adcq $0,%rdx movq %rdx,%r9 mulq 24(%rbx) addq %rax,%r15 movq %r13,%rax adcq $0,%rdx addq %r9,%r15 adcq $0,%rdx movq %rdx,%r9 #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .section .pdata .p2align 2 .rva .LSEH_begin_mul_mont_sparse_256 .rva .LSEH_body_mul_mont_sparse_256 .rva .LSEH_info_mul_mont_sparse_256_prologue .rva .LSEH_body_mul_mont_sparse_256 .rva .LSEH_epilogue_mul_mont_sparse_256 .rva .LSEH_info_mul_mont_sparse_256_body .rva .LSEH_epilogue_mul_mont_sparse_256 .rva .LSEH_end_mul_mont_sparse_256 .rva .LSEH_info_mul_mont_sparse_256_epilogue .rva .LSEH_begin_sqr_mont_sparse_256 .rva .LSEH_body_sqr_mont_sparse_256 .rva .LSEH_info_sqr_mont_sparse_256_prologue .rva 
.LSEH_body_sqr_mont_sparse_256 .rva .LSEH_epilogue_sqr_mont_sparse_256 .rva .LSEH_info_sqr_mont_sparse_256_body .rva .LSEH_epilogue_sqr_mont_sparse_256 .rva .LSEH_end_sqr_mont_sparse_256 .rva .LSEH_info_sqr_mont_sparse_256_epilogue .rva .LSEH_begin_from_mont_256 .rva .LSEH_body_from_mont_256 .rva .LSEH_info_from_mont_256_prologue .rva .LSEH_body_from_mont_256 .rva .LSEH_epilogue_from_mont_256 .rva .LSEH_info_from_mont_256_body .rva .LSEH_epilogue_from_mont_256 .rva .LSEH_end_from_mont_256 .rva .LSEH_info_from_mont_256_epilogue .rva .LSEH_begin_redc_mont_256 .rva .LSEH_body_redc_mont_256 .rva .LSEH_info_redc_mont_256_prologue .rva .LSEH_body_redc_mont_256 .rva .LSEH_epilogue_redc_mont_256 .rva .LSEH_info_redc_mont_256_body .rva .LSEH_epilogue_redc_mont_256 .rva .LSEH_end_redc_mont_256 .rva .LSEH_info_redc_mont_256_epilogue .section .xdata .p2align 3 .LSEH_info_mul_mont_sparse_256_prologue: .byte 1,0,5,0x0b .byte 0,0x74,1,0 .byte 0,0x64,2,0 .byte 0,0xb3 .byte 0,0 .long 0,0 .LSEH_info_mul_mont_sparse_256_body: .byte 1,0,17,0 .byte 0x00,0xf4,0x01,0x00 .byte 0x00,0xe4,0x02,0x00 .byte 0x00,0xd4,0x03,0x00 .byte 0x00,0xc4,0x04,0x00 .byte 0x00,0x34,0x05,0x00 .byte 0x00,0x54,0x06,0x00 .byte 0x00,0x74,0x08,0x00 .byte 0x00,0x64,0x09,0x00 .byte 0x00,0x62 .byte 0x00,0x00,0x00,0x00,0x00,0x00 .byte 0x00,0x00,0x00,0x00 .LSEH_info_mul_mont_sparse_256_epilogue: .byte 1,0,4,0 .byte 0x00,0x74,0x01,0x00 .byte 0x00,0x64,0x02,0x00 .byte 0x00,0x00,0x00,0x00 .LSEH_info_sqr_mont_sparse_256_prologue: .byte 1,0,5,0x0b .byte 0,0x74,1,0 .byte 0,0x64,2,0 .byte 0,0xb3 .byte 0,0 .long 0,0 .LSEH_info_sqr_mont_sparse_256_body: .byte 1,0,17,0 .byte 0x00,0xf4,0x01,0x00 .byte 0x00,0xe4,0x02,0x00 .byte 0x00,0xd4,0x03,0x00 .byte 0x00,0xc4,0x04,0x00 .byte 0x00,0x34,0x05,0x00 .byte 0x00,0x54,0x06,0x00 .byte 0x00,0x74,0x08,0x00 .byte 0x00,0x64,0x09,0x00 .byte 0x00,0x62 .byte 0x00,0x00,0x00,0x00,0x00,0x00 .byte 0x00,0x00,0x00,0x00 .LSEH_info_sqr_mont_sparse_256_epilogue: .byte 1,0,4,0 .byte 0x00,0x74,0x01,0x00 .byte 0x00,0x64,0x02,0x00 .byte 0x00,0x00,0x00,0x00 .LSEH_info_from_mont_256_prologue: .byte 1,0,5,0x0b .byte 0,0x74,1,0 .byte 0,0x64,2,0 .byte 0,0xb3 .byte 0,0 .long 0,0 .LSEH_info_from_mont_256_body: .byte 1,0,17,0 .byte 0x00,0xf4,0x01,0x00 .byte 0x00,0xe4,0x02,0x00 .byte 0x00,0xd4,0x03,0x00 .byte 0x00,0xc4,0x04,0x00 .byte 0x00,0x34,0x05,0x00 .byte 0x00,0x54,0x06,0x00 .byte 0x00,0x74,0x08,0x00 .byte 0x00,0x64,0x09,0x00 .byte 0x00,0x62 .byte 0x00,0x00,0x00,0x00,0x00,0x00 .byte 0x00,0x00,0x00,0x00 .LSEH_info_from_mont_256_epilogue: .byte 1,0,4,0 .byte 0x00,0x74,0x01,0x00 .byte 0x00,0x64,0x02,0x00 .byte 0x00,0x00,0x00,0x00 .LSEH_info_redc_mont_256_prologue: .byte 1,0,5,0x0b .byte 0,0x74,1,0 .byte 0,0x64,2,0 .byte 0,0xb3 .byte 0,0 .long 0,0 .LSEH_info_redc_mont_256_body: .byte 1,0,17,0 .byte 0x00,0xf4,0x01,0x00 .byte 0x00,0xe4,0x02,0x00 .byte 0x00,0xd4,0x03,0x00 .byte 0x00,0xc4,0x04,0x00 .byte 0x00,0x34,0x05,0x00 .byte 0x00,0x54,0x06,0x00 .byte 0x00,0x74,0x08,0x00 .byte 0x00,0x64,0x09,0x00 .byte 0x00,0x62 .byte 0x00,0x00,0x00,0x00,0x00,0x00 .byte 0x00,0x00,0x00,0x00 .LSEH_info_redc_mont_256_epilogue: .byte 1,0,4,0 .byte 0x00,0x74,0x01,0x00 .byte 0x00,0x64,0x02,0x00 .byte 0x00,0x00,0x00,0x00 ================================================ FILE: build/coff/mulq_mont_384-x86_64.s ================================================ .comm __blst_platform_cap,4 .text .def __subq_mod_384x384; .scl 3; .type 32; .endef .p2align 5 __subq_mod_384x384: .byte 0xf3,0x0f,0x1e,0xfa movq 0(%rsi),%r8 movq 8(%rsi),%r9 movq 16(%rsi),%r10 movq 
24(%rsi),%r11 movq 32(%rsi),%r12 movq 40(%rsi),%r13 movq 48(%rsi),%r14 subq 0(%rdx),%r8 movq 56(%rsi),%r15 sbbq 8(%rdx),%r9 movq 64(%rsi),%rax sbbq 16(%rdx),%r10 movq 72(%rsi),%rbx sbbq 24(%rdx),%r11 movq 80(%rsi),%rbp sbbq 32(%rdx),%r12 movq 88(%rsi),%rsi sbbq 40(%rdx),%r13 movq %r8,0(%rdi) sbbq 48(%rdx),%r14 movq 0(%rcx),%r8 movq %r9,8(%rdi) sbbq 56(%rdx),%r15 movq 8(%rcx),%r9 movq %r10,16(%rdi) sbbq 64(%rdx),%rax movq 16(%rcx),%r10 movq %r11,24(%rdi) sbbq 72(%rdx),%rbx movq 24(%rcx),%r11 movq %r12,32(%rdi) sbbq 80(%rdx),%rbp movq 32(%rcx),%r12 movq %r13,40(%rdi) sbbq 88(%rdx),%rsi movq 40(%rcx),%r13 sbbq %rdx,%rdx andq %rdx,%r8 andq %rdx,%r9 andq %rdx,%r10 andq %rdx,%r11 andq %rdx,%r12 andq %rdx,%r13 addq %r8,%r14 adcq %r9,%r15 movq %r14,48(%rdi) adcq %r10,%rax movq %r15,56(%rdi) adcq %r11,%rbx movq %rax,64(%rdi) adcq %r12,%rbp movq %rbx,72(%rdi) adcq %r13,%rsi movq %rbp,80(%rdi) movq %rsi,88(%rdi) #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .def __addq_mod_384; .scl 3; .type 32; .endef .p2align 5 __addq_mod_384: .byte 0xf3,0x0f,0x1e,0xfa movq 0(%rsi),%r8 movq 8(%rsi),%r9 movq 16(%rsi),%r10 movq 24(%rsi),%r11 movq 32(%rsi),%r12 movq 40(%rsi),%r13 addq 0(%rdx),%r8 adcq 8(%rdx),%r9 adcq 16(%rdx),%r10 movq %r8,%r14 adcq 24(%rdx),%r11 movq %r9,%r15 adcq 32(%rdx),%r12 movq %r10,%rax adcq 40(%rdx),%r13 movq %r11,%rbx sbbq %rdx,%rdx subq 0(%rcx),%r8 sbbq 8(%rcx),%r9 movq %r12,%rbp sbbq 16(%rcx),%r10 sbbq 24(%rcx),%r11 sbbq 32(%rcx),%r12 movq %r13,%rsi sbbq 40(%rcx),%r13 sbbq $0,%rdx cmovcq %r14,%r8 cmovcq %r15,%r9 cmovcq %rax,%r10 movq %r8,0(%rdi) cmovcq %rbx,%r11 movq %r9,8(%rdi) cmovcq %rbp,%r12 movq %r10,16(%rdi) cmovcq %rsi,%r13 movq %r11,24(%rdi) movq %r12,32(%rdi) movq %r13,40(%rdi) #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .def __subq_mod_384; .scl 3; .type 32; .endef .p2align 5 __subq_mod_384: .byte 0xf3,0x0f,0x1e,0xfa movq 0(%rsi),%r8 movq 8(%rsi),%r9 movq 16(%rsi),%r10 movq 24(%rsi),%r11 movq 32(%rsi),%r12 movq 40(%rsi),%r13 __subq_mod_384_a_is_loaded: subq 0(%rdx),%r8 movq 0(%rcx),%r14 sbbq 8(%rdx),%r9 movq 8(%rcx),%r15 sbbq 16(%rdx),%r10 movq 16(%rcx),%rax sbbq 24(%rdx),%r11 movq 24(%rcx),%rbx sbbq 32(%rdx),%r12 movq 32(%rcx),%rbp sbbq 40(%rdx),%r13 movq 40(%rcx),%rsi sbbq %rdx,%rdx andq %rdx,%r14 andq %rdx,%r15 andq %rdx,%rax andq %rdx,%rbx andq %rdx,%rbp andq %rdx,%rsi addq %r14,%r8 adcq %r15,%r9 movq %r8,0(%rdi) adcq %rax,%r10 movq %r9,8(%rdi) adcq %rbx,%r11 movq %r10,16(%rdi) adcq %rbp,%r12 movq %r11,24(%rdi) adcq %rsi,%r13 movq %r12,32(%rdi) movq %r13,40(%rdi) #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .globl mul_mont_384x .def mul_mont_384x; .scl 2; .type 32; .endef .p2align 5 mul_mont_384x: .byte 0xf3,0x0f,0x1e,0xfa movq %rdi,8(%rsp) movq %rsi,16(%rsp) movq %rsp,%r11 .LSEH_begin_mul_mont_384x: movq %rcx,%rdi movq %rdx,%rsi movq %r8,%rdx movq %r9,%rcx movq 40(%rsp),%r8 #ifdef __BLST_PORTABLE__ testl $1,__blst_platform_cap(%rip) jnz mul_mont_384x$1 #endif pushq %rbp pushq %rbx pushq %r12 pushq %r13 pushq %r14 pushq %r15 subq $328,%rsp .LSEH_body_mul_mont_384x: movq %rdx,%rbx movq %rdi,32(%rsp) movq %rsi,24(%rsp) movq %rdx,16(%rsp) movq %rcx,8(%rsp) movq %r8,0(%rsp) leaq 40(%rsp),%rdi call __mulq_384 leaq 48(%rbx),%rbx leaq 48(%rsi),%rsi leaq 40+96(%rsp),%rdi call __mulq_384 movq 8(%rsp),%rcx leaq -48(%rsi),%rdx leaq 40+192+48(%rsp),%rdi call __addq_mod_384 movq 16(%rsp),%rsi leaq 48(%rsi),%rdx leaq -48(%rdi),%rdi call __addq_mod_384 leaq 
(%rdi),%rbx leaq 48(%rdi),%rsi call __mulq_384 leaq (%rdi),%rsi leaq 40(%rsp),%rdx movq 8(%rsp),%rcx call __subq_mod_384x384 leaq (%rdi),%rsi leaq -96(%rdi),%rdx call __subq_mod_384x384 leaq 40(%rsp),%rsi leaq 40+96(%rsp),%rdx leaq 40(%rsp),%rdi call __subq_mod_384x384 movq %rcx,%rbx leaq 40(%rsp),%rsi movq 0(%rsp),%rcx movq 32(%rsp),%rdi call __mulq_by_1_mont_384 call __redq_tail_mont_384 leaq 40+192(%rsp),%rsi movq 0(%rsp),%rcx leaq 48(%rdi),%rdi call __mulq_by_1_mont_384 call __redq_tail_mont_384 leaq 328(%rsp),%r8 movq 0(%r8),%r15 movq 8(%r8),%r14 movq 16(%r8),%r13 movq 24(%r8),%r12 movq 32(%r8),%rbx movq 40(%r8),%rbp leaq 48(%r8),%rsp .LSEH_epilogue_mul_mont_384x: mov 8(%rsp),%rdi mov 16(%rsp),%rsi #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .LSEH_end_mul_mont_384x: .globl sqr_mont_384x .def sqr_mont_384x; .scl 2; .type 32; .endef .p2align 5 sqr_mont_384x: .byte 0xf3,0x0f,0x1e,0xfa movq %rdi,8(%rsp) movq %rsi,16(%rsp) movq %rsp,%r11 .LSEH_begin_sqr_mont_384x: movq %rcx,%rdi movq %rdx,%rsi movq %r8,%rdx movq %r9,%rcx #ifdef __BLST_PORTABLE__ testl $1,__blst_platform_cap(%rip) jnz sqr_mont_384x$1 #endif pushq %rbp pushq %rbx pushq %r12 pushq %r13 pushq %r14 pushq %r15 subq $136,%rsp .LSEH_body_sqr_mont_384x: movq %rcx,0(%rsp) movq %rdx,%rcx movq %rdi,8(%rsp) movq %rsi,16(%rsp) leaq 48(%rsi),%rdx leaq 32(%rsp),%rdi call __addq_mod_384 movq 16(%rsp),%rsi leaq 48(%rsi),%rdx leaq 32+48(%rsp),%rdi call __subq_mod_384 movq 16(%rsp),%rsi leaq 48(%rsi),%rbx movq 48(%rsi),%rax movq 0(%rsi),%r14 movq 8(%rsi),%r15 movq 16(%rsi),%r12 movq 24(%rsi),%r13 call __mulq_mont_384 addq %r14,%r14 adcq %r15,%r15 adcq %r8,%r8 movq %r14,%r12 adcq %r9,%r9 movq %r15,%r13 adcq %r10,%r10 movq %r8,%rax adcq %r11,%r11 movq %r9,%rbx sbbq %rdx,%rdx subq 0(%rcx),%r14 sbbq 8(%rcx),%r15 movq %r10,%rbp sbbq 16(%rcx),%r8 sbbq 24(%rcx),%r9 sbbq 32(%rcx),%r10 movq %r11,%rsi sbbq 40(%rcx),%r11 sbbq $0,%rdx cmovcq %r12,%r14 cmovcq %r13,%r15 cmovcq %rax,%r8 movq %r14,48(%rdi) cmovcq %rbx,%r9 movq %r15,56(%rdi) cmovcq %rbp,%r10 movq %r8,64(%rdi) cmovcq %rsi,%r11 movq %r9,72(%rdi) movq %r10,80(%rdi) movq %r11,88(%rdi) leaq 32(%rsp),%rsi leaq 32+48(%rsp),%rbx movq 32+48(%rsp),%rax movq 32+0(%rsp),%r14 movq 32+8(%rsp),%r15 movq 32+16(%rsp),%r12 movq 32+24(%rsp),%r13 call __mulq_mont_384 leaq 136(%rsp),%r8 movq 0(%r8),%r15 movq 8(%r8),%r14 movq 16(%r8),%r13 movq 24(%r8),%r12 movq 32(%r8),%rbx movq 40(%r8),%rbp leaq 48(%r8),%rsp .LSEH_epilogue_sqr_mont_384x: mov 8(%rsp),%rdi mov 16(%rsp),%rsi #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .LSEH_end_sqr_mont_384x: .globl mul_382x .def mul_382x; .scl 2; .type 32; .endef .p2align 5 mul_382x: .byte 0xf3,0x0f,0x1e,0xfa movq %rdi,8(%rsp) movq %rsi,16(%rsp) movq %rsp,%r11 .LSEH_begin_mul_382x: movq %rcx,%rdi movq %rdx,%rsi movq %r8,%rdx movq %r9,%rcx #ifdef __BLST_PORTABLE__ testl $1,__blst_platform_cap(%rip) jnz mul_382x$1 #endif pushq %rbp pushq %rbx pushq %r12 pushq %r13 pushq %r14 pushq %r15 subq $136,%rsp .LSEH_body_mul_382x: leaq 96(%rdi),%rdi movq %rsi,0(%rsp) movq %rdx,8(%rsp) movq %rdi,16(%rsp) movq %rcx,24(%rsp) movq 0(%rsi),%r8 movq 8(%rsi),%r9 movq 16(%rsi),%r10 movq 24(%rsi),%r11 movq 32(%rsi),%r12 movq 40(%rsi),%r13 addq 48(%rsi),%r8 adcq 56(%rsi),%r9 adcq 64(%rsi),%r10 adcq 72(%rsi),%r11 adcq 80(%rsi),%r12 adcq 88(%rsi),%r13 movq %r8,32+0(%rsp) movq %r9,32+8(%rsp) movq %r10,32+16(%rsp) movq %r11,32+24(%rsp) movq %r12,32+32(%rsp) movq %r13,32+40(%rsp) movq 0(%rdx),%r8 movq 8(%rdx),%r9 movq 
16(%rdx),%r10 movq 24(%rdx),%r11 movq 32(%rdx),%r12 movq 40(%rdx),%r13 addq 48(%rdx),%r8 adcq 56(%rdx),%r9 adcq 64(%rdx),%r10 adcq 72(%rdx),%r11 adcq 80(%rdx),%r12 adcq 88(%rdx),%r13 movq %r8,32+48(%rsp) movq %r9,32+56(%rsp) movq %r10,32+64(%rsp) movq %r11,32+72(%rsp) movq %r12,32+80(%rsp) movq %r13,32+88(%rsp) leaq 32+0(%rsp),%rsi leaq 32+48(%rsp),%rbx call __mulq_384 movq 0(%rsp),%rsi movq 8(%rsp),%rbx leaq -96(%rdi),%rdi call __mulq_384 leaq 48(%rsi),%rsi leaq 48(%rbx),%rbx leaq 32(%rsp),%rdi call __mulq_384 movq 16(%rsp),%rsi leaq 32(%rsp),%rdx movq 24(%rsp),%rcx movq %rsi,%rdi call __subq_mod_384x384 leaq 0(%rdi),%rsi leaq -96(%rdi),%rdx call __subq_mod_384x384 leaq -96(%rdi),%rsi leaq 32(%rsp),%rdx leaq -96(%rdi),%rdi call __subq_mod_384x384 leaq 136(%rsp),%r8 movq 0(%r8),%r15 movq 8(%r8),%r14 movq 16(%r8),%r13 movq 24(%r8),%r12 movq 32(%r8),%rbx movq 40(%r8),%rbp leaq 48(%r8),%rsp .LSEH_epilogue_mul_382x: mov 8(%rsp),%rdi mov 16(%rsp),%rsi #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .LSEH_end_mul_382x: .globl sqr_382x .def sqr_382x; .scl 2; .type 32; .endef .p2align 5 sqr_382x: .byte 0xf3,0x0f,0x1e,0xfa movq %rdi,8(%rsp) movq %rsi,16(%rsp) movq %rsp,%r11 .LSEH_begin_sqr_382x: movq %rcx,%rdi movq %rdx,%rsi movq %r8,%rdx #ifdef __BLST_PORTABLE__ testl $1,__blst_platform_cap(%rip) jnz sqr_382x$1 #endif pushq %rbp pushq %rbx pushq %r12 pushq %r13 pushq %r14 pushq %r15 pushq %rsi .LSEH_body_sqr_382x: movq %rdx,%rcx movq 0(%rsi),%r14 movq 8(%rsi),%r15 movq 16(%rsi),%rax movq 24(%rsi),%rbx movq 32(%rsi),%rbp movq 40(%rsi),%rdx movq %r14,%r8 addq 48(%rsi),%r14 movq %r15,%r9 adcq 56(%rsi),%r15 movq %rax,%r10 adcq 64(%rsi),%rax movq %rbx,%r11 adcq 72(%rsi),%rbx movq %rbp,%r12 adcq 80(%rsi),%rbp movq %rdx,%r13 adcq 88(%rsi),%rdx movq %r14,0(%rdi) movq %r15,8(%rdi) movq %rax,16(%rdi) movq %rbx,24(%rdi) movq %rbp,32(%rdi) movq %rdx,40(%rdi) leaq 48(%rsi),%rdx leaq 48(%rdi),%rdi call __subq_mod_384_a_is_loaded leaq (%rdi),%rsi leaq -48(%rdi),%rbx leaq -48(%rdi),%rdi call __mulq_384 movq (%rsp),%rsi leaq 48(%rsi),%rbx leaq 96(%rdi),%rdi call __mulq_384 movq 0(%rdi),%r8 movq 8(%rdi),%r9 movq 16(%rdi),%r10 movq 24(%rdi),%r11 movq 32(%rdi),%r12 movq 40(%rdi),%r13 movq 48(%rdi),%r14 movq 56(%rdi),%r15 movq 64(%rdi),%rax movq 72(%rdi),%rbx movq 80(%rdi),%rbp addq %r8,%r8 movq 88(%rdi),%rdx adcq %r9,%r9 movq %r8,0(%rdi) adcq %r10,%r10 movq %r9,8(%rdi) adcq %r11,%r11 movq %r10,16(%rdi) adcq %r12,%r12 movq %r11,24(%rdi) adcq %r13,%r13 movq %r12,32(%rdi) adcq %r14,%r14 movq %r13,40(%rdi) adcq %r15,%r15 movq %r14,48(%rdi) adcq %rax,%rax movq %r15,56(%rdi) adcq %rbx,%rbx movq %rax,64(%rdi) adcq %rbp,%rbp movq %rbx,72(%rdi) adcq %rdx,%rdx movq %rbp,80(%rdi) movq %rdx,88(%rdi) movq 8(%rsp),%r15 movq 16(%rsp),%r14 movq 24(%rsp),%r13 movq 32(%rsp),%r12 movq 40(%rsp),%rbx movq 48(%rsp),%rbp leaq 56(%rsp),%rsp .LSEH_epilogue_sqr_382x: mov 8(%rsp),%rdi mov 16(%rsp),%rsi #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .LSEH_end_sqr_382x: .globl mul_384 .def mul_384; .scl 2; .type 32; .endef .p2align 5 mul_384: .byte 0xf3,0x0f,0x1e,0xfa movq %rdi,8(%rsp) movq %rsi,16(%rsp) movq %rsp,%r11 .LSEH_begin_mul_384: movq %rcx,%rdi movq %rdx,%rsi movq %r8,%rdx #ifdef __BLST_PORTABLE__ testl $1,__blst_platform_cap(%rip) jnz mul_384$1 #endif pushq %rbp pushq %rbx pushq %r12 .LSEH_body_mul_384: movq %rdx,%rbx call __mulq_384 movq 0(%rsp),%r12 movq 8(%rsp),%rbx movq 16(%rsp),%rbp leaq 24(%rsp),%rsp .LSEH_epilogue_mul_384: mov 8(%rsp),%rdi mov 
16(%rsp),%rsi #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .LSEH_end_mul_384: .def __mulq_384; .scl 3; .type 32; .endef .p2align 5 __mulq_384: .byte 0xf3,0x0f,0x1e,0xfa movq 0(%rbx),%rax movq %rax,%rbp mulq 0(%rsi) movq %rax,0(%rdi) movq %rbp,%rax movq %rdx,%rcx mulq 8(%rsi) addq %rax,%rcx movq %rbp,%rax adcq $0,%rdx movq %rdx,%r8 mulq 16(%rsi) addq %rax,%r8 movq %rbp,%rax adcq $0,%rdx movq %rdx,%r9 mulq 24(%rsi) addq %rax,%r9 movq %rbp,%rax adcq $0,%rdx movq %rdx,%r10 mulq 32(%rsi) addq %rax,%r10 movq %rbp,%rax adcq $0,%rdx movq %rdx,%r11 mulq 40(%rsi) addq %rax,%r11 movq 8(%rbx),%rax adcq $0,%rdx movq %rdx,%r12 movq %rax,%rbp mulq 0(%rsi) addq %rax,%rcx movq %rbp,%rax adcq $0,%rdx movq %rcx,8(%rdi) movq %rdx,%rcx mulq 8(%rsi) addq %rax,%r8 movq %rbp,%rax adcq $0,%rdx addq %r8,%rcx adcq $0,%rdx movq %rdx,%r8 mulq 16(%rsi) addq %rax,%r9 movq %rbp,%rax adcq $0,%rdx addq %r9,%r8 adcq $0,%rdx movq %rdx,%r9 mulq 24(%rsi) addq %rax,%r10 movq %rbp,%rax adcq $0,%rdx addq %r10,%r9 adcq $0,%rdx movq %rdx,%r10 mulq 32(%rsi) addq %rax,%r11 movq %rbp,%rax adcq $0,%rdx addq %r11,%r10 adcq $0,%rdx movq %rdx,%r11 mulq 40(%rsi) addq %rax,%r12 movq 16(%rbx),%rax adcq $0,%rdx addq %r12,%r11 adcq $0,%rdx movq %rdx,%r12 movq %rax,%rbp mulq 0(%rsi) addq %rax,%rcx movq %rbp,%rax adcq $0,%rdx movq %rcx,16(%rdi) movq %rdx,%rcx mulq 8(%rsi) addq %rax,%r8 movq %rbp,%rax adcq $0,%rdx addq %r8,%rcx adcq $0,%rdx movq %rdx,%r8 mulq 16(%rsi) addq %rax,%r9 movq %rbp,%rax adcq $0,%rdx addq %r9,%r8 adcq $0,%rdx movq %rdx,%r9 mulq 24(%rsi) addq %rax,%r10 movq %rbp,%rax adcq $0,%rdx addq %r10,%r9 adcq $0,%rdx movq %rdx,%r10 mulq 32(%rsi) addq %rax,%r11 movq %rbp,%rax adcq $0,%rdx addq %r11,%r10 adcq $0,%rdx movq %rdx,%r11 mulq 40(%rsi) addq %rax,%r12 movq 24(%rbx),%rax adcq $0,%rdx addq %r12,%r11 adcq $0,%rdx movq %rdx,%r12 movq %rax,%rbp mulq 0(%rsi) addq %rax,%rcx movq %rbp,%rax adcq $0,%rdx movq %rcx,24(%rdi) movq %rdx,%rcx mulq 8(%rsi) addq %rax,%r8 movq %rbp,%rax adcq $0,%rdx addq %r8,%rcx adcq $0,%rdx movq %rdx,%r8 mulq 16(%rsi) addq %rax,%r9 movq %rbp,%rax adcq $0,%rdx addq %r9,%r8 adcq $0,%rdx movq %rdx,%r9 mulq 24(%rsi) addq %rax,%r10 movq %rbp,%rax adcq $0,%rdx addq %r10,%r9 adcq $0,%rdx movq %rdx,%r10 mulq 32(%rsi) addq %rax,%r11 movq %rbp,%rax adcq $0,%rdx addq %r11,%r10 adcq $0,%rdx movq %rdx,%r11 mulq 40(%rsi) addq %rax,%r12 movq 32(%rbx),%rax adcq $0,%rdx addq %r12,%r11 adcq $0,%rdx movq %rdx,%r12 movq %rax,%rbp mulq 0(%rsi) addq %rax,%rcx movq %rbp,%rax adcq $0,%rdx movq %rcx,32(%rdi) movq %rdx,%rcx mulq 8(%rsi) addq %rax,%r8 movq %rbp,%rax adcq $0,%rdx addq %r8,%rcx adcq $0,%rdx movq %rdx,%r8 mulq 16(%rsi) addq %rax,%r9 movq %rbp,%rax adcq $0,%rdx addq %r9,%r8 adcq $0,%rdx movq %rdx,%r9 mulq 24(%rsi) addq %rax,%r10 movq %rbp,%rax adcq $0,%rdx addq %r10,%r9 adcq $0,%rdx movq %rdx,%r10 mulq 32(%rsi) addq %rax,%r11 movq %rbp,%rax adcq $0,%rdx addq %r11,%r10 adcq $0,%rdx movq %rdx,%r11 mulq 40(%rsi) addq %rax,%r12 movq 40(%rbx),%rax adcq $0,%rdx addq %r12,%r11 adcq $0,%rdx movq %rdx,%r12 movq %rax,%rbp mulq 0(%rsi) addq %rax,%rcx movq %rbp,%rax adcq $0,%rdx movq %rcx,40(%rdi) movq %rdx,%rcx mulq 8(%rsi) addq %rax,%r8 movq %rbp,%rax adcq $0,%rdx addq %r8,%rcx adcq $0,%rdx movq %rdx,%r8 mulq 16(%rsi) addq %rax,%r9 movq %rbp,%rax adcq $0,%rdx addq %r9,%r8 adcq $0,%rdx movq %rdx,%r9 mulq 24(%rsi) addq %rax,%r10 movq %rbp,%rax adcq $0,%rdx addq %r10,%r9 adcq $0,%rdx movq %rdx,%r10 mulq 32(%rsi) addq %rax,%r11 movq %rbp,%rax adcq $0,%rdx addq %r11,%r10 adcq $0,%rdx movq 
%rdx,%r11 mulq 40(%rsi) addq %rax,%r12 movq %rax,%rax adcq $0,%rdx addq %r12,%r11 adcq $0,%rdx movq %rdx,%r12 movq %rcx,48(%rdi) movq %r8,56(%rdi) movq %r9,64(%rdi) movq %r10,72(%rdi) movq %r11,80(%rdi) movq %r12,88(%rdi) #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .globl sqr_384 .def sqr_384; .scl 2; .type 32; .endef .p2align 5 sqr_384: .byte 0xf3,0x0f,0x1e,0xfa movq %rdi,8(%rsp) movq %rsi,16(%rsp) movq %rsp,%r11 .LSEH_begin_sqr_384: movq %rcx,%rdi movq %rdx,%rsi #ifdef __BLST_PORTABLE__ testl $1,__blst_platform_cap(%rip) jnz sqr_384$1 #endif pushq %rbp pushq %rbx pushq %r12 pushq %r13 pushq %r14 pushq %r15 subq $8,%rsp .LSEH_body_sqr_384: call __sqrq_384 movq 8(%rsp),%r15 movq 16(%rsp),%r14 movq 24(%rsp),%r13 movq 32(%rsp),%r12 movq 40(%rsp),%rbx movq 48(%rsp),%rbp leaq 56(%rsp),%rsp .LSEH_epilogue_sqr_384: mov 8(%rsp),%rdi mov 16(%rsp),%rsi #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .LSEH_end_sqr_384: .def __sqrq_384; .scl 3; .type 32; .endef .p2align 5 __sqrq_384: .byte 0xf3,0x0f,0x1e,0xfa movq 0(%rsi),%rax movq 8(%rsi),%r15 movq 16(%rsi),%rcx movq 24(%rsi),%rbx movq %rax,%r14 mulq %r15 movq %rax,%r9 movq %r14,%rax movq 32(%rsi),%rbp movq %rdx,%r10 mulq %rcx addq %rax,%r10 movq %r14,%rax adcq $0,%rdx movq 40(%rsi),%rsi movq %rdx,%r11 mulq %rbx addq %rax,%r11 movq %r14,%rax adcq $0,%rdx movq %rdx,%r12 mulq %rbp addq %rax,%r12 movq %r14,%rax adcq $0,%rdx movq %rdx,%r13 mulq %rsi addq %rax,%r13 movq %r14,%rax adcq $0,%rdx movq %rdx,%r14 mulq %rax xorq %r8,%r8 movq %rax,0(%rdi) movq %r15,%rax addq %r9,%r9 adcq $0,%r8 addq %rdx,%r9 adcq $0,%r8 movq %r9,8(%rdi) mulq %rcx addq %rax,%r11 movq %r15,%rax adcq $0,%rdx movq %rdx,%r9 mulq %rbx addq %rax,%r12 movq %r15,%rax adcq $0,%rdx addq %r9,%r12 adcq $0,%rdx movq %rdx,%r9 mulq %rbp addq %rax,%r13 movq %r15,%rax adcq $0,%rdx addq %r9,%r13 adcq $0,%rdx movq %rdx,%r9 mulq %rsi addq %rax,%r14 movq %r15,%rax adcq $0,%rdx addq %r9,%r14 adcq $0,%rdx movq %rdx,%r15 mulq %rax xorq %r9,%r9 addq %rax,%r8 movq %rcx,%rax addq %r10,%r10 adcq %r11,%r11 adcq $0,%r9 addq %r8,%r10 adcq %rdx,%r11 adcq $0,%r9 movq %r10,16(%rdi) mulq %rbx addq %rax,%r13 movq %rcx,%rax adcq $0,%rdx movq %r11,24(%rdi) movq %rdx,%r8 mulq %rbp addq %rax,%r14 movq %rcx,%rax adcq $0,%rdx addq %r8,%r14 adcq $0,%rdx movq %rdx,%r8 mulq %rsi addq %rax,%r15 movq %rcx,%rax adcq $0,%rdx addq %r8,%r15 adcq $0,%rdx movq %rdx,%rcx mulq %rax xorq %r11,%r11 addq %rax,%r9 movq %rbx,%rax addq %r12,%r12 adcq %r13,%r13 adcq $0,%r11 addq %r9,%r12 adcq %rdx,%r13 adcq $0,%r11 movq %r12,32(%rdi) mulq %rbp addq %rax,%r15 movq %rbx,%rax adcq $0,%rdx movq %r13,40(%rdi) movq %rdx,%r8 mulq %rsi addq %rax,%rcx movq %rbx,%rax adcq $0,%rdx addq %r8,%rcx adcq $0,%rdx movq %rdx,%rbx mulq %rax xorq %r12,%r12 addq %rax,%r11 movq %rbp,%rax addq %r14,%r14 adcq %r15,%r15 adcq $0,%r12 addq %r11,%r14 adcq %rdx,%r15 movq %r14,48(%rdi) adcq $0,%r12 movq %r15,56(%rdi) mulq %rsi addq %rax,%rbx movq %rbp,%rax adcq $0,%rdx movq %rdx,%rbp mulq %rax xorq %r13,%r13 addq %rax,%r12 movq %rsi,%rax addq %rcx,%rcx adcq %rbx,%rbx adcq $0,%r13 addq %r12,%rcx adcq %rdx,%rbx movq %rcx,64(%rdi) adcq $0,%r13 movq %rbx,72(%rdi) mulq %rax addq %r13,%rax addq %rbp,%rbp adcq $0,%rdx addq %rbp,%rax adcq $0,%rdx movq %rax,80(%rdi) movq %rdx,88(%rdi) #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .globl sqr_mont_384 .def sqr_mont_384; .scl 2; .type 32; .endef .p2align 5 sqr_mont_384: .byte 0xf3,0x0f,0x1e,0xfa movq 
%rdi,8(%rsp) movq %rsi,16(%rsp) movq %rsp,%r11 .LSEH_begin_sqr_mont_384: movq %rcx,%rdi movq %rdx,%rsi movq %r8,%rdx movq %r9,%rcx #ifdef __BLST_PORTABLE__ testl $1,__blst_platform_cap(%rip) jnz sqr_mont_384$1 #endif pushq %rbp pushq %rbx pushq %r12 pushq %r13 pushq %r14 pushq %r15 subq $120,%rsp .LSEH_body_sqr_mont_384: movq %rcx,96(%rsp) movq %rdx,104(%rsp) movq %rdi,112(%rsp) movq %rsp,%rdi call __sqrq_384 leaq 0(%rsp),%rsi movq 96(%rsp),%rcx movq 104(%rsp),%rbx movq 112(%rsp),%rdi call __mulq_by_1_mont_384 call __redq_tail_mont_384 leaq 120(%rsp),%r8 movq 120(%rsp),%r15 movq 8(%r8),%r14 movq 16(%r8),%r13 movq 24(%r8),%r12 movq 32(%r8),%rbx movq 40(%r8),%rbp leaq 48(%r8),%rsp .LSEH_epilogue_sqr_mont_384: mov 8(%rsp),%rdi mov 16(%rsp),%rsi #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .LSEH_end_sqr_mont_384: .globl redc_mont_384 .def redc_mont_384; .scl 2; .type 32; .endef .p2align 5 redc_mont_384: .byte 0xf3,0x0f,0x1e,0xfa movq %rdi,8(%rsp) movq %rsi,16(%rsp) movq %rsp,%r11 .LSEH_begin_redc_mont_384: movq %rcx,%rdi movq %rdx,%rsi movq %r8,%rdx movq %r9,%rcx #ifdef __BLST_PORTABLE__ testl $1,__blst_platform_cap(%rip) jnz redc_mont_384$1 #endif pushq %rbp pushq %rbx pushq %r12 pushq %r13 pushq %r14 pushq %r15 subq $8,%rsp .LSEH_body_redc_mont_384: movq %rdx,%rbx call __mulq_by_1_mont_384 call __redq_tail_mont_384 movq 8(%rsp),%r15 movq 16(%rsp),%r14 movq 24(%rsp),%r13 movq 32(%rsp),%r12 movq 40(%rsp),%rbx movq 48(%rsp),%rbp leaq 56(%rsp),%rsp .LSEH_epilogue_redc_mont_384: mov 8(%rsp),%rdi mov 16(%rsp),%rsi #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .LSEH_end_redc_mont_384: .globl from_mont_384 .def from_mont_384; .scl 2; .type 32; .endef .p2align 5 from_mont_384: .byte 0xf3,0x0f,0x1e,0xfa movq %rdi,8(%rsp) movq %rsi,16(%rsp) movq %rsp,%r11 .LSEH_begin_from_mont_384: movq %rcx,%rdi movq %rdx,%rsi movq %r8,%rdx movq %r9,%rcx #ifdef __BLST_PORTABLE__ testl $1,__blst_platform_cap(%rip) jnz from_mont_384$1 #endif pushq %rbp pushq %rbx pushq %r12 pushq %r13 pushq %r14 pushq %r15 subq $8,%rsp .LSEH_body_from_mont_384: movq %rdx,%rbx call __mulq_by_1_mont_384 movq %r15,%rcx movq %r8,%rdx movq %r9,%rbp subq 0(%rbx),%r14 sbbq 8(%rbx),%r15 movq %r10,%r13 sbbq 16(%rbx),%r8 sbbq 24(%rbx),%r9 sbbq 32(%rbx),%r10 movq %r11,%rsi sbbq 40(%rbx),%r11 cmovcq %rax,%r14 cmovcq %rcx,%r15 cmovcq %rdx,%r8 movq %r14,0(%rdi) cmovcq %rbp,%r9 movq %r15,8(%rdi) cmovcq %r13,%r10 movq %r8,16(%rdi) cmovcq %rsi,%r11 movq %r9,24(%rdi) movq %r10,32(%rdi) movq %r11,40(%rdi) movq 8(%rsp),%r15 movq 16(%rsp),%r14 movq 24(%rsp),%r13 movq 32(%rsp),%r12 movq 40(%rsp),%rbx movq 48(%rsp),%rbp leaq 56(%rsp),%rsp .LSEH_epilogue_from_mont_384: mov 8(%rsp),%rdi mov 16(%rsp),%rsi #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .LSEH_end_from_mont_384: .def __mulq_by_1_mont_384; .scl 3; .type 32; .endef .p2align 5 __mulq_by_1_mont_384: .byte 0xf3,0x0f,0x1e,0xfa movq 0(%rsi),%rax movq 8(%rsi),%r9 movq 16(%rsi),%r10 movq 24(%rsi),%r11 movq 32(%rsi),%r12 movq 40(%rsi),%r13 movq %rax,%r14 imulq %rcx,%rax movq %rax,%r8 mulq 0(%rbx) addq %rax,%r14 movq %r8,%rax adcq %rdx,%r14 mulq 8(%rbx) addq %rax,%r9 movq %r8,%rax adcq $0,%rdx addq %r14,%r9 adcq $0,%rdx movq %rdx,%r14 mulq 16(%rbx) addq %rax,%r10 movq %r8,%rax adcq $0,%rdx addq %r14,%r10 adcq $0,%rdx movq %rdx,%r14 mulq 24(%rbx) addq %rax,%r11 movq %r8,%rax adcq $0,%rdx movq %r9,%r15 imulq %rcx,%r9 addq %r14,%r11 adcq $0,%rdx movq %rdx,%r14 mulq 32(%rbx) addq 
%rax,%r12 movq %r8,%rax adcq $0,%rdx addq %r14,%r12 adcq $0,%rdx movq %rdx,%r14 mulq 40(%rbx) addq %rax,%r13 movq %r9,%rax adcq $0,%rdx addq %r14,%r13 adcq $0,%rdx movq %rdx,%r14 mulq 0(%rbx) addq %rax,%r15 movq %r9,%rax adcq %rdx,%r15 mulq 8(%rbx) addq %rax,%r10 movq %r9,%rax adcq $0,%rdx addq %r15,%r10 adcq $0,%rdx movq %rdx,%r15 mulq 16(%rbx) addq %rax,%r11 movq %r9,%rax adcq $0,%rdx addq %r15,%r11 adcq $0,%rdx movq %rdx,%r15 mulq 24(%rbx) addq %rax,%r12 movq %r9,%rax adcq $0,%rdx movq %r10,%r8 imulq %rcx,%r10 addq %r15,%r12 adcq $0,%rdx movq %rdx,%r15 mulq 32(%rbx) addq %rax,%r13 movq %r9,%rax adcq $0,%rdx addq %r15,%r13 adcq $0,%rdx movq %rdx,%r15 mulq 40(%rbx) addq %rax,%r14 movq %r10,%rax adcq $0,%rdx addq %r15,%r14 adcq $0,%rdx movq %rdx,%r15 mulq 0(%rbx) addq %rax,%r8 movq %r10,%rax adcq %rdx,%r8 mulq 8(%rbx) addq %rax,%r11 movq %r10,%rax adcq $0,%rdx addq %r8,%r11 adcq $0,%rdx movq %rdx,%r8 mulq 16(%rbx) addq %rax,%r12 movq %r10,%rax adcq $0,%rdx addq %r8,%r12 adcq $0,%rdx movq %rdx,%r8 mulq 24(%rbx) addq %rax,%r13 movq %r10,%rax adcq $0,%rdx movq %r11,%r9 imulq %rcx,%r11 addq %r8,%r13 adcq $0,%rdx movq %rdx,%r8 mulq 32(%rbx) addq %rax,%r14 movq %r10,%rax adcq $0,%rdx addq %r8,%r14 adcq $0,%rdx movq %rdx,%r8 mulq 40(%rbx) addq %rax,%r15 movq %r11,%rax adcq $0,%rdx addq %r8,%r15 adcq $0,%rdx movq %rdx,%r8 mulq 0(%rbx) addq %rax,%r9 movq %r11,%rax adcq %rdx,%r9 mulq 8(%rbx) addq %rax,%r12 movq %r11,%rax adcq $0,%rdx addq %r9,%r12 adcq $0,%rdx movq %rdx,%r9 mulq 16(%rbx) addq %rax,%r13 movq %r11,%rax adcq $0,%rdx addq %r9,%r13 adcq $0,%rdx movq %rdx,%r9 mulq 24(%rbx) addq %rax,%r14 movq %r11,%rax adcq $0,%rdx movq %r12,%r10 imulq %rcx,%r12 addq %r9,%r14 adcq $0,%rdx movq %rdx,%r9 mulq 32(%rbx) addq %rax,%r15 movq %r11,%rax adcq $0,%rdx addq %r9,%r15 adcq $0,%rdx movq %rdx,%r9 mulq 40(%rbx) addq %rax,%r8 movq %r12,%rax adcq $0,%rdx addq %r9,%r8 adcq $0,%rdx movq %rdx,%r9 mulq 0(%rbx) addq %rax,%r10 movq %r12,%rax adcq %rdx,%r10 mulq 8(%rbx) addq %rax,%r13 movq %r12,%rax adcq $0,%rdx addq %r10,%r13 adcq $0,%rdx movq %rdx,%r10 mulq 16(%rbx) addq %rax,%r14 movq %r12,%rax adcq $0,%rdx addq %r10,%r14 adcq $0,%rdx movq %rdx,%r10 mulq 24(%rbx) addq %rax,%r15 movq %r12,%rax adcq $0,%rdx movq %r13,%r11 imulq %rcx,%r13 addq %r10,%r15 adcq $0,%rdx movq %rdx,%r10 mulq 32(%rbx) addq %rax,%r8 movq %r12,%rax adcq $0,%rdx addq %r10,%r8 adcq $0,%rdx movq %rdx,%r10 mulq 40(%rbx) addq %rax,%r9 movq %r13,%rax adcq $0,%rdx addq %r10,%r9 adcq $0,%rdx movq %rdx,%r10 mulq 0(%rbx) addq %rax,%r11 movq %r13,%rax adcq %rdx,%r11 mulq 8(%rbx) addq %rax,%r14 movq %r13,%rax adcq $0,%rdx addq %r11,%r14 adcq $0,%rdx movq %rdx,%r11 mulq 16(%rbx) addq %rax,%r15 movq %r13,%rax adcq $0,%rdx addq %r11,%r15 adcq $0,%rdx movq %rdx,%r11 mulq 24(%rbx) addq %rax,%r8 movq %r13,%rax adcq $0,%rdx addq %r11,%r8 adcq $0,%rdx movq %rdx,%r11 mulq 32(%rbx) addq %rax,%r9 movq %r13,%rax adcq $0,%rdx addq %r11,%r9 adcq $0,%rdx movq %rdx,%r11 mulq 40(%rbx) addq %rax,%r10 movq %r14,%rax adcq $0,%rdx addq %r11,%r10 adcq $0,%rdx movq %rdx,%r11 #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .def __redq_tail_mont_384; .scl 3; .type 32; .endef .p2align 5 __redq_tail_mont_384: .byte 0xf3,0x0f,0x1e,0xfa addq 48(%rsi),%r14 movq %r14,%rax adcq 56(%rsi),%r15 adcq 64(%rsi),%r8 adcq 72(%rsi),%r9 movq %r15,%rcx adcq 80(%rsi),%r10 adcq 88(%rsi),%r11 sbbq %r12,%r12 movq %r8,%rdx movq %r9,%rbp subq 0(%rbx),%r14 sbbq 8(%rbx),%r15 movq %r10,%r13 sbbq 16(%rbx),%r8 sbbq 24(%rbx),%r9 sbbq 32(%rbx),%r10 movq %r11,%rsi 
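/*
 * NOTE (added commentary, not part of the upstream source): at this point
 * __redq_tail_mont_384 has added the high half of the 768-bit
 * intermediate (48..88(%rsi)) into the Montgomery-reduced low half, with
 * the carry captured as a 0/-1 mask in %r12 and pre-subtraction copies
 * stashed in %rax/%rcx/%rdx/%rbp/%r13/%rsi. The modulus is being
 * subtracted word by word; the final sbbq/cmovcq sequence below keeps the
 * copies if the subtraction borrows, leaving a fully reduced result in
 * 0..40(%rdi).
 */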
sbbq 40(%rbx),%r11 sbbq $0,%r12 cmovcq %rax,%r14 cmovcq %rcx,%r15 cmovcq %rdx,%r8 movq %r14,0(%rdi) cmovcq %rbp,%r9 movq %r15,8(%rdi) cmovcq %r13,%r10 movq %r8,16(%rdi) cmovcq %rsi,%r11 movq %r9,24(%rdi) movq %r10,32(%rdi) movq %r11,40(%rdi) #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .globl sgn0_pty_mont_384 .def sgn0_pty_mont_384; .scl 2; .type 32; .endef .p2align 5 sgn0_pty_mont_384: .byte 0xf3,0x0f,0x1e,0xfa movq %rdi,8(%rsp) movq %rsi,16(%rsp) movq %rsp,%r11 .LSEH_begin_sgn0_pty_mont_384: movq %rcx,%rdi movq %rdx,%rsi movq %r8,%rdx #ifdef __BLST_PORTABLE__ testl $1,__blst_platform_cap(%rip) jnz sgn0_pty_mont_384$1 #endif pushq %rbp pushq %rbx pushq %r12 pushq %r13 pushq %r14 pushq %r15 subq $8,%rsp .LSEH_body_sgn0_pty_mont_384: movq %rsi,%rbx leaq 0(%rdi),%rsi movq %rdx,%rcx call __mulq_by_1_mont_384 xorq %rax,%rax movq %r14,%r13 addq %r14,%r14 adcq %r15,%r15 adcq %r8,%r8 adcq %r9,%r9 adcq %r10,%r10 adcq %r11,%r11 adcq $0,%rax subq 0(%rbx),%r14 sbbq 8(%rbx),%r15 sbbq 16(%rbx),%r8 sbbq 24(%rbx),%r9 sbbq 32(%rbx),%r10 sbbq 40(%rbx),%r11 sbbq $0,%rax notq %rax andq $1,%r13 andq $2,%rax orq %r13,%rax movq 8(%rsp),%r15 movq 16(%rsp),%r14 movq 24(%rsp),%r13 movq 32(%rsp),%r12 movq 40(%rsp),%rbx movq 48(%rsp),%rbp leaq 56(%rsp),%rsp .LSEH_epilogue_sgn0_pty_mont_384: mov 8(%rsp),%rdi mov 16(%rsp),%rsi #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .LSEH_end_sgn0_pty_mont_384: .globl sgn0_pty_mont_384x .def sgn0_pty_mont_384x; .scl 2; .type 32; .endef .p2align 5 sgn0_pty_mont_384x: .byte 0xf3,0x0f,0x1e,0xfa movq %rdi,8(%rsp) movq %rsi,16(%rsp) movq %rsp,%r11 .LSEH_begin_sgn0_pty_mont_384x: movq %rcx,%rdi movq %rdx,%rsi movq %r8,%rdx #ifdef __BLST_PORTABLE__ testl $1,__blst_platform_cap(%rip) jnz sgn0_pty_mont_384x$1 #endif pushq %rbp pushq %rbx pushq %r12 pushq %r13 pushq %r14 pushq %r15 subq $8,%rsp .LSEH_body_sgn0_pty_mont_384x: movq %rsi,%rbx leaq 48(%rdi),%rsi movq %rdx,%rcx call __mulq_by_1_mont_384 movq %r14,%r12 orq %r15,%r14 orq %r8,%r14 orq %r9,%r14 orq %r10,%r14 orq %r11,%r14 leaq 0(%rdi),%rsi xorq %rdi,%rdi movq %r12,%r13 addq %r12,%r12 adcq %r15,%r15 adcq %r8,%r8 adcq %r9,%r9 adcq %r10,%r10 adcq %r11,%r11 adcq $0,%rdi subq 0(%rbx),%r12 sbbq 8(%rbx),%r15 sbbq 16(%rbx),%r8 sbbq 24(%rbx),%r9 sbbq 32(%rbx),%r10 sbbq 40(%rbx),%r11 sbbq $0,%rdi movq %r14,0(%rsp) notq %rdi andq $1,%r13 andq $2,%rdi orq %r13,%rdi call __mulq_by_1_mont_384 movq %r14,%r12 orq %r15,%r14 orq %r8,%r14 orq %r9,%r14 orq %r10,%r14 orq %r11,%r14 xorq %rax,%rax movq %r12,%r13 addq %r12,%r12 adcq %r15,%r15 adcq %r8,%r8 adcq %r9,%r9 adcq %r10,%r10 adcq %r11,%r11 adcq $0,%rax subq 0(%rbx),%r12 sbbq 8(%rbx),%r15 sbbq 16(%rbx),%r8 sbbq 24(%rbx),%r9 sbbq 32(%rbx),%r10 sbbq 40(%rbx),%r11 sbbq $0,%rax movq 0(%rsp),%r12 notq %rax testq %r14,%r14 cmovzq %rdi,%r13 testq %r12,%r12 cmovnzq %rdi,%rax andq $1,%r13 andq $2,%rax orq %r13,%rax movq 8(%rsp),%r15 movq 16(%rsp),%r14 movq 24(%rsp),%r13 movq 32(%rsp),%r12 movq 40(%rsp),%rbx movq 48(%rsp),%rbp leaq 56(%rsp),%rsp .LSEH_epilogue_sgn0_pty_mont_384x: mov 8(%rsp),%rdi mov 16(%rsp),%rsi #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .LSEH_end_sgn0_pty_mont_384x: .globl mul_mont_384 .def mul_mont_384; .scl 2; .type 32; .endef .p2align 5 mul_mont_384: .byte 0xf3,0x0f,0x1e,0xfa movq %rdi,8(%rsp) movq %rsi,16(%rsp) movq %rsp,%r11 .LSEH_begin_mul_mont_384: movq %rcx,%rdi movq %rdx,%rsi movq %r8,%rdx movq %r9,%rcx movq 40(%rsp),%r8 #ifdef 
__BLST_PORTABLE__ testl $1,__blst_platform_cap(%rip) jnz mul_mont_384$1 #endif pushq %rbp pushq %rbx pushq %r12 pushq %r13 pushq %r14 pushq %r15 subq $24,%rsp .LSEH_body_mul_mont_384: movq 0(%rdx),%rax movq 0(%rsi),%r14 movq 8(%rsi),%r15 movq 16(%rsi),%r12 movq 24(%rsi),%r13 movq %rdx,%rbx movq %r8,0(%rsp) movq %rdi,8(%rsp) call __mulq_mont_384 movq 24(%rsp),%r15 movq 32(%rsp),%r14 movq 40(%rsp),%r13 movq 48(%rsp),%r12 movq 56(%rsp),%rbx movq 64(%rsp),%rbp leaq 72(%rsp),%rsp .LSEH_epilogue_mul_mont_384: mov 8(%rsp),%rdi mov 16(%rsp),%rsi #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .LSEH_end_mul_mont_384: .def __mulq_mont_384; .scl 3; .type 32; .endef .p2align 5 __mulq_mont_384: .byte 0xf3,0x0f,0x1e,0xfa movq %rax,%rdi mulq %r14 movq %rax,%r8 movq %rdi,%rax movq %rdx,%r9 mulq %r15 addq %rax,%r9 movq %rdi,%rax adcq $0,%rdx movq %rdx,%r10 mulq %r12 addq %rax,%r10 movq %rdi,%rax adcq $0,%rdx movq %rdx,%r11 movq %r8,%rbp imulq 8(%rsp),%r8 mulq %r13 addq %rax,%r11 movq %rdi,%rax adcq $0,%rdx movq %rdx,%r12 mulq 32(%rsi) addq %rax,%r12 movq %rdi,%rax adcq $0,%rdx movq %rdx,%r13 mulq 40(%rsi) addq %rax,%r13 movq %r8,%rax adcq $0,%rdx xorq %r15,%r15 movq %rdx,%r14 mulq 0(%rcx) addq %rax,%rbp movq %r8,%rax adcq %rdx,%rbp mulq 8(%rcx) addq %rax,%r9 movq %r8,%rax adcq $0,%rdx addq %rbp,%r9 adcq $0,%rdx movq %rdx,%rbp mulq 16(%rcx) addq %rax,%r10 movq %r8,%rax adcq $0,%rdx addq %rbp,%r10 adcq $0,%rdx movq %rdx,%rbp mulq 24(%rcx) addq %rbp,%r11 adcq $0,%rdx addq %rax,%r11 movq %r8,%rax adcq $0,%rdx movq %rdx,%rbp mulq 32(%rcx) addq %rax,%r12 movq %r8,%rax adcq $0,%rdx addq %rbp,%r12 adcq $0,%rdx movq %rdx,%rbp mulq 40(%rcx) addq %rax,%r13 movq 8(%rbx),%rax adcq $0,%rdx addq %rbp,%r13 adcq %rdx,%r14 adcq $0,%r15 movq %rax,%rdi mulq 0(%rsi) addq %rax,%r9 movq %rdi,%rax adcq $0,%rdx movq %rdx,%r8 mulq 8(%rsi) addq %rax,%r10 movq %rdi,%rax adcq $0,%rdx addq %r8,%r10 adcq $0,%rdx movq %rdx,%r8 mulq 16(%rsi) addq %rax,%r11 movq %rdi,%rax adcq $0,%rdx addq %r8,%r11 adcq $0,%rdx movq %rdx,%r8 movq %r9,%rbp imulq 8(%rsp),%r9 mulq 24(%rsi) addq %rax,%r12 movq %rdi,%rax adcq $0,%rdx addq %r8,%r12 adcq $0,%rdx movq %rdx,%r8 mulq 32(%rsi) addq %rax,%r13 movq %rdi,%rax adcq $0,%rdx addq %r8,%r13 adcq $0,%rdx movq %rdx,%r8 mulq 40(%rsi) addq %r8,%r14 adcq $0,%rdx xorq %r8,%r8 addq %rax,%r14 movq %r9,%rax adcq %rdx,%r15 adcq $0,%r8 mulq 0(%rcx) addq %rax,%rbp movq %r9,%rax adcq %rdx,%rbp mulq 8(%rcx) addq %rax,%r10 movq %r9,%rax adcq $0,%rdx addq %rbp,%r10 adcq $0,%rdx movq %rdx,%rbp mulq 16(%rcx) addq %rax,%r11 movq %r9,%rax adcq $0,%rdx addq %rbp,%r11 adcq $0,%rdx movq %rdx,%rbp mulq 24(%rcx) addq %rbp,%r12 adcq $0,%rdx addq %rax,%r12 movq %r9,%rax adcq $0,%rdx movq %rdx,%rbp mulq 32(%rcx) addq %rax,%r13 movq %r9,%rax adcq $0,%rdx addq %rbp,%r13 adcq $0,%rdx movq %rdx,%rbp mulq 40(%rcx) addq %rax,%r14 movq 16(%rbx),%rax adcq $0,%rdx addq %rbp,%r14 adcq %rdx,%r15 adcq $0,%r8 movq %rax,%rdi mulq 0(%rsi) addq %rax,%r10 movq %rdi,%rax adcq $0,%rdx movq %rdx,%r9 mulq 8(%rsi) addq %rax,%r11 movq %rdi,%rax adcq $0,%rdx addq %r9,%r11 adcq $0,%rdx movq %rdx,%r9 mulq 16(%rsi) addq %rax,%r12 movq %rdi,%rax adcq $0,%rdx addq %r9,%r12 adcq $0,%rdx movq %rdx,%r9 movq %r10,%rbp imulq 8(%rsp),%r10 mulq 24(%rsi) addq %rax,%r13 movq %rdi,%rax adcq $0,%rdx addq %r9,%r13 adcq $0,%rdx movq %rdx,%r9 mulq 32(%rsi) addq %rax,%r14 movq %rdi,%rax adcq $0,%rdx addq %r9,%r14 adcq $0,%rdx movq %rdx,%r9 mulq 40(%rsi) addq %r9,%r15 adcq $0,%rdx xorq %r9,%r9 addq %rax,%r15 movq %r10,%rax adcq %rdx,%r8 
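/*
 * NOTE (added commentary, not part of the upstream source; hedged):
 * __mulq_mont_384 interleaves the schoolbook multiply with Montgomery
 * reduction, one word of b at a time. Here the accumulation of a * b[2]
 * (16(%rbx)) is just finishing; %r10 already holds low_word * n0, where
 * 8(%rsp) is n0 inside this subroutine (the caller stored it just below
 * the return address), and the mulq 0..40(%rcx) pass below adds that
 * multiple of the modulus so the low word cancels and the accumulator
 * window shifts down one limb. After the b[5] round the result is
 * conditionally reduced and stored through the pointer saved at 16(%rsp).
 */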
adcq $0,%r9 mulq 0(%rcx) addq %rax,%rbp movq %r10,%rax adcq %rdx,%rbp mulq 8(%rcx) addq %rax,%r11 movq %r10,%rax adcq $0,%rdx addq %rbp,%r11 adcq $0,%rdx movq %rdx,%rbp mulq 16(%rcx) addq %rax,%r12 movq %r10,%rax adcq $0,%rdx addq %rbp,%r12 adcq $0,%rdx movq %rdx,%rbp mulq 24(%rcx) addq %rbp,%r13 adcq $0,%rdx addq %rax,%r13 movq %r10,%rax adcq $0,%rdx movq %rdx,%rbp mulq 32(%rcx) addq %rax,%r14 movq %r10,%rax adcq $0,%rdx addq %rbp,%r14 adcq $0,%rdx movq %rdx,%rbp mulq 40(%rcx) addq %rax,%r15 movq 24(%rbx),%rax adcq $0,%rdx addq %rbp,%r15 adcq %rdx,%r8 adcq $0,%r9 movq %rax,%rdi mulq 0(%rsi) addq %rax,%r11 movq %rdi,%rax adcq $0,%rdx movq %rdx,%r10 mulq 8(%rsi) addq %rax,%r12 movq %rdi,%rax adcq $0,%rdx addq %r10,%r12 adcq $0,%rdx movq %rdx,%r10 mulq 16(%rsi) addq %rax,%r13 movq %rdi,%rax adcq $0,%rdx addq %r10,%r13 adcq $0,%rdx movq %rdx,%r10 movq %r11,%rbp imulq 8(%rsp),%r11 mulq 24(%rsi) addq %rax,%r14 movq %rdi,%rax adcq $0,%rdx addq %r10,%r14 adcq $0,%rdx movq %rdx,%r10 mulq 32(%rsi) addq %rax,%r15 movq %rdi,%rax adcq $0,%rdx addq %r10,%r15 adcq $0,%rdx movq %rdx,%r10 mulq 40(%rsi) addq %r10,%r8 adcq $0,%rdx xorq %r10,%r10 addq %rax,%r8 movq %r11,%rax adcq %rdx,%r9 adcq $0,%r10 mulq 0(%rcx) addq %rax,%rbp movq %r11,%rax adcq %rdx,%rbp mulq 8(%rcx) addq %rax,%r12 movq %r11,%rax adcq $0,%rdx addq %rbp,%r12 adcq $0,%rdx movq %rdx,%rbp mulq 16(%rcx) addq %rax,%r13 movq %r11,%rax adcq $0,%rdx addq %rbp,%r13 adcq $0,%rdx movq %rdx,%rbp mulq 24(%rcx) addq %rbp,%r14 adcq $0,%rdx addq %rax,%r14 movq %r11,%rax adcq $0,%rdx movq %rdx,%rbp mulq 32(%rcx) addq %rax,%r15 movq %r11,%rax adcq $0,%rdx addq %rbp,%r15 adcq $0,%rdx movq %rdx,%rbp mulq 40(%rcx) addq %rax,%r8 movq 32(%rbx),%rax adcq $0,%rdx addq %rbp,%r8 adcq %rdx,%r9 adcq $0,%r10 movq %rax,%rdi mulq 0(%rsi) addq %rax,%r12 movq %rdi,%rax adcq $0,%rdx movq %rdx,%r11 mulq 8(%rsi) addq %rax,%r13 movq %rdi,%rax adcq $0,%rdx addq %r11,%r13 adcq $0,%rdx movq %rdx,%r11 mulq 16(%rsi) addq %rax,%r14 movq %rdi,%rax adcq $0,%rdx addq %r11,%r14 adcq $0,%rdx movq %rdx,%r11 movq %r12,%rbp imulq 8(%rsp),%r12 mulq 24(%rsi) addq %rax,%r15 movq %rdi,%rax adcq $0,%rdx addq %r11,%r15 adcq $0,%rdx movq %rdx,%r11 mulq 32(%rsi) addq %rax,%r8 movq %rdi,%rax adcq $0,%rdx addq %r11,%r8 adcq $0,%rdx movq %rdx,%r11 mulq 40(%rsi) addq %r11,%r9 adcq $0,%rdx xorq %r11,%r11 addq %rax,%r9 movq %r12,%rax adcq %rdx,%r10 adcq $0,%r11 mulq 0(%rcx) addq %rax,%rbp movq %r12,%rax adcq %rdx,%rbp mulq 8(%rcx) addq %rax,%r13 movq %r12,%rax adcq $0,%rdx addq %rbp,%r13 adcq $0,%rdx movq %rdx,%rbp mulq 16(%rcx) addq %rax,%r14 movq %r12,%rax adcq $0,%rdx addq %rbp,%r14 adcq $0,%rdx movq %rdx,%rbp mulq 24(%rcx) addq %rbp,%r15 adcq $0,%rdx addq %rax,%r15 movq %r12,%rax adcq $0,%rdx movq %rdx,%rbp mulq 32(%rcx) addq %rax,%r8 movq %r12,%rax adcq $0,%rdx addq %rbp,%r8 adcq $0,%rdx movq %rdx,%rbp mulq 40(%rcx) addq %rax,%r9 movq 40(%rbx),%rax adcq $0,%rdx addq %rbp,%r9 adcq %rdx,%r10 adcq $0,%r11 movq %rax,%rdi mulq 0(%rsi) addq %rax,%r13 movq %rdi,%rax adcq $0,%rdx movq %rdx,%r12 mulq 8(%rsi) addq %rax,%r14 movq %rdi,%rax adcq $0,%rdx addq %r12,%r14 adcq $0,%rdx movq %rdx,%r12 mulq 16(%rsi) addq %rax,%r15 movq %rdi,%rax adcq $0,%rdx addq %r12,%r15 adcq $0,%rdx movq %rdx,%r12 movq %r13,%rbp imulq 8(%rsp),%r13 mulq 24(%rsi) addq %rax,%r8 movq %rdi,%rax adcq $0,%rdx addq %r12,%r8 adcq $0,%rdx movq %rdx,%r12 mulq 32(%rsi) addq %rax,%r9 movq %rdi,%rax adcq $0,%rdx addq %r12,%r9 adcq $0,%rdx movq %rdx,%r12 mulq 40(%rsi) addq %r12,%r10 adcq $0,%rdx xorq %r12,%r12 addq %rax,%r10 movq %r13,%rax adcq 
%rdx,%r11 adcq $0,%r12 mulq 0(%rcx) addq %rax,%rbp movq %r13,%rax adcq %rdx,%rbp mulq 8(%rcx) addq %rax,%r14 movq %r13,%rax adcq $0,%rdx addq %rbp,%r14 adcq $0,%rdx movq %rdx,%rbp mulq 16(%rcx) addq %rax,%r15 movq %r13,%rax adcq $0,%rdx addq %rbp,%r15 adcq $0,%rdx movq %rdx,%rbp mulq 24(%rcx) addq %rbp,%r8 adcq $0,%rdx addq %rax,%r8 movq %r13,%rax adcq $0,%rdx movq %rdx,%rbp mulq 32(%rcx) addq %rax,%r9 movq %r13,%rax adcq $0,%rdx addq %rbp,%r9 adcq $0,%rdx movq %rdx,%rbp mulq 40(%rcx) addq %rax,%r10 movq %r14,%rax adcq $0,%rdx addq %rbp,%r10 adcq %rdx,%r11 adcq $0,%r12 movq 16(%rsp),%rdi subq 0(%rcx),%r14 movq %r15,%rdx sbbq 8(%rcx),%r15 movq %r8,%rbx sbbq 16(%rcx),%r8 movq %r9,%rsi sbbq 24(%rcx),%r9 movq %r10,%rbp sbbq 32(%rcx),%r10 movq %r11,%r13 sbbq 40(%rcx),%r11 sbbq $0,%r12 cmovcq %rax,%r14 cmovcq %rdx,%r15 cmovcq %rbx,%r8 movq %r14,0(%rdi) cmovcq %rsi,%r9 movq %r15,8(%rdi) cmovcq %rbp,%r10 movq %r8,16(%rdi) cmovcq %r13,%r11 movq %r9,24(%rdi) movq %r10,32(%rdi) movq %r11,40(%rdi) #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .globl sqr_n_mul_mont_384 .def sqr_n_mul_mont_384; .scl 2; .type 32; .endef .p2align 5 sqr_n_mul_mont_384: .byte 0xf3,0x0f,0x1e,0xfa movq %rdi,8(%rsp) movq %rsi,16(%rsp) movq %rsp,%r11 .LSEH_begin_sqr_n_mul_mont_384: movq %rcx,%rdi movq %rdx,%rsi movq %r8,%rdx movq %r9,%rcx movq 40(%rsp),%r8 movq 48(%rsp),%r9 #ifdef __BLST_PORTABLE__ testl $1,__blst_platform_cap(%rip) jnz sqr_n_mul_mont_384$1 #endif pushq %rbp pushq %rbx pushq %r12 pushq %r13 pushq %r14 pushq %r15 subq $136,%rsp .LSEH_body_sqr_n_mul_mont_384: movq %r8,0(%rsp) movq %rdi,8(%rsp) movq %rcx,16(%rsp) leaq 32(%rsp),%rdi movq %r9,24(%rsp) movq (%r9),%xmm2 .Loop_sqr_384: movd %edx,%xmm1 call __sqrq_384 leaq 0(%rdi),%rsi movq 0(%rsp),%rcx movq 16(%rsp),%rbx call __mulq_by_1_mont_384 call __redq_tail_mont_384 movd %xmm1,%edx leaq 0(%rdi),%rsi decl %edx jnz .Loop_sqr_384 .byte 102,72,15,126,208 movq %rbx,%rcx movq 24(%rsp),%rbx movq %r8,%r12 movq %r9,%r13 call __mulq_mont_384 leaq 136(%rsp),%r8 movq 136(%rsp),%r15 movq 8(%r8),%r14 movq 16(%r8),%r13 movq 24(%r8),%r12 movq 32(%r8),%rbx movq 40(%r8),%rbp leaq 48(%r8),%rsp .LSEH_epilogue_sqr_n_mul_mont_384: mov 8(%rsp),%rdi mov 16(%rsp),%rsi #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .LSEH_end_sqr_n_mul_mont_384: .globl sqr_n_mul_mont_383 .def sqr_n_mul_mont_383; .scl 2; .type 32; .endef .p2align 5 sqr_n_mul_mont_383: .byte 0xf3,0x0f,0x1e,0xfa movq %rdi,8(%rsp) movq %rsi,16(%rsp) movq %rsp,%r11 .LSEH_begin_sqr_n_mul_mont_383: movq %rcx,%rdi movq %rdx,%rsi movq %r8,%rdx movq %r9,%rcx movq 40(%rsp),%r8 movq 48(%rsp),%r9 #ifdef __BLST_PORTABLE__ testl $1,__blst_platform_cap(%rip) jnz sqr_n_mul_mont_383$1 #endif pushq %rbp pushq %rbx pushq %r12 pushq %r13 pushq %r14 pushq %r15 subq $136,%rsp .LSEH_body_sqr_n_mul_mont_383: movq %r8,0(%rsp) movq %rdi,8(%rsp) movq %rcx,16(%rsp) leaq 32(%rsp),%rdi movq %r9,24(%rsp) movq (%r9),%xmm2 .Loop_sqr_383: movd %edx,%xmm1 call __sqrq_384 leaq 0(%rdi),%rsi movq 0(%rsp),%rcx movq 16(%rsp),%rbx call __mulq_by_1_mont_384 movd %xmm1,%edx addq 48(%rsi),%r14 adcq 56(%rsi),%r15 adcq 64(%rsi),%r8 adcq 72(%rsi),%r9 adcq 80(%rsi),%r10 adcq 88(%rsi),%r11 leaq 0(%rdi),%rsi movq %r14,0(%rdi) movq %r15,8(%rdi) movq %r8,16(%rdi) movq %r9,24(%rdi) movq %r10,32(%rdi) movq %r11,40(%rdi) decl %edx jnz .Loop_sqr_383 .byte 102,72,15,126,208 movq %rbx,%rcx movq 24(%rsp),%rbx movq %r8,%r12 movq %r9,%r13 call __mulq_mont_384 leaq 136(%rsp),%r8 movq 136(%rsp),%r15 
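/*
 * NOTE (added commentary, not part of the upstream source; hedged): this
 * epilogue restores the callee-saved registers for sqr_n_mul_mont_383,
 * which squares its input n times and then multiplies by b (the pointer
 * spilled at 24(%rsp)), a shape used by exponentiation addition chains.
 * Unlike sqr_n_mul_mont_384 above, its squaring loop skips the final
 * conditional subtraction after __mulq_by_1_mont_384 and just adds in the
 * high half; this appears to rely on 383-bit moduli, whose operands stay
 * below 2^383 so the unreduced sums still fit in six 64-bit words.
 */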
movq 8(%r8),%r14 movq 16(%r8),%r13 movq 24(%r8),%r12 movq 32(%r8),%rbx movq 40(%r8),%rbp leaq 48(%r8),%rsp .LSEH_epilogue_sqr_n_mul_mont_383: mov 8(%rsp),%rdi mov 16(%rsp),%rsi #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .LSEH_end_sqr_n_mul_mont_383: .def __mulq_mont_383_nonred; .scl 3; .type 32; .endef .p2align 5 __mulq_mont_383_nonred: .byte 0xf3,0x0f,0x1e,0xfa movq %rax,%rbp mulq %r14 movq %rax,%r8 movq %rbp,%rax movq %rdx,%r9 mulq %r15 addq %rax,%r9 movq %rbp,%rax adcq $0,%rdx movq %rdx,%r10 mulq %r12 addq %rax,%r10 movq %rbp,%rax adcq $0,%rdx movq %rdx,%r11 movq %r8,%r15 imulq 8(%rsp),%r8 mulq %r13 addq %rax,%r11 movq %rbp,%rax adcq $0,%rdx movq %rdx,%r12 mulq 32(%rsi) addq %rax,%r12 movq %rbp,%rax adcq $0,%rdx movq %rdx,%r13 mulq 40(%rsi) addq %rax,%r13 movq %r8,%rax adcq $0,%rdx movq %rdx,%r14 mulq 0(%rcx) addq %rax,%r15 movq %r8,%rax adcq %rdx,%r15 mulq 8(%rcx) addq %rax,%r9 movq %r8,%rax adcq $0,%rdx addq %r15,%r9 adcq $0,%rdx movq %rdx,%r15 mulq 16(%rcx) addq %rax,%r10 movq %r8,%rax adcq $0,%rdx addq %r15,%r10 adcq $0,%rdx movq %rdx,%r15 mulq 24(%rcx) addq %r15,%r11 adcq $0,%rdx addq %rax,%r11 movq %r8,%rax adcq $0,%rdx movq %rdx,%r15 mulq 32(%rcx) addq %rax,%r12 movq %r8,%rax adcq $0,%rdx addq %r15,%r12 adcq $0,%rdx movq %rdx,%r15 mulq 40(%rcx) addq %rax,%r13 movq 8(%rbx),%rax adcq $0,%rdx addq %r15,%r13 adcq %rdx,%r14 movq %rax,%rbp mulq 0(%rsi) addq %rax,%r9 movq %rbp,%rax adcq $0,%rdx movq %rdx,%r15 mulq 8(%rsi) addq %rax,%r10 movq %rbp,%rax adcq $0,%rdx addq %r15,%r10 adcq $0,%rdx movq %rdx,%r15 mulq 16(%rsi) addq %rax,%r11 movq %rbp,%rax adcq $0,%rdx addq %r15,%r11 adcq $0,%rdx movq %rdx,%r15 movq %r9,%r8 imulq 8(%rsp),%r9 mulq 24(%rsi) addq %rax,%r12 movq %rbp,%rax adcq $0,%rdx addq %r15,%r12 adcq $0,%rdx movq %rdx,%r15 mulq 32(%rsi) addq %rax,%r13 movq %rbp,%rax adcq $0,%rdx addq %r15,%r13 adcq $0,%rdx movq %rdx,%r15 mulq 40(%rsi) addq %r15,%r14 adcq $0,%rdx addq %rax,%r14 movq %r9,%rax adcq $0,%rdx movq %rdx,%r15 mulq 0(%rcx) addq %rax,%r8 movq %r9,%rax adcq %rdx,%r8 mulq 8(%rcx) addq %rax,%r10 movq %r9,%rax adcq $0,%rdx addq %r8,%r10 adcq $0,%rdx movq %rdx,%r8 mulq 16(%rcx) addq %rax,%r11 movq %r9,%rax adcq $0,%rdx addq %r8,%r11 adcq $0,%rdx movq %rdx,%r8 mulq 24(%rcx) addq %r8,%r12 adcq $0,%rdx addq %rax,%r12 movq %r9,%rax adcq $0,%rdx movq %rdx,%r8 mulq 32(%rcx) addq %rax,%r13 movq %r9,%rax adcq $0,%rdx addq %r8,%r13 adcq $0,%rdx movq %rdx,%r8 mulq 40(%rcx) addq %rax,%r14 movq 16(%rbx),%rax adcq $0,%rdx addq %r8,%r14 adcq %rdx,%r15 movq %rax,%rbp mulq 0(%rsi) addq %rax,%r10 movq %rbp,%rax adcq $0,%rdx movq %rdx,%r8 mulq 8(%rsi) addq %rax,%r11 movq %rbp,%rax adcq $0,%rdx addq %r8,%r11 adcq $0,%rdx movq %rdx,%r8 mulq 16(%rsi) addq %rax,%r12 movq %rbp,%rax adcq $0,%rdx addq %r8,%r12 adcq $0,%rdx movq %rdx,%r8 movq %r10,%r9 imulq 8(%rsp),%r10 mulq 24(%rsi) addq %rax,%r13 movq %rbp,%rax adcq $0,%rdx addq %r8,%r13 adcq $0,%rdx movq %rdx,%r8 mulq 32(%rsi) addq %rax,%r14 movq %rbp,%rax adcq $0,%rdx addq %r8,%r14 adcq $0,%rdx movq %rdx,%r8 mulq 40(%rsi) addq %r8,%r15 adcq $0,%rdx addq %rax,%r15 movq %r10,%rax adcq $0,%rdx movq %rdx,%r8 mulq 0(%rcx) addq %rax,%r9 movq %r10,%rax adcq %rdx,%r9 mulq 8(%rcx) addq %rax,%r11 movq %r10,%rax adcq $0,%rdx addq %r9,%r11 adcq $0,%rdx movq %rdx,%r9 mulq 16(%rcx) addq %rax,%r12 movq %r10,%rax adcq $0,%rdx addq %r9,%r12 adcq $0,%rdx movq %rdx,%r9 mulq 24(%rcx) addq %r9,%r13 adcq $0,%rdx addq %rax,%r13 movq %r10,%rax adcq $0,%rdx movq %rdx,%r9 mulq 32(%rcx) addq %rax,%r14 movq %r10,%rax adcq 
$0,%rdx addq %r9,%r14 adcq $0,%rdx movq %rdx,%r9 mulq 40(%rcx) addq %rax,%r15 movq 24(%rbx),%rax adcq $0,%rdx addq %r9,%r15 adcq %rdx,%r8 movq %rax,%rbp mulq 0(%rsi) addq %rax,%r11 movq %rbp,%rax adcq $0,%rdx movq %rdx,%r9 mulq 8(%rsi) addq %rax,%r12 movq %rbp,%rax adcq $0,%rdx addq %r9,%r12 adcq $0,%rdx movq %rdx,%r9 mulq 16(%rsi) addq %rax,%r13 movq %rbp,%rax adcq $0,%rdx addq %r9,%r13 adcq $0,%rdx movq %rdx,%r9 movq %r11,%r10 imulq 8(%rsp),%r11 mulq 24(%rsi) addq %rax,%r14 movq %rbp,%rax adcq $0,%rdx addq %r9,%r14 adcq $0,%rdx movq %rdx,%r9 mulq 32(%rsi) addq %rax,%r15 movq %rbp,%rax adcq $0,%rdx addq %r9,%r15 adcq $0,%rdx movq %rdx,%r9 mulq 40(%rsi) addq %r9,%r8 adcq $0,%rdx addq %rax,%r8 movq %r11,%rax adcq $0,%rdx movq %rdx,%r9 mulq 0(%rcx) addq %rax,%r10 movq %r11,%rax adcq %rdx,%r10 mulq 8(%rcx) addq %rax,%r12 movq %r11,%rax adcq $0,%rdx addq %r10,%r12 adcq $0,%rdx movq %rdx,%r10 mulq 16(%rcx) addq %rax,%r13 movq %r11,%rax adcq $0,%rdx addq %r10,%r13 adcq $0,%rdx movq %rdx,%r10 mulq 24(%rcx) addq %r10,%r14 adcq $0,%rdx addq %rax,%r14 movq %r11,%rax adcq $0,%rdx movq %rdx,%r10 mulq 32(%rcx) addq %rax,%r15 movq %r11,%rax adcq $0,%rdx addq %r10,%r15 adcq $0,%rdx movq %rdx,%r10 mulq 40(%rcx) addq %rax,%r8 movq 32(%rbx),%rax adcq $0,%rdx addq %r10,%r8 adcq %rdx,%r9 movq %rax,%rbp mulq 0(%rsi) addq %rax,%r12 movq %rbp,%rax adcq $0,%rdx movq %rdx,%r10 mulq 8(%rsi) addq %rax,%r13 movq %rbp,%rax adcq $0,%rdx addq %r10,%r13 adcq $0,%rdx movq %rdx,%r10 mulq 16(%rsi) addq %rax,%r14 movq %rbp,%rax adcq $0,%rdx addq %r10,%r14 adcq $0,%rdx movq %rdx,%r10 movq %r12,%r11 imulq 8(%rsp),%r12 mulq 24(%rsi) addq %rax,%r15 movq %rbp,%rax adcq $0,%rdx addq %r10,%r15 adcq $0,%rdx movq %rdx,%r10 mulq 32(%rsi) addq %rax,%r8 movq %rbp,%rax adcq $0,%rdx addq %r10,%r8 adcq $0,%rdx movq %rdx,%r10 mulq 40(%rsi) addq %r10,%r9 adcq $0,%rdx addq %rax,%r9 movq %r12,%rax adcq $0,%rdx movq %rdx,%r10 mulq 0(%rcx) addq %rax,%r11 movq %r12,%rax adcq %rdx,%r11 mulq 8(%rcx) addq %rax,%r13 movq %r12,%rax adcq $0,%rdx addq %r11,%r13 adcq $0,%rdx movq %rdx,%r11 mulq 16(%rcx) addq %rax,%r14 movq %r12,%rax adcq $0,%rdx addq %r11,%r14 adcq $0,%rdx movq %rdx,%r11 mulq 24(%rcx) addq %r11,%r15 adcq $0,%rdx addq %rax,%r15 movq %r12,%rax adcq $0,%rdx movq %rdx,%r11 mulq 32(%rcx) addq %rax,%r8 movq %r12,%rax adcq $0,%rdx addq %r11,%r8 adcq $0,%rdx movq %rdx,%r11 mulq 40(%rcx) addq %rax,%r9 movq 40(%rbx),%rax adcq $0,%rdx addq %r11,%r9 adcq %rdx,%r10 movq %rax,%rbp mulq 0(%rsi) addq %rax,%r13 movq %rbp,%rax adcq $0,%rdx movq %rdx,%r11 mulq 8(%rsi) addq %rax,%r14 movq %rbp,%rax adcq $0,%rdx addq %r11,%r14 adcq $0,%rdx movq %rdx,%r11 mulq 16(%rsi) addq %rax,%r15 movq %rbp,%rax adcq $0,%rdx addq %r11,%r15 adcq $0,%rdx movq %rdx,%r11 movq %r13,%r12 imulq 8(%rsp),%r13 mulq 24(%rsi) addq %rax,%r8 movq %rbp,%rax adcq $0,%rdx addq %r11,%r8 adcq $0,%rdx movq %rdx,%r11 mulq 32(%rsi) addq %rax,%r9 movq %rbp,%rax adcq $0,%rdx addq %r11,%r9 adcq $0,%rdx movq %rdx,%r11 mulq 40(%rsi) addq %r11,%r10 adcq $0,%rdx addq %rax,%r10 movq %r13,%rax adcq $0,%rdx movq %rdx,%r11 mulq 0(%rcx) addq %rax,%r12 movq %r13,%rax adcq %rdx,%r12 mulq 8(%rcx) addq %rax,%r14 movq %r13,%rax adcq $0,%rdx addq %r12,%r14 adcq $0,%rdx movq %rdx,%r12 mulq 16(%rcx) addq %rax,%r15 movq %r13,%rax adcq $0,%rdx addq %r12,%r15 adcq $0,%rdx movq %rdx,%r12 mulq 24(%rcx) addq %r12,%r8 adcq $0,%rdx addq %rax,%r8 movq %r13,%rax adcq $0,%rdx movq %rdx,%r12 mulq 32(%rcx) addq %rax,%r9 movq %r13,%rax adcq $0,%rdx addq %r12,%r9 adcq $0,%rdx movq %rdx,%r12 mulq 40(%rcx) addq %rax,%r10 movq 
%r14,%rax adcq $0,%rdx addq %r12,%r10 adcq %rdx,%r11 #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .globl sqr_mont_382x .def sqr_mont_382x; .scl 2; .type 32; .endef .p2align 5 sqr_mont_382x: .byte 0xf3,0x0f,0x1e,0xfa movq %rdi,8(%rsp) movq %rsi,16(%rsp) movq %rsp,%r11 .LSEH_begin_sqr_mont_382x: movq %rcx,%rdi movq %rdx,%rsi movq %r8,%rdx movq %r9,%rcx #ifdef __BLST_PORTABLE__ testl $1,__blst_platform_cap(%rip) jnz sqr_mont_382x$1 #endif pushq %rbp pushq %rbx pushq %r12 pushq %r13 pushq %r14 pushq %r15 subq $136,%rsp .LSEH_body_sqr_mont_382x: movq %rcx,0(%rsp) movq %rdx,%rcx movq %rsi,16(%rsp) movq %rdi,24(%rsp) movq 0(%rsi),%r8 movq 8(%rsi),%r9 movq 16(%rsi),%r10 movq 24(%rsi),%r11 movq 32(%rsi),%r12 movq 40(%rsi),%r13 movq %r8,%r14 addq 48(%rsi),%r8 movq %r9,%r15 adcq 56(%rsi),%r9 movq %r10,%rax adcq 64(%rsi),%r10 movq %r11,%rdx adcq 72(%rsi),%r11 movq %r12,%rbx adcq 80(%rsi),%r12 movq %r13,%rbp adcq 88(%rsi),%r13 subq 48(%rsi),%r14 sbbq 56(%rsi),%r15 sbbq 64(%rsi),%rax sbbq 72(%rsi),%rdx sbbq 80(%rsi),%rbx sbbq 88(%rsi),%rbp sbbq %rdi,%rdi movq %r8,32+0(%rsp) movq %r9,32+8(%rsp) movq %r10,32+16(%rsp) movq %r11,32+24(%rsp) movq %r12,32+32(%rsp) movq %r13,32+40(%rsp) movq %r14,32+48(%rsp) movq %r15,32+56(%rsp) movq %rax,32+64(%rsp) movq %rdx,32+72(%rsp) movq %rbx,32+80(%rsp) movq %rbp,32+88(%rsp) movq %rdi,32+96(%rsp) leaq 48(%rsi),%rbx movq 48(%rsi),%rax movq 0(%rsi),%r14 movq 8(%rsi),%r15 movq 16(%rsi),%r12 movq 24(%rsi),%r13 movq 24(%rsp),%rdi call __mulq_mont_383_nonred addq %r14,%r14 adcq %r15,%r15 adcq %r8,%r8 adcq %r9,%r9 adcq %r10,%r10 adcq %r11,%r11 movq %r14,48(%rdi) movq %r15,56(%rdi) movq %r8,64(%rdi) movq %r9,72(%rdi) movq %r10,80(%rdi) movq %r11,88(%rdi) leaq 32(%rsp),%rsi leaq 32+48(%rsp),%rbx movq 32+48(%rsp),%rax movq 32+0(%rsp),%r14 movq 32+8(%rsp),%r15 movq 32+16(%rsp),%r12 movq 32+24(%rsp),%r13 call __mulq_mont_383_nonred movq 32+96(%rsp),%rsi movq 32+0(%rsp),%r12 movq 32+8(%rsp),%r13 andq %rsi,%r12 movq 32+16(%rsp),%rax andq %rsi,%r13 movq 32+24(%rsp),%rbx andq %rsi,%rax movq 32+32(%rsp),%rbp andq %rsi,%rbx andq %rsi,%rbp andq 32+40(%rsp),%rsi subq %r12,%r14 movq 0(%rcx),%r12 sbbq %r13,%r15 movq 8(%rcx),%r13 sbbq %rax,%r8 movq 16(%rcx),%rax sbbq %rbx,%r9 movq 24(%rcx),%rbx sbbq %rbp,%r10 movq 32(%rcx),%rbp sbbq %rsi,%r11 sbbq %rsi,%rsi andq %rsi,%r12 andq %rsi,%r13 andq %rsi,%rax andq %rsi,%rbx andq %rsi,%rbp andq 40(%rcx),%rsi addq %r12,%r14 adcq %r13,%r15 adcq %rax,%r8 adcq %rbx,%r9 adcq %rbp,%r10 adcq %rsi,%r11 movq %r14,0(%rdi) movq %r15,8(%rdi) movq %r8,16(%rdi) movq %r9,24(%rdi) movq %r10,32(%rdi) movq %r11,40(%rdi) leaq 136(%rsp),%r8 movq 0(%r8),%r15 movq 8(%r8),%r14 movq 16(%r8),%r13 movq 24(%r8),%r12 movq 32(%r8),%rbx movq 40(%r8),%rbp leaq 48(%r8),%rsp .LSEH_epilogue_sqr_mont_382x: mov 8(%rsp),%rdi mov 16(%rsp),%rsi #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .LSEH_end_sqr_mont_382x: .section .pdata .p2align 2 .rva .LSEH_begin_mul_mont_384x .rva .LSEH_body_mul_mont_384x .rva .LSEH_info_mul_mont_384x_prologue .rva .LSEH_body_mul_mont_384x .rva .LSEH_epilogue_mul_mont_384x .rva .LSEH_info_mul_mont_384x_body .rva .LSEH_epilogue_mul_mont_384x .rva .LSEH_end_mul_mont_384x .rva .LSEH_info_mul_mont_384x_epilogue .rva .LSEH_begin_sqr_mont_384x .rva .LSEH_body_sqr_mont_384x .rva .LSEH_info_sqr_mont_384x_prologue .rva .LSEH_body_sqr_mont_384x .rva .LSEH_epilogue_sqr_mont_384x .rva .LSEH_info_sqr_mont_384x_body .rva .LSEH_epilogue_sqr_mont_384x .rva .LSEH_end_sqr_mont_384x .rva 
.LSEH_info_sqr_mont_384x_epilogue .rva .LSEH_begin_mul_382x .rva .LSEH_body_mul_382x .rva .LSEH_info_mul_382x_prologue .rva .LSEH_body_mul_382x .rva .LSEH_epilogue_mul_382x .rva .LSEH_info_mul_382x_body .rva .LSEH_epilogue_mul_382x .rva .LSEH_end_mul_382x .rva .LSEH_info_mul_382x_epilogue .rva .LSEH_begin_sqr_382x .rva .LSEH_body_sqr_382x .rva .LSEH_info_sqr_382x_prologue .rva .LSEH_body_sqr_382x .rva .LSEH_epilogue_sqr_382x .rva .LSEH_info_sqr_382x_body .rva .LSEH_epilogue_sqr_382x .rva .LSEH_end_sqr_382x .rva .LSEH_info_sqr_382x_epilogue .rva .LSEH_begin_mul_384 .rva .LSEH_body_mul_384 .rva .LSEH_info_mul_384_prologue .rva .LSEH_body_mul_384 .rva .LSEH_epilogue_mul_384 .rva .LSEH_info_mul_384_body .rva .LSEH_epilogue_mul_384 .rva .LSEH_end_mul_384 .rva .LSEH_info_mul_384_epilogue .rva .LSEH_begin_sqr_384 .rva .LSEH_body_sqr_384 .rva .LSEH_info_sqr_384_prologue .rva .LSEH_body_sqr_384 .rva .LSEH_epilogue_sqr_384 .rva .LSEH_info_sqr_384_body .rva .LSEH_epilogue_sqr_384 .rva .LSEH_end_sqr_384 .rva .LSEH_info_sqr_384_epilogue .rva .LSEH_begin_sqr_mont_384 .rva .LSEH_body_sqr_mont_384 .rva .LSEH_info_sqr_mont_384_prologue .rva .LSEH_body_sqr_mont_384 .rva .LSEH_epilogue_sqr_mont_384 .rva .LSEH_info_sqr_mont_384_body .rva .LSEH_epilogue_sqr_mont_384 .rva .LSEH_end_sqr_mont_384 .rva .LSEH_info_sqr_mont_384_epilogue .rva .LSEH_begin_redc_mont_384 .rva .LSEH_body_redc_mont_384 .rva .LSEH_info_redc_mont_384_prologue .rva .LSEH_body_redc_mont_384 .rva .LSEH_epilogue_redc_mont_384 .rva .LSEH_info_redc_mont_384_body .rva .LSEH_epilogue_redc_mont_384 .rva .LSEH_end_redc_mont_384 .rva .LSEH_info_redc_mont_384_epilogue .rva .LSEH_begin_from_mont_384 .rva .LSEH_body_from_mont_384 .rva .LSEH_info_from_mont_384_prologue .rva .LSEH_body_from_mont_384 .rva .LSEH_epilogue_from_mont_384 .rva .LSEH_info_from_mont_384_body .rva .LSEH_epilogue_from_mont_384 .rva .LSEH_end_from_mont_384 .rva .LSEH_info_from_mont_384_epilogue .rva .LSEH_begin_sgn0_pty_mont_384 .rva .LSEH_body_sgn0_pty_mont_384 .rva .LSEH_info_sgn0_pty_mont_384_prologue .rva .LSEH_body_sgn0_pty_mont_384 .rva .LSEH_epilogue_sgn0_pty_mont_384 .rva .LSEH_info_sgn0_pty_mont_384_body .rva .LSEH_epilogue_sgn0_pty_mont_384 .rva .LSEH_end_sgn0_pty_mont_384 .rva .LSEH_info_sgn0_pty_mont_384_epilogue .rva .LSEH_begin_sgn0_pty_mont_384x .rva .LSEH_body_sgn0_pty_mont_384x .rva .LSEH_info_sgn0_pty_mont_384x_prologue .rva .LSEH_body_sgn0_pty_mont_384x .rva .LSEH_epilogue_sgn0_pty_mont_384x .rva .LSEH_info_sgn0_pty_mont_384x_body .rva .LSEH_epilogue_sgn0_pty_mont_384x .rva .LSEH_end_sgn0_pty_mont_384x .rva .LSEH_info_sgn0_pty_mont_384x_epilogue .rva .LSEH_begin_mul_mont_384 .rva .LSEH_body_mul_mont_384 .rva .LSEH_info_mul_mont_384_prologue .rva .LSEH_body_mul_mont_384 .rva .LSEH_epilogue_mul_mont_384 .rva .LSEH_info_mul_mont_384_body .rva .LSEH_epilogue_mul_mont_384 .rva .LSEH_end_mul_mont_384 .rva .LSEH_info_mul_mont_384_epilogue .rva .LSEH_begin_sqr_n_mul_mont_384 .rva .LSEH_body_sqr_n_mul_mont_384 .rva .LSEH_info_sqr_n_mul_mont_384_prologue .rva .LSEH_body_sqr_n_mul_mont_384 .rva .LSEH_epilogue_sqr_n_mul_mont_384 .rva .LSEH_info_sqr_n_mul_mont_384_body .rva .LSEH_epilogue_sqr_n_mul_mont_384 .rva .LSEH_end_sqr_n_mul_mont_384 .rva .LSEH_info_sqr_n_mul_mont_384_epilogue .rva .LSEH_begin_sqr_n_mul_mont_383 .rva .LSEH_body_sqr_n_mul_mont_383 .rva .LSEH_info_sqr_n_mul_mont_383_prologue .rva .LSEH_body_sqr_n_mul_mont_383 .rva .LSEH_epilogue_sqr_n_mul_mont_383 .rva .LSEH_info_sqr_n_mul_mont_383_body .rva .LSEH_epilogue_sqr_n_mul_mont_383 .rva 
.LSEH_end_sqr_n_mul_mont_383 .rva .LSEH_info_sqr_n_mul_mont_383_epilogue .rva .LSEH_begin_sqr_mont_382x .rva .LSEH_body_sqr_mont_382x .rva .LSEH_info_sqr_mont_382x_prologue .rva .LSEH_body_sqr_mont_382x .rva .LSEH_epilogue_sqr_mont_382x .rva .LSEH_info_sqr_mont_382x_body .rva .LSEH_epilogue_sqr_mont_382x .rva .LSEH_end_sqr_mont_382x .rva .LSEH_info_sqr_mont_382x_epilogue .section .xdata .p2align 3 .LSEH_info_mul_mont_384x_prologue: .byte 1,0,5,0x0b .byte 0,0x74,1,0 .byte 0,0x64,2,0 .byte 0,0xb3 .byte 0,0 .long 0,0 .LSEH_info_mul_mont_384x_body: .byte 1,0,18,0 .byte 0x00,0xf4,0x29,0x00 .byte 0x00,0xe4,0x2a,0x00 .byte 0x00,0xd4,0x2b,0x00 .byte 0x00,0xc4,0x2c,0x00 .byte 0x00,0x34,0x2d,0x00 .byte 0x00,0x54,0x2e,0x00 .byte 0x00,0x74,0x30,0x00 .byte 0x00,0x64,0x31,0x00 .byte 0x00,0x01,0x2f,0x00 .byte 0x00,0x00,0x00,0x00 .byte 0x00,0x00,0x00,0x00 .LSEH_info_mul_mont_384x_epilogue: .byte 1,0,4,0 .byte 0x00,0x74,0x01,0x00 .byte 0x00,0x64,0x02,0x00 .byte 0x00,0x00,0x00,0x00 .LSEH_info_sqr_mont_384x_prologue: .byte 1,0,5,0x0b .byte 0,0x74,1,0 .byte 0,0x64,2,0 .byte 0,0xb3 .byte 0,0 .long 0,0 .LSEH_info_sqr_mont_384x_body: .byte 1,0,18,0 .byte 0x00,0xf4,0x11,0x00 .byte 0x00,0xe4,0x12,0x00 .byte 0x00,0xd4,0x13,0x00 .byte 0x00,0xc4,0x14,0x00 .byte 0x00,0x34,0x15,0x00 .byte 0x00,0x54,0x16,0x00 .byte 0x00,0x74,0x18,0x00 .byte 0x00,0x64,0x19,0x00 .byte 0x00,0x01,0x17,0x00 .byte 0x00,0x00,0x00,0x00 .byte 0x00,0x00,0x00,0x00 .LSEH_info_sqr_mont_384x_epilogue: .byte 1,0,4,0 .byte 0x00,0x74,0x01,0x00 .byte 0x00,0x64,0x02,0x00 .byte 0x00,0x00,0x00,0x00 .LSEH_info_mul_382x_prologue: .byte 1,0,5,0x0b .byte 0,0x74,1,0 .byte 0,0x64,2,0 .byte 0,0xb3 .byte 0,0 .long 0,0 .LSEH_info_mul_382x_body: .byte 1,0,18,0 .byte 0x00,0xf4,0x11,0x00 .byte 0x00,0xe4,0x12,0x00 .byte 0x00,0xd4,0x13,0x00 .byte 0x00,0xc4,0x14,0x00 .byte 0x00,0x34,0x15,0x00 .byte 0x00,0x54,0x16,0x00 .byte 0x00,0x74,0x18,0x00 .byte 0x00,0x64,0x19,0x00 .byte 0x00,0x01,0x17,0x00 .byte 0x00,0x00,0x00,0x00 .byte 0x00,0x00,0x00,0x00 .LSEH_info_mul_382x_epilogue: .byte 1,0,4,0 .byte 0x00,0x74,0x01,0x00 .byte 0x00,0x64,0x02,0x00 .byte 0x00,0x00,0x00,0x00 .LSEH_info_sqr_382x_prologue: .byte 1,0,5,0x0b .byte 0,0x74,1,0 .byte 0,0x64,2,0 .byte 0,0xb3 .byte 0,0 .long 0,0 .LSEH_info_sqr_382x_body: .byte 1,0,17,0 .byte 0x00,0xf4,0x01,0x00 .byte 0x00,0xe4,0x02,0x00 .byte 0x00,0xd4,0x03,0x00 .byte 0x00,0xc4,0x04,0x00 .byte 0x00,0x34,0x05,0x00 .byte 0x00,0x54,0x06,0x00 .byte 0x00,0x74,0x08,0x00 .byte 0x00,0x64,0x09,0x00 .byte 0x00,0x62 .byte 0x00,0x00,0x00,0x00,0x00,0x00 .byte 0x00,0x00,0x00,0x00 .LSEH_info_sqr_382x_epilogue: .byte 1,0,4,0 .byte 0x00,0x74,0x01,0x00 .byte 0x00,0x64,0x02,0x00 .byte 0x00,0x00,0x00,0x00 .LSEH_info_mul_384_prologue: .byte 1,0,5,0x0b .byte 0,0x74,1,0 .byte 0,0x64,2,0 .byte 0,0xb3 .byte 0,0 .long 0,0 .LSEH_info_mul_384_body: .byte 1,0,11,0 .byte 0x00,0xc4,0x00,0x00 .byte 0x00,0x34,0x01,0x00 .byte 0x00,0x54,0x02,0x00 .byte 0x00,0x74,0x04,0x00 .byte 0x00,0x64,0x05,0x00 .byte 0x00,0x22 .byte 0x00,0x00,0x00,0x00,0x00,0x00 .LSEH_info_mul_384_epilogue: .byte 1,0,4,0 .byte 0x00,0x74,0x01,0x00 .byte 0x00,0x64,0x02,0x00 .byte 0x00,0x00,0x00,0x00 .LSEH_info_sqr_384_prologue: .byte 1,0,5,0x0b .byte 0,0x74,1,0 .byte 0,0x64,2,0 .byte 0,0xb3 .byte 0,0 .long 0,0 .LSEH_info_sqr_384_body: .byte 1,0,17,0 .byte 0x00,0xf4,0x01,0x00 .byte 0x00,0xe4,0x02,0x00 .byte 0x00,0xd4,0x03,0x00 .byte 0x00,0xc4,0x04,0x00 .byte 0x00,0x34,0x05,0x00 .byte 0x00,0x54,0x06,0x00 .byte 0x00,0x74,0x08,0x00 .byte 0x00,0x64,0x09,0x00 .byte 0x00,0x62 .byte 0x00,0x00,0x00,0x00,0x00,0x00 
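/* The trailing zero bytes appear to pad the UNWIND_CODE array out to
   the slot count declared in the record header. */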
.byte 0x00,0x00,0x00,0x00 .LSEH_info_sqr_384_epilogue: .byte 1,0,4,0 .byte 0x00,0x74,0x01,0x00 .byte 0x00,0x64,0x02,0x00 .byte 0x00,0x00,0x00,0x00 .LSEH_info_sqr_mont_384_prologue: .byte 1,0,5,0x0b .byte 0,0x74,1,0 .byte 0,0x64,2,0 .byte 0,0xb3 .byte 0,0 .long 0,0 .LSEH_info_sqr_mont_384_body: .byte 1,0,18,0 .byte 0x00,0xf4,0x0f,0x00 .byte 0x00,0xe4,0x10,0x00 .byte 0x00,0xd4,0x11,0x00 .byte 0x00,0xc4,0x12,0x00 .byte 0x00,0x34,0x13,0x00 .byte 0x00,0x54,0x14,0x00 .byte 0x00,0x74,0x16,0x00 .byte 0x00,0x64,0x17,0x00 .byte 0x00,0x01,0x15,0x00 .byte 0x00,0x00,0x00,0x00 .byte 0x00,0x00,0x00,0x00 .LSEH_info_sqr_mont_384_epilogue: .byte 1,0,4,0 .byte 0x00,0x74,0x01,0x00 .byte 0x00,0x64,0x02,0x00 .byte 0x00,0x00,0x00,0x00 .LSEH_info_redc_mont_384_prologue: .byte 1,0,5,0x0b .byte 0,0x74,1,0 .byte 0,0x64,2,0 .byte 0,0xb3 .byte 0,0 .long 0,0 .LSEH_info_redc_mont_384_body: .byte 1,0,17,0 .byte 0x00,0xf4,0x01,0x00 .byte 0x00,0xe4,0x02,0x00 .byte 0x00,0xd4,0x03,0x00 .byte 0x00,0xc4,0x04,0x00 .byte 0x00,0x34,0x05,0x00 .byte 0x00,0x54,0x06,0x00 .byte 0x00,0x74,0x08,0x00 .byte 0x00,0x64,0x09,0x00 .byte 0x00,0x62 .byte 0x00,0x00,0x00,0x00,0x00,0x00 .byte 0x00,0x00,0x00,0x00 .LSEH_info_redc_mont_384_epilogue: .byte 1,0,4,0 .byte 0x00,0x74,0x01,0x00 .byte 0x00,0x64,0x02,0x00 .byte 0x00,0x00,0x00,0x00 .LSEH_info_from_mont_384_prologue: .byte 1,0,5,0x0b .byte 0,0x74,1,0 .byte 0,0x64,2,0 .byte 0,0xb3 .byte 0,0 .long 0,0 .LSEH_info_from_mont_384_body: .byte 1,0,17,0 .byte 0x00,0xf4,0x01,0x00 .byte 0x00,0xe4,0x02,0x00 .byte 0x00,0xd4,0x03,0x00 .byte 0x00,0xc4,0x04,0x00 .byte 0x00,0x34,0x05,0x00 .byte 0x00,0x54,0x06,0x00 .byte 0x00,0x74,0x08,0x00 .byte 0x00,0x64,0x09,0x00 .byte 0x00,0x62 .byte 0x00,0x00,0x00,0x00,0x00,0x00 .byte 0x00,0x00,0x00,0x00 .LSEH_info_from_mont_384_epilogue: .byte 1,0,4,0 .byte 0x00,0x74,0x01,0x00 .byte 0x00,0x64,0x02,0x00 .byte 0x00,0x00,0x00,0x00 .LSEH_info_sgn0_pty_mont_384_prologue: .byte 1,0,5,0x0b .byte 0,0x74,1,0 .byte 0,0x64,2,0 .byte 0,0xb3 .byte 0,0 .long 0,0 .LSEH_info_sgn0_pty_mont_384_body: .byte 1,0,17,0 .byte 0x00,0xf4,0x01,0x00 .byte 0x00,0xe4,0x02,0x00 .byte 0x00,0xd4,0x03,0x00 .byte 0x00,0xc4,0x04,0x00 .byte 0x00,0x34,0x05,0x00 .byte 0x00,0x54,0x06,0x00 .byte 0x00,0x74,0x08,0x00 .byte 0x00,0x64,0x09,0x00 .byte 0x00,0x62 .byte 0x00,0x00,0x00,0x00,0x00,0x00 .byte 0x00,0x00,0x00,0x00 .LSEH_info_sgn0_pty_mont_384_epilogue: .byte 1,0,4,0 .byte 0x00,0x74,0x01,0x00 .byte 0x00,0x64,0x02,0x00 .byte 0x00,0x00,0x00,0x00 .LSEH_info_sgn0_pty_mont_384x_prologue: .byte 1,0,5,0x0b .byte 0,0x74,1,0 .byte 0,0x64,2,0 .byte 0,0xb3 .byte 0,0 .long 0,0 .LSEH_info_sgn0_pty_mont_384x_body: .byte 1,0,17,0 .byte 0x00,0xf4,0x01,0x00 .byte 0x00,0xe4,0x02,0x00 .byte 0x00,0xd4,0x03,0x00 .byte 0x00,0xc4,0x04,0x00 .byte 0x00,0x34,0x05,0x00 .byte 0x00,0x54,0x06,0x00 .byte 0x00,0x74,0x08,0x00 .byte 0x00,0x64,0x09,0x00 .byte 0x00,0x62 .byte 0x00,0x00,0x00,0x00,0x00,0x00 .byte 0x00,0x00,0x00,0x00 .LSEH_info_sgn0_pty_mont_384x_epilogue: .byte 1,0,4,0 .byte 0x00,0x74,0x01,0x00 .byte 0x00,0x64,0x02,0x00 .byte 0x00,0x00,0x00,0x00 .LSEH_info_mul_mont_384_prologue: .byte 1,0,5,0x0b .byte 0,0x74,1,0 .byte 0,0x64,2,0 .byte 0,0xb3 .byte 0,0 .long 0,0 .LSEH_info_mul_mont_384_body: .byte 1,0,17,0 .byte 0x00,0xf4,0x03,0x00 .byte 0x00,0xe4,0x04,0x00 .byte 0x00,0xd4,0x05,0x00 .byte 0x00,0xc4,0x06,0x00 .byte 0x00,0x34,0x07,0x00 .byte 0x00,0x54,0x08,0x00 .byte 0x00,0x74,0x0a,0x00 .byte 0x00,0x64,0x0b,0x00 .byte 0x00,0x82 .byte 0x00,0x00,0x00,0x00,0x00,0x00 .byte 0x00,0x00,0x00,0x00 .LSEH_info_mul_mont_384_epilogue: .byte 
1,0,4,0 .byte 0x00,0x74,0x01,0x00 .byte 0x00,0x64,0x02,0x00 .byte 0x00,0x00,0x00,0x00 .LSEH_info_sqr_n_mul_mont_384_prologue: .byte 1,0,5,0x0b .byte 0,0x74,1,0 .byte 0,0x64,2,0 .byte 0,0xb3 .byte 0,0 .long 0,0 .LSEH_info_sqr_n_mul_mont_384_body: .byte 1,0,18,0 .byte 0x00,0xf4,0x11,0x00 .byte 0x00,0xe4,0x12,0x00 .byte 0x00,0xd4,0x13,0x00 .byte 0x00,0xc4,0x14,0x00 .byte 0x00,0x34,0x15,0x00 .byte 0x00,0x54,0x16,0x00 .byte 0x00,0x74,0x18,0x00 .byte 0x00,0x64,0x19,0x00 .byte 0x00,0x01,0x17,0x00 .byte 0x00,0x00,0x00,0x00 .byte 0x00,0x00,0x00,0x00 .LSEH_info_sqr_n_mul_mont_384_epilogue: .byte 1,0,4,0 .byte 0x00,0x74,0x01,0x00 .byte 0x00,0x64,0x02,0x00 .byte 0x00,0x00,0x00,0x00 .LSEH_info_sqr_n_mul_mont_383_prologue: .byte 1,0,5,0x0b .byte 0,0x74,1,0 .byte 0,0x64,2,0 .byte 0,0xb3 .byte 0,0 .long 0,0 .LSEH_info_sqr_n_mul_mont_383_body: .byte 1,0,18,0 .byte 0x00,0xf4,0x11,0x00 .byte 0x00,0xe4,0x12,0x00 .byte 0x00,0xd4,0x13,0x00 .byte 0x00,0xc4,0x14,0x00 .byte 0x00,0x34,0x15,0x00 .byte 0x00,0x54,0x16,0x00 .byte 0x00,0x74,0x18,0x00 .byte 0x00,0x64,0x19,0x00 .byte 0x00,0x01,0x17,0x00 .byte 0x00,0x00,0x00,0x00 .byte 0x00,0x00,0x00,0x00 .LSEH_info_sqr_n_mul_mont_383_epilogue: .byte 1,0,4,0 .byte 0x00,0x74,0x01,0x00 .byte 0x00,0x64,0x02,0x00 .byte 0x00,0x00,0x00,0x00 .LSEH_info_sqr_mont_382x_prologue: .byte 1,0,5,0x0b .byte 0,0x74,1,0 .byte 0,0x64,2,0 .byte 0,0xb3 .byte 0,0 .long 0,0 .LSEH_info_sqr_mont_382x_body: .byte 1,0,18,0 .byte 0x00,0xf4,0x11,0x00 .byte 0x00,0xe4,0x12,0x00 .byte 0x00,0xd4,0x13,0x00 .byte 0x00,0xc4,0x14,0x00 .byte 0x00,0x34,0x15,0x00 .byte 0x00,0x54,0x16,0x00 .byte 0x00,0x74,0x18,0x00 .byte 0x00,0x64,0x19,0x00 .byte 0x00,0x01,0x17,0x00 .byte 0x00,0x00,0x00,0x00 .byte 0x00,0x00,0x00,0x00 .LSEH_info_sqr_mont_382x_epilogue: .byte 1,0,4,0 .byte 0x00,0x74,0x01,0x00 .byte 0x00,0x64,0x02,0x00 .byte 0x00,0x00,0x00,0x00 ================================================ FILE: build/coff/mulx_mont_256-x86_64.s ================================================ .text .globl mulx_mont_sparse_256 .def mulx_mont_sparse_256; .scl 2; .type 32; .endef .p2align 5 mulx_mont_sparse_256: .byte 0xf3,0x0f,0x1e,0xfa movq %rdi,8(%rsp) movq %rsi,16(%rsp) movq %rsp,%r11 .LSEH_begin_mulx_mont_sparse_256: movq %rcx,%rdi movq %rdx,%rsi movq %r8,%rdx movq %r9,%rcx movq 40(%rsp),%r8 mul_mont_sparse_256$1: pushq %rbp pushq %rbx pushq %r12 pushq %r13 pushq %r14 pushq %r15 subq $8,%rsp .LSEH_body_mulx_mont_sparse_256: movq %rdx,%rbx #ifdef __SGX_LVI_HARDENING__ lfence #endif movq 0(%rdx),%rdx movq 0(%rsi),%r14 movq 8(%rsi),%r15 movq 16(%rsi),%rbp movq 24(%rsi),%r9 leaq -128(%rsi),%rsi leaq -128(%rcx),%rcx mulxq %r14,%rax,%r11 call __mulx_mont_sparse_256 movq 8(%rsp),%r15 movq 16(%rsp),%r14 movq 24(%rsp),%r13 movq 32(%rsp),%r12 movq 40(%rsp),%rbx movq 48(%rsp),%rbp leaq 56(%rsp),%rsp .LSEH_epilogue_mulx_mont_sparse_256: mov 8(%rsp),%rdi mov 16(%rsp),%rsi #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .LSEH_end_mulx_mont_sparse_256: .globl sqrx_mont_sparse_256 .def sqrx_mont_sparse_256; .scl 2; .type 32; .endef .p2align 5 sqrx_mont_sparse_256: .byte 0xf3,0x0f,0x1e,0xfa movq %rdi,8(%rsp) movq %rsi,16(%rsp) movq %rsp,%r11 .LSEH_begin_sqrx_mont_sparse_256: movq %rcx,%rdi movq %rdx,%rsi movq %r8,%rdx movq %r9,%rcx sqr_mont_sparse_256$1: pushq %rbp pushq %rbx pushq %r12 pushq %r13 pushq %r14 pushq %r15 subq $8,%rsp .LSEH_body_sqrx_mont_sparse_256: movq %rsi,%rbx movq %rcx,%r8 movq %rdx,%rcx #ifdef __SGX_LVI_HARDENING__ lfence #endif movq 0(%rsi),%rdx movq 8(%rsi),%r15 movq 
16(%rsi),%rbp movq 24(%rsi),%r9 leaq -128(%rbx),%rsi leaq -128(%rcx),%rcx mulxq %rdx,%rax,%r11 call __mulx_mont_sparse_256 movq 8(%rsp),%r15 movq 16(%rsp),%r14 movq 24(%rsp),%r13 movq 32(%rsp),%r12 movq 40(%rsp),%rbx movq 48(%rsp),%rbp leaq 56(%rsp),%rsp .LSEH_epilogue_sqrx_mont_sparse_256: mov 8(%rsp),%rdi mov 16(%rsp),%rsi #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .LSEH_end_sqrx_mont_sparse_256: .def __mulx_mont_sparse_256; .scl 3; .type 32; .endef .p2align 5 __mulx_mont_sparse_256: .byte 0xf3,0x0f,0x1e,0xfa mulxq %r15,%r15,%r12 mulxq %rbp,%rbp,%r13 addq %r15,%r11 mulxq %r9,%r9,%r14 movq 8(%rbx),%rdx adcq %rbp,%r12 adcq %r9,%r13 adcq $0,%r14 movq %rax,%r10 imulq %r8,%rax xorq %r15,%r15 mulxq 0+128(%rsi),%rbp,%r9 adoxq %rbp,%r11 adcxq %r9,%r12 mulxq 8+128(%rsi),%rbp,%r9 adoxq %rbp,%r12 adcxq %r9,%r13 mulxq 16+128(%rsi),%rbp,%r9 adoxq %rbp,%r13 adcxq %r9,%r14 mulxq 24+128(%rsi),%rbp,%r9 movq %rax,%rdx adoxq %rbp,%r14 adcxq %r15,%r9 adoxq %r9,%r15 mulxq 0+128(%rcx),%rbp,%rax adcxq %rbp,%r10 adoxq %r11,%rax mulxq 8+128(%rcx),%rbp,%r9 adcxq %rbp,%rax adoxq %r9,%r12 mulxq 16+128(%rcx),%rbp,%r9 adcxq %rbp,%r12 adoxq %r9,%r13 mulxq 24+128(%rcx),%rbp,%r9 movq 16(%rbx),%rdx adcxq %rbp,%r13 adoxq %r9,%r14 adcxq %r10,%r14 adoxq %r10,%r15 adcxq %r10,%r15 adoxq %r10,%r10 adcq $0,%r10 movq %rax,%r11 imulq %r8,%rax xorq %rbp,%rbp mulxq 0+128(%rsi),%rbp,%r9 adoxq %rbp,%r12 adcxq %r9,%r13 mulxq 8+128(%rsi),%rbp,%r9 adoxq %rbp,%r13 adcxq %r9,%r14 mulxq 16+128(%rsi),%rbp,%r9 adoxq %rbp,%r14 adcxq %r9,%r15 mulxq 24+128(%rsi),%rbp,%r9 movq %rax,%rdx adoxq %rbp,%r15 adcxq %r10,%r9 adoxq %r9,%r10 mulxq 0+128(%rcx),%rbp,%rax adcxq %rbp,%r11 adoxq %r12,%rax mulxq 8+128(%rcx),%rbp,%r9 adcxq %rbp,%rax adoxq %r9,%r13 mulxq 16+128(%rcx),%rbp,%r9 adcxq %rbp,%r13 adoxq %r9,%r14 mulxq 24+128(%rcx),%rbp,%r9 movq 24(%rbx),%rdx adcxq %rbp,%r14 adoxq %r9,%r15 adcxq %r11,%r15 adoxq %r11,%r10 adcxq %r11,%r10 adoxq %r11,%r11 adcq $0,%r11 movq %rax,%r12 imulq %r8,%rax xorq %rbp,%rbp mulxq 0+128(%rsi),%rbp,%r9 adoxq %rbp,%r13 adcxq %r9,%r14 mulxq 8+128(%rsi),%rbp,%r9 adoxq %rbp,%r14 adcxq %r9,%r15 mulxq 16+128(%rsi),%rbp,%r9 adoxq %rbp,%r15 adcxq %r9,%r10 mulxq 24+128(%rsi),%rbp,%r9 movq %rax,%rdx adoxq %rbp,%r10 adcxq %r11,%r9 adoxq %r9,%r11 mulxq 0+128(%rcx),%rbp,%rax adcxq %rbp,%r12 adoxq %r13,%rax mulxq 8+128(%rcx),%rbp,%r9 adcxq %rbp,%rax adoxq %r9,%r14 mulxq 16+128(%rcx),%rbp,%r9 adcxq %rbp,%r14 adoxq %r9,%r15 mulxq 24+128(%rcx),%rbp,%r9 movq %rax,%rdx adcxq %rbp,%r15 adoxq %r9,%r10 adcxq %r12,%r10 adoxq %r12,%r11 adcxq %r12,%r11 adoxq %r12,%r12 adcq $0,%r12 imulq %r8,%rdx xorq %rbp,%rbp mulxq 0+128(%rcx),%r13,%r9 adcxq %rax,%r13 adoxq %r9,%r14 mulxq 8+128(%rcx),%rbp,%r9 adcxq %rbp,%r14 adoxq %r9,%r15 mulxq 16+128(%rcx),%rbp,%r9 adcxq %rbp,%r15 adoxq %r9,%r10 mulxq 24+128(%rcx),%rbp,%r9 movq %r14,%rdx leaq 128(%rcx),%rcx adcxq %rbp,%r10 adoxq %r9,%r11 movq %r15,%rax adcxq %r13,%r11 adoxq %r13,%r12 adcq $0,%r12 movq %r10,%rbp subq 0(%rcx),%r14 sbbq 8(%rcx),%r15 sbbq 16(%rcx),%r10 movq %r11,%r9 sbbq 24(%rcx),%r11 sbbq $0,%r12 cmovcq %rdx,%r14 cmovcq %rax,%r15 cmovcq %rbp,%r10 movq %r14,0(%rdi) cmovcq %r9,%r11 movq %r15,8(%rdi) movq %r10,16(%rdi) movq %r11,24(%rdi) #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .globl fromx_mont_256 .def fromx_mont_256; .scl 2; .type 32; .endef .p2align 5 fromx_mont_256: .byte 0xf3,0x0f,0x1e,0xfa movq %rdi,8(%rsp) movq %rsi,16(%rsp) movq %rsp,%r11 .LSEH_begin_fromx_mont_256: movq %rcx,%rdi movq 
%rdx,%rsi movq %r8,%rdx movq %r9,%rcx from_mont_256$1: pushq %rbp pushq %rbx pushq %r12 pushq %r13 pushq %r14 pushq %r15 subq $8,%rsp .LSEH_body_fromx_mont_256: movq %rdx,%rbx call __mulx_by_1_mont_256 movq %r15,%rdx movq %r10,%r12 movq %r11,%r13 subq 0(%rbx),%r14 sbbq 8(%rbx),%r15 sbbq 16(%rbx),%r10 sbbq 24(%rbx),%r11 cmovncq %r14,%rax cmovncq %r15,%rdx cmovncq %r10,%r12 movq %rax,0(%rdi) cmovncq %r11,%r13 movq %rdx,8(%rdi) movq %r12,16(%rdi) movq %r13,24(%rdi) movq 8(%rsp),%r15 movq 16(%rsp),%r14 movq 24(%rsp),%r13 movq 32(%rsp),%r12 movq 40(%rsp),%rbx movq 48(%rsp),%rbp leaq 56(%rsp),%rsp .LSEH_epilogue_fromx_mont_256: mov 8(%rsp),%rdi mov 16(%rsp),%rsi #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .LSEH_end_fromx_mont_256: .globl redcx_mont_256 .def redcx_mont_256; .scl 2; .type 32; .endef .p2align 5 redcx_mont_256: .byte 0xf3,0x0f,0x1e,0xfa movq %rdi,8(%rsp) movq %rsi,16(%rsp) movq %rsp,%r11 .LSEH_begin_redcx_mont_256: movq %rcx,%rdi movq %rdx,%rsi movq %r8,%rdx movq %r9,%rcx redc_mont_256$1: pushq %rbp pushq %rbx pushq %r12 pushq %r13 pushq %r14 pushq %r15 subq $8,%rsp .LSEH_body_redcx_mont_256: movq %rdx,%rbx call __mulx_by_1_mont_256 addq 32(%rsi),%r14 adcq 40(%rsi),%r15 movq %r14,%rax adcq 48(%rsi),%r10 movq %r15,%rdx adcq 56(%rsi),%r11 sbbq %rsi,%rsi movq %r10,%r12 subq 0(%rbx),%r14 sbbq 8(%rbx),%r15 sbbq 16(%rbx),%r10 movq %r11,%r13 sbbq 24(%rbx),%r11 sbbq $0,%rsi cmovncq %r14,%rax cmovncq %r15,%rdx cmovncq %r10,%r12 movq %rax,0(%rdi) cmovncq %r11,%r13 movq %rdx,8(%rdi) movq %r12,16(%rdi) movq %r13,24(%rdi) movq 8(%rsp),%r15 movq 16(%rsp),%r14 movq 24(%rsp),%r13 movq 32(%rsp),%r12 movq 40(%rsp),%rbx movq 48(%rsp),%rbp leaq 56(%rsp),%rsp .LSEH_epilogue_redcx_mont_256: mov 8(%rsp),%rdi mov 16(%rsp),%rsi #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .LSEH_end_redcx_mont_256: .def __mulx_by_1_mont_256; .scl 3; .type 32; .endef .p2align 5 __mulx_by_1_mont_256: .byte 0xf3,0x0f,0x1e,0xfa #ifdef __SGX_LVI_HARDENING__ lfence #endif movq 0(%rsi),%rax movq 8(%rsi),%r11 movq 16(%rsi),%r12 movq 24(%rsi),%r13 movq %rax,%r14 imulq %rcx,%rax movq %rax,%r10 mulq 0(%rbx) addq %rax,%r14 movq %r10,%rax adcq %rdx,%r14 mulq 8(%rbx) addq %rax,%r11 movq %r10,%rax adcq $0,%rdx addq %r14,%r11 adcq $0,%rdx movq %rdx,%r14 mulq 16(%rbx) movq %r11,%r15 imulq %rcx,%r11 addq %rax,%r12 movq %r10,%rax adcq $0,%rdx addq %r14,%r12 adcq $0,%rdx movq %rdx,%r14 mulq 24(%rbx) addq %rax,%r13 movq %r11,%rax adcq $0,%rdx addq %r14,%r13 adcq $0,%rdx movq %rdx,%r14 mulq 0(%rbx) addq %rax,%r15 movq %r11,%rax adcq %rdx,%r15 mulq 8(%rbx) addq %rax,%r12 movq %r11,%rax adcq $0,%rdx addq %r15,%r12 adcq $0,%rdx movq %rdx,%r15 mulq 16(%rbx) movq %r12,%r10 imulq %rcx,%r12 addq %rax,%r13 movq %r11,%rax adcq $0,%rdx addq %r15,%r13 adcq $0,%rdx movq %rdx,%r15 mulq 24(%rbx) addq %rax,%r14 movq %r12,%rax adcq $0,%rdx addq %r15,%r14 adcq $0,%rdx movq %rdx,%r15 mulq 0(%rbx) addq %rax,%r10 movq %r12,%rax adcq %rdx,%r10 mulq 8(%rbx) addq %rax,%r13 movq %r12,%rax adcq $0,%rdx addq %r10,%r13 adcq $0,%rdx movq %rdx,%r10 mulq 16(%rbx) movq %r13,%r11 imulq %rcx,%r13 addq %rax,%r14 movq %r12,%rax adcq $0,%rdx addq %r10,%r14 adcq $0,%rdx movq %rdx,%r10 mulq 24(%rbx) addq %rax,%r15 movq %r13,%rax adcq $0,%rdx addq %r10,%r15 adcq $0,%rdx movq %rdx,%r10 mulq 0(%rbx) addq %rax,%r11 movq %r13,%rax adcq %rdx,%r11 mulq 8(%rbx) addq %rax,%r14 movq %r13,%rax adcq $0,%rdx addq %r11,%r14 adcq $0,%rdx movq %rdx,%r11 mulq 16(%rbx) addq %rax,%r15 movq %r13,%rax adcq $0,%rdx 
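/* __mulx_by_1_mont_256 performs four single-limb Montgomery reduction
   steps: each computes m = t0*n0 mod 2^64 (n0 is in %rcx, the modulus
   at (%rbx)) and accumulates m*modulus, shifting one zeroed limb out
   of the accumulator. */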
addq %r11,%r15 adcq $0,%rdx movq %rdx,%r11 mulq 24(%rbx) addq %rax,%r10 movq %r14,%rax adcq $0,%rdx addq %r11,%r10 adcq $0,%rdx movq %rdx,%r11 #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .section .pdata .p2align 2 .rva .LSEH_begin_mulx_mont_sparse_256 .rva .LSEH_body_mulx_mont_sparse_256 .rva .LSEH_info_mulx_mont_sparse_256_prologue .rva .LSEH_body_mulx_mont_sparse_256 .rva .LSEH_epilogue_mulx_mont_sparse_256 .rva .LSEH_info_mulx_mont_sparse_256_body .rva .LSEH_epilogue_mulx_mont_sparse_256 .rva .LSEH_end_mulx_mont_sparse_256 .rva .LSEH_info_mulx_mont_sparse_256_epilogue .rva .LSEH_begin_sqrx_mont_sparse_256 .rva .LSEH_body_sqrx_mont_sparse_256 .rva .LSEH_info_sqrx_mont_sparse_256_prologue .rva .LSEH_body_sqrx_mont_sparse_256 .rva .LSEH_epilogue_sqrx_mont_sparse_256 .rva .LSEH_info_sqrx_mont_sparse_256_body .rva .LSEH_epilogue_sqrx_mont_sparse_256 .rva .LSEH_end_sqrx_mont_sparse_256 .rva .LSEH_info_sqrx_mont_sparse_256_epilogue .rva .LSEH_begin_fromx_mont_256 .rva .LSEH_body_fromx_mont_256 .rva .LSEH_info_fromx_mont_256_prologue .rva .LSEH_body_fromx_mont_256 .rva .LSEH_epilogue_fromx_mont_256 .rva .LSEH_info_fromx_mont_256_body .rva .LSEH_epilogue_fromx_mont_256 .rva .LSEH_end_fromx_mont_256 .rva .LSEH_info_fromx_mont_256_epilogue .rva .LSEH_begin_redcx_mont_256 .rva .LSEH_body_redcx_mont_256 .rva .LSEH_info_redcx_mont_256_prologue .rva .LSEH_body_redcx_mont_256 .rva .LSEH_epilogue_redcx_mont_256 .rva .LSEH_info_redcx_mont_256_body .rva .LSEH_epilogue_redcx_mont_256 .rva .LSEH_end_redcx_mont_256 .rva .LSEH_info_redcx_mont_256_epilogue .section .xdata .p2align 3 .LSEH_info_mulx_mont_sparse_256_prologue: .byte 1,0,5,0x0b .byte 0,0x74,1,0 .byte 0,0x64,2,0 .byte 0,0xb3 .byte 0,0 .long 0,0 .LSEH_info_mulx_mont_sparse_256_body: .byte 1,0,17,0 .byte 0x00,0xf4,0x01,0x00 .byte 0x00,0xe4,0x02,0x00 .byte 0x00,0xd4,0x03,0x00 .byte 0x00,0xc4,0x04,0x00 .byte 0x00,0x34,0x05,0x00 .byte 0x00,0x54,0x06,0x00 .byte 0x00,0x74,0x08,0x00 .byte 0x00,0x64,0x09,0x00 .byte 0x00,0x62 .byte 0x00,0x00,0x00,0x00,0x00,0x00 .byte 0x00,0x00,0x00,0x00 .LSEH_info_mulx_mont_sparse_256_epilogue: .byte 1,0,4,0 .byte 0x00,0x74,0x01,0x00 .byte 0x00,0x64,0x02,0x00 .byte 0x00,0x00,0x00,0x00 .LSEH_info_sqrx_mont_sparse_256_prologue: .byte 1,0,5,0x0b .byte 0,0x74,1,0 .byte 0,0x64,2,0 .byte 0,0xb3 .byte 0,0 .long 0,0 .LSEH_info_sqrx_mont_sparse_256_body: .byte 1,0,17,0 .byte 0x00,0xf4,0x01,0x00 .byte 0x00,0xe4,0x02,0x00 .byte 0x00,0xd4,0x03,0x00 .byte 0x00,0xc4,0x04,0x00 .byte 0x00,0x34,0x05,0x00 .byte 0x00,0x54,0x06,0x00 .byte 0x00,0x74,0x08,0x00 .byte 0x00,0x64,0x09,0x00 .byte 0x00,0x62 .byte 0x00,0x00,0x00,0x00,0x00,0x00 .byte 0x00,0x00,0x00,0x00 .LSEH_info_sqrx_mont_sparse_256_epilogue: .byte 1,0,4,0 .byte 0x00,0x74,0x01,0x00 .byte 0x00,0x64,0x02,0x00 .byte 0x00,0x00,0x00,0x00 .LSEH_info_fromx_mont_256_prologue: .byte 1,0,5,0x0b .byte 0,0x74,1,0 .byte 0,0x64,2,0 .byte 0,0xb3 .byte 0,0 .long 0,0 .LSEH_info_fromx_mont_256_body: .byte 1,0,17,0 .byte 0x00,0xf4,0x01,0x00 .byte 0x00,0xe4,0x02,0x00 .byte 0x00,0xd4,0x03,0x00 .byte 0x00,0xc4,0x04,0x00 .byte 0x00,0x34,0x05,0x00 .byte 0x00,0x54,0x06,0x00 .byte 0x00,0x74,0x08,0x00 .byte 0x00,0x64,0x09,0x00 .byte 0x00,0x62 .byte 0x00,0x00,0x00,0x00,0x00,0x00 .byte 0x00,0x00,0x00,0x00 .LSEH_info_fromx_mont_256_epilogue: .byte 1,0,4,0 .byte 0x00,0x74,0x01,0x00 .byte 0x00,0x64,0x02,0x00 .byte 0x00,0x00,0x00,0x00 .LSEH_info_redcx_mont_256_prologue: .byte 1,0,5,0x0b .byte 0,0x74,1,0 .byte 0,0x64,2,0 .byte 0,0xb3 .byte 0,0 .long 0,0 
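/* Each UNWIND_INFO "body" record below tells the Windows unwinder
   where the prologue saved the non-volatile registers (%rbp, %rbx,
   %r12-%r15) and how much stack the function allocated. */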
.LSEH_info_redcx_mont_256_body: .byte 1,0,17,0 .byte 0x00,0xf4,0x01,0x00 .byte 0x00,0xe4,0x02,0x00 .byte 0x00,0xd4,0x03,0x00 .byte 0x00,0xc4,0x04,0x00 .byte 0x00,0x34,0x05,0x00 .byte 0x00,0x54,0x06,0x00 .byte 0x00,0x74,0x08,0x00 .byte 0x00,0x64,0x09,0x00 .byte 0x00,0x62 .byte 0x00,0x00,0x00,0x00,0x00,0x00 .byte 0x00,0x00,0x00,0x00 .LSEH_info_redcx_mont_256_epilogue: .byte 1,0,4,0 .byte 0x00,0x74,0x01,0x00 .byte 0x00,0x64,0x02,0x00 .byte 0x00,0x00,0x00,0x00 ================================================ FILE: build/coff/mulx_mont_384-x86_64.s ================================================ .text .def __subx_mod_384x384; .scl 3; .type 32; .endef .p2align 5 __subx_mod_384x384: .byte 0xf3,0x0f,0x1e,0xfa movq 0(%rsi),%r8 movq 8(%rsi),%r9 movq 16(%rsi),%r10 movq 24(%rsi),%r11 movq 32(%rsi),%r12 movq 40(%rsi),%r13 movq 48(%rsi),%r14 subq 0(%rdx),%r8 movq 56(%rsi),%r15 sbbq 8(%rdx),%r9 movq 64(%rsi),%rax sbbq 16(%rdx),%r10 movq 72(%rsi),%rbx sbbq 24(%rdx),%r11 movq 80(%rsi),%rbp sbbq 32(%rdx),%r12 movq 88(%rsi),%rsi sbbq 40(%rdx),%r13 movq %r8,0(%rdi) sbbq 48(%rdx),%r14 movq 0(%rcx),%r8 movq %r9,8(%rdi) sbbq 56(%rdx),%r15 movq 8(%rcx),%r9 movq %r10,16(%rdi) sbbq 64(%rdx),%rax movq 16(%rcx),%r10 movq %r11,24(%rdi) sbbq 72(%rdx),%rbx movq 24(%rcx),%r11 movq %r12,32(%rdi) sbbq 80(%rdx),%rbp movq 32(%rcx),%r12 movq %r13,40(%rdi) sbbq 88(%rdx),%rsi movq 40(%rcx),%r13 sbbq %rdx,%rdx andq %rdx,%r8 andq %rdx,%r9 andq %rdx,%r10 andq %rdx,%r11 andq %rdx,%r12 andq %rdx,%r13 addq %r8,%r14 adcq %r9,%r15 movq %r14,48(%rdi) adcq %r10,%rax movq %r15,56(%rdi) adcq %r11,%rbx movq %rax,64(%rdi) adcq %r12,%rbp movq %rbx,72(%rdi) adcq %r13,%rsi movq %rbp,80(%rdi) movq %rsi,88(%rdi) #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .def __addx_mod_384; .scl 3; .type 32; .endef .p2align 5 __addx_mod_384: .byte 0xf3,0x0f,0x1e,0xfa #ifdef __SGX_LVI_HARDENING__ lfence #endif movq 0(%rsi),%r8 movq 8(%rsi),%r9 movq 16(%rsi),%r10 movq 24(%rsi),%r11 movq 32(%rsi),%r12 movq 40(%rsi),%r13 addq 0(%rdx),%r8 adcq 8(%rdx),%r9 adcq 16(%rdx),%r10 movq %r8,%r14 adcq 24(%rdx),%r11 movq %r9,%r15 adcq 32(%rdx),%r12 movq %r10,%rax adcq 40(%rdx),%r13 movq %r11,%rbx sbbq %rdx,%rdx subq 0(%rcx),%r8 sbbq 8(%rcx),%r9 movq %r12,%rbp sbbq 16(%rcx),%r10 sbbq 24(%rcx),%r11 sbbq 32(%rcx),%r12 movq %r13,%rsi sbbq 40(%rcx),%r13 sbbq $0,%rdx cmovcq %r14,%r8 cmovcq %r15,%r9 cmovcq %rax,%r10 movq %r8,0(%rdi) cmovcq %rbx,%r11 movq %r9,8(%rdi) cmovcq %rbp,%r12 movq %r10,16(%rdi) cmovcq %rsi,%r13 movq %r11,24(%rdi) movq %r12,32(%rdi) movq %r13,40(%rdi) #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .def __subx_mod_384; .scl 3; .type 32; .endef .p2align 5 __subx_mod_384: .byte 0xf3,0x0f,0x1e,0xfa #ifdef __SGX_LVI_HARDENING__ lfence #endif movq 0(%rsi),%r8 movq 8(%rsi),%r9 movq 16(%rsi),%r10 movq 24(%rsi),%r11 movq 32(%rsi),%r12 movq 40(%rsi),%r13 __subx_mod_384_a_is_loaded: subq 0(%rdx),%r8 movq 0(%rcx),%r14 sbbq 8(%rdx),%r9 movq 8(%rcx),%r15 sbbq 16(%rdx),%r10 movq 16(%rcx),%rax sbbq 24(%rdx),%r11 movq 24(%rcx),%rbx sbbq 32(%rdx),%r12 movq 32(%rcx),%rbp sbbq 40(%rdx),%r13 movq 40(%rcx),%rsi sbbq %rdx,%rdx andq %rdx,%r14 andq %rdx,%r15 andq %rdx,%rax andq %rdx,%rbx andq %rdx,%rbp andq %rdx,%rsi addq %r14,%r8 adcq %r15,%r9 movq %r8,0(%rdi) adcq %rax,%r10 movq %r9,8(%rdi) adcq %rbx,%r11 movq %r10,16(%rdi) adcq %rbp,%r12 movq %r11,24(%rdi) adcq %rsi,%r13 movq %r12,32(%rdi) movq %r13,40(%rdi) #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 
#endif .globl mulx_mont_384x .def mulx_mont_384x; .scl 2; .type 32; .endef .p2align 5 mulx_mont_384x: .byte 0xf3,0x0f,0x1e,0xfa movq %rdi,8(%rsp) movq %rsi,16(%rsp) movq %rsp,%r11 .LSEH_begin_mulx_mont_384x: movq %rcx,%rdi movq %rdx,%rsi movq %r8,%rdx movq %r9,%rcx movq 40(%rsp),%r8 mul_mont_384x$1: pushq %rbp pushq %rbx pushq %r12 pushq %r13 pushq %r14 pushq %r15 subq $328,%rsp .LSEH_body_mulx_mont_384x: movq %rdx,%rbx movq %rdi,32(%rsp) movq %rsi,24(%rsp) movq %rdx,16(%rsp) movq %rcx,8(%rsp) movq %r8,0(%rsp) leaq 40(%rsp),%rdi #ifdef __SGX_LVI_HARDENING__ lfence #endif call __mulx_384 leaq 48(%rbx),%rbx leaq 128+48(%rsi),%rsi leaq 96(%rdi),%rdi call __mulx_384 movq 8(%rsp),%rcx leaq (%rbx),%rsi leaq -48(%rbx),%rdx leaq 40+192+48(%rsp),%rdi #ifdef __SGX_LVI_HARDENING__ lfence #endif call __addx_mod_384 movq 24(%rsp),%rsi leaq 48(%rsi),%rdx leaq -48(%rdi),%rdi #ifdef __SGX_LVI_HARDENING__ lfence #endif call __addx_mod_384 leaq (%rdi),%rbx leaq 48(%rdi),%rsi call __mulx_384 leaq (%rdi),%rsi leaq 40(%rsp),%rdx movq 8(%rsp),%rcx #ifdef __SGX_LVI_HARDENING__ lfence #endif call __subx_mod_384x384 leaq (%rdi),%rsi leaq -96(%rdi),%rdx call __subx_mod_384x384 leaq 40(%rsp),%rsi leaq 40+96(%rsp),%rdx leaq 40(%rsp),%rdi call __subx_mod_384x384 leaq (%rcx),%rbx leaq 40(%rsp),%rsi movq 0(%rsp),%rcx movq 32(%rsp),%rdi call __mulx_by_1_mont_384 call __redx_tail_mont_384 leaq 40+192(%rsp),%rsi movq 0(%rsp),%rcx leaq 48(%rdi),%rdi call __mulx_by_1_mont_384 call __redx_tail_mont_384 leaq 328(%rsp),%r8 movq 0(%r8),%r15 movq 8(%r8),%r14 movq 16(%r8),%r13 movq 24(%r8),%r12 movq 32(%r8),%rbx movq 40(%r8),%rbp leaq 48(%r8),%rsp .LSEH_epilogue_mulx_mont_384x: mov 8(%rsp),%rdi mov 16(%rsp),%rsi #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .LSEH_end_mulx_mont_384x: .globl sqrx_mont_384x .def sqrx_mont_384x; .scl 2; .type 32; .endef .p2align 5 sqrx_mont_384x: .byte 0xf3,0x0f,0x1e,0xfa movq %rdi,8(%rsp) movq %rsi,16(%rsp) movq %rsp,%r11 .LSEH_begin_sqrx_mont_384x: movq %rcx,%rdi movq %rdx,%rsi movq %r8,%rdx movq %r9,%rcx sqr_mont_384x$1: pushq %rbp pushq %rbx pushq %r12 pushq %r13 pushq %r14 pushq %r15 subq $136,%rsp .LSEH_body_sqrx_mont_384x: movq %rcx,0(%rsp) movq %rdx,%rcx movq %rdi,16(%rsp) movq %rsi,24(%rsp) leaq 48(%rsi),%rdx leaq 32(%rsp),%rdi call __addx_mod_384 movq 24(%rsp),%rsi leaq 48(%rsi),%rdx leaq 32+48(%rsp),%rdi call __subx_mod_384 movq 24(%rsp),%rsi leaq 48(%rsi),%rbx #ifdef __SGX_LVI_HARDENING__ lfence #endif movq 48(%rsi),%rdx movq 0(%rsi),%r14 movq 8(%rsi),%r15 movq 16(%rsi),%rax movq 24(%rsi),%r12 movq 32(%rsi),%rdi movq 40(%rsi),%rbp leaq -128(%rsi),%rsi leaq -128(%rcx),%rcx mulxq %r14,%r8,%r9 call __mulx_mont_384 addq %rdx,%rdx adcq %r15,%r15 adcq %rax,%rax movq %rdx,%r8 adcq %r12,%r12 movq %r15,%r9 adcq %rdi,%rdi movq %rax,%r10 adcq %rbp,%rbp movq %r12,%r11 sbbq %rsi,%rsi subq 0(%rcx),%rdx sbbq 8(%rcx),%r15 movq %rdi,%r13 sbbq 16(%rcx),%rax sbbq 24(%rcx),%r12 sbbq 32(%rcx),%rdi movq %rbp,%r14 sbbq 40(%rcx),%rbp sbbq $0,%rsi cmovcq %r8,%rdx cmovcq %r9,%r15 cmovcq %r10,%rax movq %rdx,48(%rbx) cmovcq %r11,%r12 movq %r15,56(%rbx) cmovcq %r13,%rdi movq %rax,64(%rbx) cmovcq %r14,%rbp movq %r12,72(%rbx) movq %rdi,80(%rbx) movq %rbp,88(%rbx) leaq 32(%rsp),%rsi leaq 32+48(%rsp),%rbx movq 32+48(%rsp),%rdx movq 32+0(%rsp),%r14 movq 32+8(%rsp),%r15 movq 32+16(%rsp),%rax movq 32+24(%rsp),%r12 movq 32+32(%rsp),%rdi movq 32+40(%rsp),%rbp leaq -128(%rsi),%rsi leaq -128(%rcx),%rcx mulxq %r14,%r8,%r9 call __mulx_mont_384 leaq 136(%rsp),%r8 movq 0(%r8),%r15 movq 
8(%r8),%r14 movq 16(%r8),%r13 movq 24(%r8),%r12 movq 32(%r8),%rbx movq 40(%r8),%rbp leaq 48(%r8),%rsp .LSEH_epilogue_sqrx_mont_384x: mov 8(%rsp),%rdi mov 16(%rsp),%rsi #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .LSEH_end_sqrx_mont_384x: .globl mulx_382x .def mulx_382x; .scl 2; .type 32; .endef .p2align 5 mulx_382x: .byte 0xf3,0x0f,0x1e,0xfa movq %rdi,8(%rsp) movq %rsi,16(%rsp) movq %rsp,%r11 .LSEH_begin_mulx_382x: movq %rcx,%rdi movq %rdx,%rsi movq %r8,%rdx movq %r9,%rcx mul_382x$1: pushq %rbp pushq %rbx pushq %r12 pushq %r13 pushq %r14 pushq %r15 subq $136,%rsp .LSEH_body_mulx_382x: leaq 96(%rdi),%rdi movq %rsi,0(%rsp) movq %rdx,8(%rsp) movq %rdi,16(%rsp) movq %rcx,24(%rsp) #ifdef __SGX_LVI_HARDENING__ lfence #endif movq 0(%rsi),%r8 movq 8(%rsi),%r9 movq 16(%rsi),%r10 movq 24(%rsi),%r11 movq 32(%rsi),%r12 movq 40(%rsi),%r13 addq 48(%rsi),%r8 adcq 56(%rsi),%r9 adcq 64(%rsi),%r10 adcq 72(%rsi),%r11 adcq 80(%rsi),%r12 adcq 88(%rsi),%r13 movq %r8,32+0(%rsp) movq %r9,32+8(%rsp) movq %r10,32+16(%rsp) movq %r11,32+24(%rsp) movq %r12,32+32(%rsp) movq %r13,32+40(%rsp) movq 0(%rdx),%r8 movq 8(%rdx),%r9 movq 16(%rdx),%r10 movq 24(%rdx),%r11 movq 32(%rdx),%r12 movq 40(%rdx),%r13 addq 48(%rdx),%r8 adcq 56(%rdx),%r9 adcq 64(%rdx),%r10 adcq 72(%rdx),%r11 adcq 80(%rdx),%r12 adcq 88(%rdx),%r13 movq %r8,32+48(%rsp) movq %r9,32+56(%rsp) movq %r10,32+64(%rsp) movq %r11,32+72(%rsp) movq %r12,32+80(%rsp) movq %r13,32+88(%rsp) leaq 32+0(%rsp),%rsi leaq 32+48(%rsp),%rbx call __mulx_384 movq 0(%rsp),%rsi movq 8(%rsp),%rbx leaq -96(%rdi),%rdi #ifdef __SGX_LVI_HARDENING__ lfence #endif call __mulx_384 leaq 48+128(%rsi),%rsi leaq 48(%rbx),%rbx leaq 32(%rsp),%rdi call __mulx_384 movq 16(%rsp),%rsi leaq 32(%rsp),%rdx movq 24(%rsp),%rcx movq %rsi,%rdi #ifdef __SGX_LVI_HARDENING__ lfence #endif call __subx_mod_384x384 leaq 0(%rdi),%rsi leaq -96(%rdi),%rdx call __subx_mod_384x384 leaq -96(%rdi),%rsi leaq 32(%rsp),%rdx leaq -96(%rdi),%rdi call __subx_mod_384x384 leaq 136(%rsp),%r8 movq 0(%r8),%r15 movq 8(%r8),%r14 movq 16(%r8),%r13 movq 24(%r8),%r12 movq 32(%r8),%rbx movq 40(%r8),%rbp leaq 48(%r8),%rsp .LSEH_epilogue_mulx_382x: mov 8(%rsp),%rdi mov 16(%rsp),%rsi #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .LSEH_end_mulx_382x: .globl sqrx_382x .def sqrx_382x; .scl 2; .type 32; .endef .p2align 5 sqrx_382x: .byte 0xf3,0x0f,0x1e,0xfa movq %rdi,8(%rsp) movq %rsi,16(%rsp) movq %rsp,%r11 .LSEH_begin_sqrx_382x: movq %rcx,%rdi movq %rdx,%rsi movq %r8,%rdx sqr_382x$1: pushq %rbp pushq %rbx pushq %r12 pushq %r13 pushq %r14 pushq %r15 pushq %rsi .LSEH_body_sqrx_382x: movq %rdx,%rcx #ifdef __SGX_LVI_HARDENING__ lfence #endif movq 0(%rsi),%r14 movq 8(%rsi),%r15 movq 16(%rsi),%rax movq 24(%rsi),%rbx movq 32(%rsi),%rbp movq 40(%rsi),%rdx movq %r14,%r8 addq 48(%rsi),%r14 movq %r15,%r9 adcq 56(%rsi),%r15 movq %rax,%r10 adcq 64(%rsi),%rax movq %rbx,%r11 adcq 72(%rsi),%rbx movq %rbp,%r12 adcq 80(%rsi),%rbp movq %rdx,%r13 adcq 88(%rsi),%rdx movq %r14,0(%rdi) movq %r15,8(%rdi) movq %rax,16(%rdi) movq %rbx,24(%rdi) movq %rbp,32(%rdi) movq %rdx,40(%rdi) leaq 48(%rsi),%rdx leaq 48(%rdi),%rdi call __subx_mod_384_a_is_loaded leaq (%rdi),%rsi leaq -48(%rdi),%rbx leaq -48(%rdi),%rdi call __mulx_384 movq (%rsp),%rsi leaq 48(%rsi),%rbx leaq 96(%rdi),%rdi #ifdef __SGX_LVI_HARDENING__ lfence #endif call __mulx_384 movq 0(%rdi),%r8 movq 8(%rdi),%r9 movq 16(%rdi),%r10 movq 24(%rdi),%r11 movq 32(%rdi),%r12 movq 40(%rdi),%r13 movq 48(%rdi),%r14 movq 56(%rdi),%r15 
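/* The twelve limbs being reloaded are the 768-bit product a0*a1 just
   produced by __mulx_384; the add/adc chain below doubles it in place,
   yielding the non-reduced imaginary part 2*a0*a1 of the Fp2 square. */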
movq 64(%rdi),%rax movq 72(%rdi),%rbx movq 80(%rdi),%rbp addq %r8,%r8 movq 88(%rdi),%rdx adcq %r9,%r9 movq %r8,0(%rdi) adcq %r10,%r10 movq %r9,8(%rdi) adcq %r11,%r11 movq %r10,16(%rdi) adcq %r12,%r12 movq %r11,24(%rdi) adcq %r13,%r13 movq %r12,32(%rdi) adcq %r14,%r14 movq %r13,40(%rdi) adcq %r15,%r15 movq %r14,48(%rdi) adcq %rax,%rax movq %r15,56(%rdi) adcq %rbx,%rbx movq %rax,64(%rdi) adcq %rbp,%rbp movq %rbx,72(%rdi) adcq %rdx,%rdx movq %rbp,80(%rdi) movq %rdx,88(%rdi) movq 8(%rsp),%r15 movq 16(%rsp),%r14 movq 24(%rsp),%r13 movq 32(%rsp),%r12 movq 40(%rsp),%rbx movq 48(%rsp),%rbp leaq 56(%rsp),%rsp .LSEH_epilogue_sqrx_382x: mov 8(%rsp),%rdi mov 16(%rsp),%rsi #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .LSEH_end_sqrx_382x: .globl mulx_384 .def mulx_384; .scl 2; .type 32; .endef .p2align 5 mulx_384: .byte 0xf3,0x0f,0x1e,0xfa movq %rdi,8(%rsp) movq %rsi,16(%rsp) movq %rsp,%r11 .LSEH_begin_mulx_384: movq %rcx,%rdi movq %rdx,%rsi movq %r8,%rdx mul_384$1: pushq %rbp pushq %rbx pushq %r12 pushq %r13 pushq %r14 pushq %r15 .LSEH_body_mulx_384: movq %rdx,%rbx #ifdef __SGX_LVI_HARDENING__ lfence #endif call __mulx_384 movq 0(%rsp),%r15 movq 8(%rsp),%r14 movq 16(%rsp),%r13 movq 24(%rsp),%r12 movq 32(%rsp),%rbx movq 40(%rsp),%rbp leaq 48(%rsp),%rsp .LSEH_epilogue_mulx_384: mov 8(%rsp),%rdi mov 16(%rsp),%rsi #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .LSEH_end_mulx_384: .def __mulx_384; .scl 3; .type 32; .endef .p2align 5 __mulx_384: .byte 0xf3,0x0f,0x1e,0xfa movq 0(%rbx),%rdx movq 0(%rsi),%r14 movq 8(%rsi),%r15 movq 16(%rsi),%r10 movq 24(%rsi),%r11 movq 32(%rsi),%r12 movq 40(%rsi),%r13 leaq -128(%rsi),%rsi mulxq %r14,%r9,%rcx xorq %rbp,%rbp mulxq %r15,%r8,%rax adcxq %rcx,%r8 movq %r9,0(%rdi) mulxq %r10,%r9,%rcx adcxq %rax,%r9 mulxq %r11,%r10,%rax adcxq %rcx,%r10 mulxq %r12,%r11,%rcx adcxq %rax,%r11 mulxq %r13,%r12,%r13 movq 8(%rbx),%rdx adcxq %rcx,%r12 adcxq %rbp,%r13 mulxq %r14,%rax,%rcx adcxq %r8,%rax adoxq %rcx,%r9 movq %rax,8(%rdi) mulxq %r15,%r8,%rcx adcxq %r9,%r8 adoxq %rcx,%r10 mulxq 128+16(%rsi),%r9,%rax adcxq %r10,%r9 adoxq %rax,%r11 mulxq 128+24(%rsi),%r10,%rcx adcxq %r11,%r10 adoxq %rcx,%r12 mulxq 128+32(%rsi),%r11,%rax adcxq %r12,%r11 adoxq %r13,%rax mulxq 128+40(%rsi),%r12,%r13 movq 16(%rbx),%rdx adcxq %rax,%r12 adoxq %rbp,%r13 adcxq %rbp,%r13 mulxq %r14,%rax,%rcx adcxq %r8,%rax adoxq %rcx,%r9 movq %rax,16(%rdi) mulxq %r15,%r8,%rcx adcxq %r9,%r8 adoxq %rcx,%r10 mulxq 128+16(%rsi),%r9,%rax adcxq %r10,%r9 adoxq %rax,%r11 mulxq 128+24(%rsi),%r10,%rcx adcxq %r11,%r10 adoxq %rcx,%r12 mulxq 128+32(%rsi),%r11,%rax adcxq %r12,%r11 adoxq %r13,%rax mulxq 128+40(%rsi),%r12,%r13 movq 24(%rbx),%rdx adcxq %rax,%r12 adoxq %rbp,%r13 adcxq %rbp,%r13 mulxq %r14,%rax,%rcx adcxq %r8,%rax adoxq %rcx,%r9 movq %rax,24(%rdi) mulxq %r15,%r8,%rcx adcxq %r9,%r8 adoxq %rcx,%r10 mulxq 128+16(%rsi),%r9,%rax adcxq %r10,%r9 adoxq %rax,%r11 mulxq 128+24(%rsi),%r10,%rcx adcxq %r11,%r10 adoxq %rcx,%r12 mulxq 128+32(%rsi),%r11,%rax adcxq %r12,%r11 adoxq %r13,%rax mulxq 128+40(%rsi),%r12,%r13 movq 32(%rbx),%rdx adcxq %rax,%r12 adoxq %rbp,%r13 adcxq %rbp,%r13 mulxq %r14,%rax,%rcx adcxq %r8,%rax adoxq %rcx,%r9 movq %rax,32(%rdi) mulxq %r15,%r8,%rcx adcxq %r9,%r8 adoxq %rcx,%r10 mulxq 128+16(%rsi),%r9,%rax adcxq %r10,%r9 adoxq %rax,%r11 mulxq 128+24(%rsi),%r10,%rcx adcxq %r11,%r10 adoxq %rcx,%r12 mulxq 128+32(%rsi),%r11,%rax adcxq %r12,%r11 adoxq %r13,%rax mulxq 128+40(%rsi),%r12,%r13 movq 40(%rbx),%rdx adcxq %rax,%r12 adoxq %rbp,%r13 
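/* Each pass above consumed one 64-bit word of b, sweeping it across
   a[0..5] on two independent carry chains (adcx drives CF, adox drives
   OF) and retiring one word of the 768-bit product; %rdx already holds
   the last word, b[5], for the final pass. */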
adcxq %rbp,%r13 mulxq %r14,%rax,%rcx adcxq %r8,%rax adoxq %rcx,%r9 movq %rax,40(%rdi) mulxq %r15,%r8,%rcx adcxq %r9,%r8 adoxq %rcx,%r10 mulxq 128+16(%rsi),%r9,%rax adcxq %r10,%r9 adoxq %rax,%r11 mulxq 128+24(%rsi),%r10,%rcx adcxq %r11,%r10 adoxq %rcx,%r12 mulxq 128+32(%rsi),%r11,%rax adcxq %r12,%r11 adoxq %r13,%rax mulxq 128+40(%rsi),%r12,%r13 movq %rax,%rdx adcxq %rax,%r12 adoxq %rbp,%r13 adcxq %rbp,%r13 movq %r8,48(%rdi) movq %r9,56(%rdi) movq %r10,64(%rdi) movq %r11,72(%rdi) movq %r12,80(%rdi) movq %r13,88(%rdi) #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .globl sqrx_384 .def sqrx_384; .scl 2; .type 32; .endef .p2align 5 sqrx_384: .byte 0xf3,0x0f,0x1e,0xfa movq %rdi,8(%rsp) movq %rsi,16(%rsp) movq %rsp,%r11 .LSEH_begin_sqrx_384: movq %rcx,%rdi movq %rdx,%rsi sqr_384$1: pushq %rbp pushq %rbx pushq %r12 pushq %r13 pushq %r14 pushq %r15 pushq %rdi .LSEH_body_sqrx_384: #ifdef __SGX_LVI_HARDENING__ lfence #endif call __sqrx_384 movq 8(%rsp),%r15 movq 16(%rsp),%r14 movq 24(%rsp),%r13 movq 32(%rsp),%r12 movq 40(%rsp),%rbx movq 48(%rsp),%rbp leaq 56(%rsp),%rsp .LSEH_epilogue_sqrx_384: mov 8(%rsp),%rdi mov 16(%rsp),%rsi #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .LSEH_end_sqrx_384: .def __sqrx_384; .scl 3; .type 32; .endef .p2align 5 __sqrx_384: .byte 0xf3,0x0f,0x1e,0xfa movq 0(%rsi),%rdx movq 8(%rsi),%r14 movq 16(%rsi),%r15 movq 24(%rsi),%rcx movq 32(%rsi),%rbx mulxq %r14,%r8,%rdi movq 40(%rsi),%rbp mulxq %r15,%r9,%rax addq %rdi,%r9 mulxq %rcx,%r10,%rdi adcq %rax,%r10 mulxq %rbx,%r11,%rax adcq %rdi,%r11 mulxq %rbp,%r12,%r13 movq %r14,%rdx adcq %rax,%r12 adcq $0,%r13 xorq %r14,%r14 mulxq %r15,%rdi,%rax adcxq %rdi,%r10 adoxq %rax,%r11 mulxq %rcx,%rdi,%rax adcxq %rdi,%r11 adoxq %rax,%r12 mulxq %rbx,%rdi,%rax adcxq %rdi,%r12 adoxq %rax,%r13 mulxq %rbp,%rdi,%rax movq %r15,%rdx adcxq %rdi,%r13 adoxq %r14,%rax adcxq %rax,%r14 xorq %r15,%r15 mulxq %rcx,%rdi,%rax adcxq %rdi,%r12 adoxq %rax,%r13 mulxq %rbx,%rdi,%rax adcxq %rdi,%r13 adoxq %rax,%r14 mulxq %rbp,%rdi,%rax movq %rcx,%rdx adcxq %rdi,%r14 adoxq %r15,%rax adcxq %rax,%r15 xorq %rcx,%rcx mulxq %rbx,%rdi,%rax adcxq %rdi,%r14 adoxq %rax,%r15 mulxq %rbp,%rdi,%rax movq %rbx,%rdx adcxq %rdi,%r15 adoxq %rcx,%rax adcxq %rax,%rcx mulxq %rbp,%rdi,%rbx movq 0(%rsi),%rdx addq %rdi,%rcx movq 8(%rsp),%rdi adcq $0,%rbx xorq %rbp,%rbp adcxq %r8,%r8 adcxq %r9,%r9 adcxq %r10,%r10 adcxq %r11,%r11 adcxq %r12,%r12 mulxq %rdx,%rdx,%rax movq %rdx,0(%rdi) movq 8(%rsi),%rdx adoxq %rax,%r8 movq %r8,8(%rdi) mulxq %rdx,%r8,%rax movq 16(%rsi),%rdx adoxq %r8,%r9 adoxq %rax,%r10 movq %r9,16(%rdi) movq %r10,24(%rdi) mulxq %rdx,%r8,%r9 movq 24(%rsi),%rdx adoxq %r8,%r11 adoxq %r9,%r12 adcxq %r13,%r13 adcxq %r14,%r14 movq %r11,32(%rdi) movq %r12,40(%rdi) mulxq %rdx,%r8,%r9 movq 32(%rsi),%rdx adoxq %r8,%r13 adoxq %r9,%r14 adcxq %r15,%r15 adcxq %rcx,%rcx movq %r13,48(%rdi) movq %r14,56(%rdi) mulxq %rdx,%r8,%r9 movq 40(%rsi),%rdx adoxq %r8,%r15 adoxq %r9,%rcx adcxq %rbx,%rbx adcxq %rbp,%rbp movq %r15,64(%rdi) movq %rcx,72(%rdi) mulxq %rdx,%r8,%r9 adoxq %r8,%rbx adoxq %r9,%rbp movq %rbx,80(%rdi) movq %rbp,88(%rdi) #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .globl redcx_mont_384 .def redcx_mont_384; .scl 2; .type 32; .endef .p2align 5 redcx_mont_384: .byte 0xf3,0x0f,0x1e,0xfa movq %rdi,8(%rsp) movq %rsi,16(%rsp) movq %rsp,%r11 .LSEH_begin_redcx_mont_384: movq %rcx,%rdi movq %rdx,%rsi movq %r8,%rdx movq %r9,%rcx redc_mont_384$1: pushq %rbp pushq 
%rbx pushq %r12 pushq %r13 pushq %r14 pushq %r15 subq $8,%rsp .LSEH_body_redcx_mont_384: movq %rdx,%rbx #ifdef __SGX_LVI_HARDENING__ lfence #endif call __mulx_by_1_mont_384 call __redx_tail_mont_384 movq 8(%rsp),%r15 movq 16(%rsp),%r14 movq 24(%rsp),%r13 movq 32(%rsp),%r12 movq 40(%rsp),%rbx movq 48(%rsp),%rbp leaq 56(%rsp),%rsp .LSEH_epilogue_redcx_mont_384: mov 8(%rsp),%rdi mov 16(%rsp),%rsi #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .LSEH_end_redcx_mont_384: .globl fromx_mont_384 .def fromx_mont_384; .scl 2; .type 32; .endef .p2align 5 fromx_mont_384: .byte 0xf3,0x0f,0x1e,0xfa movq %rdi,8(%rsp) movq %rsi,16(%rsp) movq %rsp,%r11 .LSEH_begin_fromx_mont_384: movq %rcx,%rdi movq %rdx,%rsi movq %r8,%rdx movq %r9,%rcx from_mont_384$1: pushq %rbp pushq %rbx pushq %r12 pushq %r13 pushq %r14 pushq %r15 subq $8,%rsp .LSEH_body_fromx_mont_384: movq %rdx,%rbx #ifdef __SGX_LVI_HARDENING__ lfence #endif call __mulx_by_1_mont_384 movq %r14,%rax movq %r15,%rcx movq %r8,%rdx movq %r9,%rbp subq 0(%rbx),%r14 sbbq 8(%rbx),%r15 movq %r10,%r13 sbbq 16(%rbx),%r8 sbbq 24(%rbx),%r9 sbbq 32(%rbx),%r10 movq %r11,%rsi sbbq 40(%rbx),%r11 cmovcq %rax,%r14 cmovcq %rcx,%r15 cmovcq %rdx,%r8 movq %r14,0(%rdi) cmovcq %rbp,%r9 movq %r15,8(%rdi) cmovcq %r13,%r10 movq %r8,16(%rdi) cmovcq %rsi,%r11 movq %r9,24(%rdi) movq %r10,32(%rdi) movq %r11,40(%rdi) movq 8(%rsp),%r15 movq 16(%rsp),%r14 movq 24(%rsp),%r13 movq 32(%rsp),%r12 movq 40(%rsp),%rbx movq 48(%rsp),%rbp leaq 56(%rsp),%rsp .LSEH_epilogue_fromx_mont_384: mov 8(%rsp),%rdi mov 16(%rsp),%rsi #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .LSEH_end_fromx_mont_384: .def __mulx_by_1_mont_384; .scl 3; .type 32; .endef .p2align 5 __mulx_by_1_mont_384: .byte 0xf3,0x0f,0x1e,0xfa movq 0(%rsi),%r8 movq %rcx,%rdx movq 8(%rsi),%r9 movq 16(%rsi),%r10 movq 24(%rsi),%r11 movq 32(%rsi),%r12 movq 40(%rsi),%r13 imulq %r8,%rdx xorq %r14,%r14 mulxq 0(%rbx),%rax,%rbp adcxq %rax,%r8 adoxq %rbp,%r9 mulxq 8(%rbx),%rax,%rbp adcxq %rax,%r9 adoxq %rbp,%r10 mulxq 16(%rbx),%rax,%rbp adcxq %rax,%r10 adoxq %rbp,%r11 mulxq 24(%rbx),%rax,%rbp adcxq %rax,%r11 adoxq %rbp,%r12 mulxq 32(%rbx),%rax,%rbp adcxq %rax,%r12 adoxq %rbp,%r13 mulxq 40(%rbx),%rax,%rbp movq %rcx,%rdx adcxq %rax,%r13 adoxq %r14,%rbp adcxq %rbp,%r14 imulq %r9,%rdx xorq %r15,%r15 mulxq 0(%rbx),%rax,%rbp adcxq %rax,%r9 adoxq %rbp,%r10 mulxq 8(%rbx),%rax,%rbp adcxq %rax,%r10 adoxq %rbp,%r11 mulxq 16(%rbx),%rax,%rbp adcxq %rax,%r11 adoxq %rbp,%r12 mulxq 24(%rbx),%rax,%rbp adcxq %rax,%r12 adoxq %rbp,%r13 mulxq 32(%rbx),%rax,%rbp adcxq %rax,%r13 adoxq %rbp,%r14 mulxq 40(%rbx),%rax,%rbp movq %rcx,%rdx adcxq %rax,%r14 adoxq %r15,%rbp adcxq %rbp,%r15 imulq %r10,%rdx xorq %r8,%r8 mulxq 0(%rbx),%rax,%rbp adcxq %rax,%r10 adoxq %rbp,%r11 mulxq 8(%rbx),%rax,%rbp adcxq %rax,%r11 adoxq %rbp,%r12 mulxq 16(%rbx),%rax,%rbp adcxq %rax,%r12 adoxq %rbp,%r13 mulxq 24(%rbx),%rax,%rbp adcxq %rax,%r13 adoxq %rbp,%r14 mulxq 32(%rbx),%rax,%rbp adcxq %rax,%r14 adoxq %rbp,%r15 mulxq 40(%rbx),%rax,%rbp movq %rcx,%rdx adcxq %rax,%r15 adoxq %r8,%rbp adcxq %rbp,%r8 imulq %r11,%rdx xorq %r9,%r9 mulxq 0(%rbx),%rax,%rbp adcxq %rax,%r11 adoxq %rbp,%r12 mulxq 8(%rbx),%rax,%rbp adcxq %rax,%r12 adoxq %rbp,%r13 mulxq 16(%rbx),%rax,%rbp adcxq %rax,%r13 adoxq %rbp,%r14 mulxq 24(%rbx),%rax,%rbp adcxq %rax,%r14 adoxq %rbp,%r15 mulxq 32(%rbx),%rax,%rbp adcxq %rax,%r15 adoxq %rbp,%r8 mulxq 40(%rbx),%rax,%rbp movq %rcx,%rdx adcxq %rax,%r8 adoxq %r9,%rbp adcxq %rbp,%r9 imulq %r12,%rdx xorq 
%r10,%r10 mulxq 0(%rbx),%rax,%rbp adcxq %rax,%r12 adoxq %rbp,%r13 mulxq 8(%rbx),%rax,%rbp adcxq %rax,%r13 adoxq %rbp,%r14 mulxq 16(%rbx),%rax,%rbp adcxq %rax,%r14 adoxq %rbp,%r15 mulxq 24(%rbx),%rax,%rbp adcxq %rax,%r15 adoxq %rbp,%r8 mulxq 32(%rbx),%rax,%rbp adcxq %rax,%r8 adoxq %rbp,%r9 mulxq 40(%rbx),%rax,%rbp movq %rcx,%rdx adcxq %rax,%r9 adoxq %r10,%rbp adcxq %rbp,%r10 imulq %r13,%rdx xorq %r11,%r11 mulxq 0(%rbx),%rax,%rbp adcxq %rax,%r13 adoxq %rbp,%r14 mulxq 8(%rbx),%rax,%rbp adcxq %rax,%r14 adoxq %rbp,%r15 mulxq 16(%rbx),%rax,%rbp adcxq %rax,%r15 adoxq %rbp,%r8 mulxq 24(%rbx),%rax,%rbp adcxq %rax,%r8 adoxq %rbp,%r9 mulxq 32(%rbx),%rax,%rbp adcxq %rax,%r9 adoxq %rbp,%r10 mulxq 40(%rbx),%rax,%rbp movq %rcx,%rdx adcxq %rax,%r10 adoxq %r11,%rbp adcxq %rbp,%r11 #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .def __redx_tail_mont_384; .scl 3; .type 32; .endef .p2align 5 __redx_tail_mont_384: .byte 0xf3,0x0f,0x1e,0xfa addq 48(%rsi),%r14 movq %r14,%rax adcq 56(%rsi),%r15 adcq 64(%rsi),%r8 adcq 72(%rsi),%r9 movq %r15,%rcx adcq 80(%rsi),%r10 adcq 88(%rsi),%r11 sbbq %r12,%r12 movq %r8,%rdx movq %r9,%rbp subq 0(%rbx),%r14 sbbq 8(%rbx),%r15 movq %r10,%r13 sbbq 16(%rbx),%r8 sbbq 24(%rbx),%r9 sbbq 32(%rbx),%r10 movq %r11,%rsi sbbq 40(%rbx),%r11 sbbq $0,%r12 cmovcq %rax,%r14 cmovcq %rcx,%r15 cmovcq %rdx,%r8 movq %r14,0(%rdi) cmovcq %rbp,%r9 movq %r15,8(%rdi) cmovcq %r13,%r10 movq %r8,16(%rdi) cmovcq %rsi,%r11 movq %r9,24(%rdi) movq %r10,32(%rdi) movq %r11,40(%rdi) #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .globl sgn0x_pty_mont_384 .def sgn0x_pty_mont_384; .scl 2; .type 32; .endef .p2align 5 sgn0x_pty_mont_384: .byte 0xf3,0x0f,0x1e,0xfa movq %rdi,8(%rsp) movq %rsi,16(%rsp) movq %rsp,%r11 .LSEH_begin_sgn0x_pty_mont_384: movq %rcx,%rdi movq %rdx,%rsi movq %r8,%rdx sgn0_pty_mont_384$1: pushq %rbp pushq %rbx pushq %r12 pushq %r13 pushq %r14 pushq %r15 subq $8,%rsp .LSEH_body_sgn0x_pty_mont_384: movq %rsi,%rbx leaq 0(%rdi),%rsi movq %rdx,%rcx #ifdef __SGX_LVI_HARDENING__ lfence #endif call __mulx_by_1_mont_384 xorq %rax,%rax movq %r14,%r13 addq %r14,%r14 adcq %r15,%r15 adcq %r8,%r8 adcq %r9,%r9 adcq %r10,%r10 adcq %r11,%r11 adcq $0,%rax subq 0(%rbx),%r14 sbbq 8(%rbx),%r15 sbbq 16(%rbx),%r8 sbbq 24(%rbx),%r9 sbbq 32(%rbx),%r10 sbbq 40(%rbx),%r11 sbbq $0,%rax notq %rax andq $1,%r13 andq $2,%rax orq %r13,%rax movq 8(%rsp),%r15 movq 16(%rsp),%r14 movq 24(%rsp),%r13 movq 32(%rsp),%r12 movq 40(%rsp),%rbx movq 48(%rsp),%rbp leaq 56(%rsp),%rsp .LSEH_epilogue_sgn0x_pty_mont_384: mov 8(%rsp),%rdi mov 16(%rsp),%rsi #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .LSEH_end_sgn0x_pty_mont_384: .globl sgn0x_pty_mont_384x .def sgn0x_pty_mont_384x; .scl 2; .type 32; .endef .p2align 5 sgn0x_pty_mont_384x: .byte 0xf3,0x0f,0x1e,0xfa movq %rdi,8(%rsp) movq %rsi,16(%rsp) movq %rsp,%r11 .LSEH_begin_sgn0x_pty_mont_384x: movq %rcx,%rdi movq %rdx,%rsi movq %r8,%rdx sgn0_pty_mont_384x$1: pushq %rbp pushq %rbx pushq %r12 pushq %r13 pushq %r14 pushq %r15 subq $8,%rsp .LSEH_body_sgn0x_pty_mont_384x: movq %rsi,%rbx leaq 48(%rdi),%rsi movq %rdx,%rcx #ifdef __SGX_LVI_HARDENING__ lfence #endif call __mulx_by_1_mont_384 movq %r14,%r12 orq %r15,%r14 orq %r8,%r14 orq %r9,%r14 orq %r10,%r14 orq %r11,%r14 leaq 0(%rdi),%rsi xorq %rdi,%rdi movq %r12,%r13 addq %r12,%r12 adcq %r15,%r15 adcq %r8,%r8 adcq %r9,%r9 adcq %r10,%r10 adcq %r11,%r11 adcq $0,%rdi subq 0(%rbx),%r12 sbbq 8(%rbx),%r15 sbbq 16(%rbx),%r8 sbbq 
24(%rbx),%r9 sbbq 32(%rbx),%r10 sbbq 40(%rbx),%r11 sbbq $0,%rdi movq %r14,0(%rsp) notq %rdi andq $1,%r13 andq $2,%rdi orq %r13,%rdi call __mulx_by_1_mont_384 movq %r14,%r12 orq %r15,%r14 orq %r8,%r14 orq %r9,%r14 orq %r10,%r14 orq %r11,%r14 xorq %rax,%rax movq %r12,%r13 addq %r12,%r12 adcq %r15,%r15 adcq %r8,%r8 adcq %r9,%r9 adcq %r10,%r10 adcq %r11,%r11 adcq $0,%rax subq 0(%rbx),%r12 sbbq 8(%rbx),%r15 sbbq 16(%rbx),%r8 sbbq 24(%rbx),%r9 sbbq 32(%rbx),%r10 sbbq 40(%rbx),%r11 sbbq $0,%rax movq 0(%rsp),%r12 notq %rax testq %r14,%r14 cmovzq %rdi,%r13 testq %r12,%r12 cmovnzq %rdi,%rax andq $1,%r13 andq $2,%rax orq %r13,%rax movq 8(%rsp),%r15 movq 16(%rsp),%r14 movq 24(%rsp),%r13 movq 32(%rsp),%r12 movq 40(%rsp),%rbx movq 48(%rsp),%rbp leaq 56(%rsp),%rsp .LSEH_epilogue_sgn0x_pty_mont_384x: mov 8(%rsp),%rdi mov 16(%rsp),%rsi #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .LSEH_end_sgn0x_pty_mont_384x: .globl mulx_mont_384 .def mulx_mont_384; .scl 2; .type 32; .endef .p2align 5 mulx_mont_384: .byte 0xf3,0x0f,0x1e,0xfa movq %rdi,8(%rsp) movq %rsi,16(%rsp) movq %rsp,%r11 .LSEH_begin_mulx_mont_384: movq %rcx,%rdi movq %rdx,%rsi movq %r8,%rdx movq %r9,%rcx movq 40(%rsp),%r8 mul_mont_384$1: pushq %rbp pushq %rbx pushq %r12 pushq %r13 pushq %r14 pushq %r15 leaq -24(%rsp),%rsp .LSEH_body_mulx_mont_384: movq %rdx,%rbx #ifdef __SGX_LVI_HARDENING__ lfence #endif movq 0(%rdx),%rdx movq 0(%rsi),%r14 movq 8(%rsi),%r15 movq 16(%rsi),%rax movq 24(%rsi),%r12 movq %rdi,16(%rsp) movq 32(%rsi),%rdi movq 40(%rsi),%rbp leaq -128(%rsi),%rsi leaq -128(%rcx),%rcx movq %r8,(%rsp) mulxq %r14,%r8,%r9 call __mulx_mont_384 movq 24(%rsp),%r15 movq 32(%rsp),%r14 movq 40(%rsp),%r13 movq 48(%rsp),%r12 movq 56(%rsp),%rbx movq 64(%rsp),%rbp leaq 72(%rsp),%rsp .LSEH_epilogue_mulx_mont_384: mov 8(%rsp),%rdi mov 16(%rsp),%rsi #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .LSEH_end_mulx_mont_384: .def __mulx_mont_384; .scl 3; .type 32; .endef .p2align 5 __mulx_mont_384: .byte 0xf3,0x0f,0x1e,0xfa mulxq %r15,%r14,%r10 mulxq %rax,%r15,%r11 addq %r14,%r9 mulxq %r12,%rax,%r12 adcq %r15,%r10 mulxq %rdi,%rdi,%r13 adcq %rax,%r11 mulxq %rbp,%rbp,%r14 movq 8(%rbx),%rdx adcq %rdi,%r12 adcq %rbp,%r13 adcq $0,%r14 xorq %r15,%r15 movq %r8,16(%rsp) imulq 8(%rsp),%r8 xorq %rax,%rax mulxq 0+128(%rsi),%rdi,%rbp adoxq %rdi,%r9 adcxq %rbp,%r10 mulxq 8+128(%rsi),%rdi,%rbp adoxq %rdi,%r10 adcxq %rbp,%r11 mulxq 16+128(%rsi),%rdi,%rbp adoxq %rdi,%r11 adcxq %rbp,%r12 mulxq 24+128(%rsi),%rdi,%rbp adoxq %rdi,%r12 adcxq %rbp,%r13 mulxq 32+128(%rsi),%rdi,%rbp adoxq %rdi,%r13 adcxq %rbp,%r14 mulxq 40+128(%rsi),%rdi,%rbp movq %r8,%rdx adoxq %rdi,%r14 adcxq %rbp,%r15 adoxq %rax,%r15 adoxq %rax,%rax xorq %r8,%r8 mulxq 0+128(%rcx),%rdi,%rbp adcxq 16(%rsp),%rdi adoxq %rbp,%r9 mulxq 8+128(%rcx),%rdi,%rbp adcxq %rdi,%r9 adoxq %rbp,%r10 mulxq 16+128(%rcx),%rdi,%rbp adcxq %rdi,%r10 adoxq %rbp,%r11 mulxq 24+128(%rcx),%rdi,%rbp adcxq %rdi,%r11 adoxq %rbp,%r12 mulxq 32+128(%rcx),%rdi,%rbp adcxq %rdi,%r12 adoxq %rbp,%r13 mulxq 40+128(%rcx),%rdi,%rbp movq 16(%rbx),%rdx adcxq %rdi,%r13 adoxq %rbp,%r14 adcxq %r8,%r14 adoxq %r8,%r15 adcxq %r8,%r15 adoxq %r8,%rax adcxq %r8,%rax movq %r9,16(%rsp) imulq 8(%rsp),%r9 xorq %r8,%r8 mulxq 0+128(%rsi),%rdi,%rbp adoxq %rdi,%r10 adcxq %rbp,%r11 mulxq 8+128(%rsi),%rdi,%rbp adoxq %rdi,%r11 adcxq %rbp,%r12 mulxq 16+128(%rsi),%rdi,%rbp adoxq %rdi,%r12 adcxq %rbp,%r13 mulxq 24+128(%rsi),%rdi,%rbp adoxq %rdi,%r13 adcxq %rbp,%r14 mulxq 32+128(%rsi),%rdi,%rbp 
adoxq %rdi,%r14 adcxq %rbp,%r15 mulxq 40+128(%rsi),%rdi,%rbp movq %r9,%rdx adoxq %rdi,%r15 adcxq %rbp,%rax adoxq %r8,%rax adoxq %r8,%r8 xorq %r9,%r9 mulxq 0+128(%rcx),%rdi,%rbp adcxq 16(%rsp),%rdi adoxq %rbp,%r10 mulxq 8+128(%rcx),%rdi,%rbp adcxq %rdi,%r10 adoxq %rbp,%r11 mulxq 16+128(%rcx),%rdi,%rbp adcxq %rdi,%r11 adoxq %rbp,%r12 mulxq 24+128(%rcx),%rdi,%rbp adcxq %rdi,%r12 adoxq %rbp,%r13 mulxq 32+128(%rcx),%rdi,%rbp adcxq %rdi,%r13 adoxq %rbp,%r14 mulxq 40+128(%rcx),%rdi,%rbp movq 24(%rbx),%rdx adcxq %rdi,%r14 adoxq %rbp,%r15 adcxq %r9,%r15 adoxq %r9,%rax adcxq %r9,%rax adoxq %r9,%r8 adcxq %r9,%r8 movq %r10,16(%rsp) imulq 8(%rsp),%r10 xorq %r9,%r9 mulxq 0+128(%rsi),%rdi,%rbp adoxq %rdi,%r11 adcxq %rbp,%r12 mulxq 8+128(%rsi),%rdi,%rbp adoxq %rdi,%r12 adcxq %rbp,%r13 mulxq 16+128(%rsi),%rdi,%rbp adoxq %rdi,%r13 adcxq %rbp,%r14 mulxq 24+128(%rsi),%rdi,%rbp adoxq %rdi,%r14 adcxq %rbp,%r15 mulxq 32+128(%rsi),%rdi,%rbp adoxq %rdi,%r15 adcxq %rbp,%rax mulxq 40+128(%rsi),%rdi,%rbp movq %r10,%rdx adoxq %rdi,%rax adcxq %rbp,%r8 adoxq %r9,%r8 adoxq %r9,%r9 xorq %r10,%r10 mulxq 0+128(%rcx),%rdi,%rbp adcxq 16(%rsp),%rdi adoxq %rbp,%r11 mulxq 8+128(%rcx),%rdi,%rbp adcxq %rdi,%r11 adoxq %rbp,%r12 mulxq 16+128(%rcx),%rdi,%rbp adcxq %rdi,%r12 adoxq %rbp,%r13 mulxq 24+128(%rcx),%rdi,%rbp adcxq %rdi,%r13 adoxq %rbp,%r14 mulxq 32+128(%rcx),%rdi,%rbp adcxq %rdi,%r14 adoxq %rbp,%r15 mulxq 40+128(%rcx),%rdi,%rbp movq 32(%rbx),%rdx adcxq %rdi,%r15 adoxq %rbp,%rax adcxq %r10,%rax adoxq %r10,%r8 adcxq %r10,%r8 adoxq %r10,%r9 adcxq %r10,%r9 movq %r11,16(%rsp) imulq 8(%rsp),%r11 xorq %r10,%r10 mulxq 0+128(%rsi),%rdi,%rbp adoxq %rdi,%r12 adcxq %rbp,%r13 mulxq 8+128(%rsi),%rdi,%rbp adoxq %rdi,%r13 adcxq %rbp,%r14 mulxq 16+128(%rsi),%rdi,%rbp adoxq %rdi,%r14 adcxq %rbp,%r15 mulxq 24+128(%rsi),%rdi,%rbp adoxq %rdi,%r15 adcxq %rbp,%rax mulxq 32+128(%rsi),%rdi,%rbp adoxq %rdi,%rax adcxq %rbp,%r8 mulxq 40+128(%rsi),%rdi,%rbp movq %r11,%rdx adoxq %rdi,%r8 adcxq %rbp,%r9 adoxq %r10,%r9 adoxq %r10,%r10 xorq %r11,%r11 mulxq 0+128(%rcx),%rdi,%rbp adcxq 16(%rsp),%rdi adoxq %rbp,%r12 mulxq 8+128(%rcx),%rdi,%rbp adcxq %rdi,%r12 adoxq %rbp,%r13 mulxq 16+128(%rcx),%rdi,%rbp adcxq %rdi,%r13 adoxq %rbp,%r14 mulxq 24+128(%rcx),%rdi,%rbp adcxq %rdi,%r14 adoxq %rbp,%r15 mulxq 32+128(%rcx),%rdi,%rbp adcxq %rdi,%r15 adoxq %rbp,%rax mulxq 40+128(%rcx),%rdi,%rbp movq 40(%rbx),%rdx adcxq %rdi,%rax adoxq %rbp,%r8 adcxq %r11,%r8 adoxq %r11,%r9 adcxq %r11,%r9 adoxq %r11,%r10 adcxq %r11,%r10 movq %r12,16(%rsp) imulq 8(%rsp),%r12 xorq %r11,%r11 mulxq 0+128(%rsi),%rdi,%rbp adoxq %rdi,%r13 adcxq %rbp,%r14 mulxq 8+128(%rsi),%rdi,%rbp adoxq %rdi,%r14 adcxq %rbp,%r15 mulxq 16+128(%rsi),%rdi,%rbp adoxq %rdi,%r15 adcxq %rbp,%rax mulxq 24+128(%rsi),%rdi,%rbp adoxq %rdi,%rax adcxq %rbp,%r8 mulxq 32+128(%rsi),%rdi,%rbp adoxq %rdi,%r8 adcxq %rbp,%r9 mulxq 40+128(%rsi),%rdi,%rbp movq %r12,%rdx adoxq %rdi,%r9 adcxq %rbp,%r10 adoxq %r11,%r10 adoxq %r11,%r11 xorq %r12,%r12 mulxq 0+128(%rcx),%rdi,%rbp adcxq 16(%rsp),%rdi adoxq %rbp,%r13 mulxq 8+128(%rcx),%rdi,%rbp adcxq %rdi,%r13 adoxq %rbp,%r14 mulxq 16+128(%rcx),%rdi,%rbp adcxq %rdi,%r14 adoxq %rbp,%r15 mulxq 24+128(%rcx),%rdi,%rbp adcxq %rdi,%r15 adoxq %rbp,%rax mulxq 32+128(%rcx),%rdi,%rbp adcxq %rdi,%rax adoxq %rbp,%r8 mulxq 40+128(%rcx),%rdi,%rbp movq %r13,%rdx adcxq %rdi,%r8 adoxq %rbp,%r9 adcxq %r12,%r9 adoxq %r12,%r10 adcxq %r12,%r10 adoxq %r12,%r11 adcxq %r12,%r11 imulq 8(%rsp),%rdx movq 24(%rsp),%rbx xorq %r12,%r12 mulxq 0+128(%rcx),%rdi,%rbp adcxq %rdi,%r13 adoxq %rbp,%r14 mulxq 
8+128(%rcx),%rdi,%rbp adcxq %rdi,%r14 adoxq %rbp,%r15 mulxq 16+128(%rcx),%rdi,%rbp adcxq %rdi,%r15 adoxq %rbp,%rax mulxq 24+128(%rcx),%rdi,%rbp adcxq %rdi,%rax adoxq %rbp,%r8 movq %r15,%r13 mulxq 32+128(%rcx),%rdi,%rbp adcxq %rdi,%r8 adoxq %rbp,%r9 movq %rax,%rsi mulxq 40+128(%rcx),%rdi,%rbp adcxq %rdi,%r9 adoxq %rbp,%r10 movq %r14,%rdx adcxq %r12,%r10 adoxq %r12,%r11 leaq 128(%rcx),%rcx movq %r8,%r12 adcq $0,%r11 subq 0(%rcx),%r14 sbbq 8(%rcx),%r15 movq %r9,%rdi sbbq 16(%rcx),%rax sbbq 24(%rcx),%r8 sbbq 32(%rcx),%r9 movq %r10,%rbp sbbq 40(%rcx),%r10 sbbq $0,%r11 cmovncq %r14,%rdx cmovcq %r13,%r15 cmovcq %rsi,%rax cmovncq %r8,%r12 movq %rdx,0(%rbx) cmovncq %r9,%rdi movq %r15,8(%rbx) cmovncq %r10,%rbp movq %rax,16(%rbx) movq %r12,24(%rbx) movq %rdi,32(%rbx) movq %rbp,40(%rbx) #ifdef __SGX_LVI_HARDENING__ popq %rsi lfence jmpq *%rsi ud2 #else .byte 0xf3,0xc3 #endif .globl sqrx_mont_384 .def sqrx_mont_384; .scl 2; .type 32; .endef .p2align 5 sqrx_mont_384: .byte 0xf3,0x0f,0x1e,0xfa movq %rdi,8(%rsp) movq %rsi,16(%rsp) movq %rsp,%r11 .LSEH_begin_sqrx_mont_384: movq %rcx,%rdi movq %rdx,%rsi movq %r8,%rdx movq %r9,%rcx sqr_mont_384$1: pushq %rbp pushq %rbx pushq %r12 pushq %r13 pushq %r14 pushq %r15 leaq -24(%rsp),%rsp .LSEH_body_sqrx_mont_384: movq %rcx,%r8 leaq -128(%rdx),%rcx #ifdef __SGX_LVI_HARDENING__ lfence #endif movq 0(%rsi),%rdx movq 8(%rsi),%r15 movq 16(%rsi),%rax movq 24(%rsi),%r12 movq %rdi,16(%rsp) movq 32(%rsi),%rdi movq 40(%rsi),%rbp leaq (%rsi),%rbx movq %r8,(%rsp) leaq -128(%rsi),%rsi mulxq %rdx,%r8,%r9 call __mulx_mont_384 movq 24(%rsp),%r15 movq 32(%rsp),%r14 movq 40(%rsp),%r13 movq 48(%rsp),%r12 movq 56(%rsp),%rbx movq 64(%rsp),%rbp leaq 72(%rsp),%rsp .LSEH_epilogue_sqrx_mont_384: mov 8(%rsp),%rdi mov 16(%rsp),%rsi #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .LSEH_end_sqrx_mont_384: .globl sqrx_n_mul_mont_384 .def sqrx_n_mul_mont_384; .scl 2; .type 32; .endef .p2align 5 sqrx_n_mul_mont_384: .byte 0xf3,0x0f,0x1e,0xfa movq %rdi,8(%rsp) movq %rsi,16(%rsp) movq %rsp,%r11 .LSEH_begin_sqrx_n_mul_mont_384: movq %rcx,%rdi movq %rdx,%rsi movq %r8,%rdx movq %r9,%rcx movq 40(%rsp),%r8 movq 48(%rsp),%r9 sqr_n_mul_mont_384$1: pushq %rbp pushq %rbx pushq %r12 pushq %r13 pushq %r14 pushq %r15 leaq -40(%rsp),%rsp .LSEH_body_sqrx_n_mul_mont_384: movq %rdx,%r10 #ifdef __SGX_LVI_HARDENING__ lfence #endif movq 0(%rsi),%rdx movq 8(%rsi),%r15 movq 16(%rsi),%rax movq %rsi,%rbx movq 24(%rsi),%r12 movq %rdi,16(%rsp) movq 32(%rsi),%rdi movq 40(%rsi),%rbp movq %r8,(%rsp) movq %r9,24(%rsp) movq 0(%r9),%xmm2 .Loop_sqrx_384: movd %r10d,%xmm1 leaq -128(%rbx),%rsi leaq -128(%rcx),%rcx mulxq %rdx,%r8,%r9 call __mulx_mont_384 movd %xmm1,%r10d decl %r10d jnz .Loop_sqrx_384 movq %rdx,%r14 .byte 102,72,15,126,210 leaq -128(%rbx),%rsi movq 24(%rsp),%rbx leaq -128(%rcx),%rcx mulxq %r14,%r8,%r9 call __mulx_mont_384 movq 40(%rsp),%r15 movq 48(%rsp),%r14 movq 56(%rsp),%r13 movq 64(%rsp),%r12 movq 72(%rsp),%rbx movq 80(%rsp),%rbp leaq 88(%rsp),%rsp .LSEH_epilogue_sqrx_n_mul_mont_384: mov 8(%rsp),%rdi mov 16(%rsp),%rsi #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .LSEH_end_sqrx_n_mul_mont_384: .globl sqrx_n_mul_mont_383 .def sqrx_n_mul_mont_383; .scl 2; .type 32; .endef .p2align 5 sqrx_n_mul_mont_383: .byte 0xf3,0x0f,0x1e,0xfa movq %rdi,8(%rsp) movq %rsi,16(%rsp) movq %rsp,%r11 .LSEH_begin_sqrx_n_mul_mont_383: movq %rcx,%rdi movq %rdx,%rsi movq %r8,%rdx movq %r9,%rcx movq 40(%rsp),%r8 movq 48(%rsp),%r9 sqr_n_mul_mont_383$1: 
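/* sqrx_n_mul_mont_383 performs a run of Montgomery squarings followed by
   one Montgomery multiplication. The squaring loop below calls
   __mulx_mont_383_nonred, which skips the final conditional subtraction;
   as the "383" naming suggests, this is safe when the modulus leaves
   headroom below 2^384 (e.g. the 381-bit BLS12-381 base field). The
   closing multiplication goes through __mulx_mont_384, which returns a
   fully reduced result. */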
pushq %rbp pushq %rbx pushq %r12 pushq %r13 pushq %r14 pushq %r15 leaq -40(%rsp),%rsp .LSEH_body_sqrx_n_mul_mont_383: movq %rdx,%r10 #ifdef __SGX_LVI_HARDENING__ lfence #endif movq 0(%rsi),%rdx movq 8(%rsi),%r15 movq 16(%rsi),%rax movq %rsi,%rbx movq 24(%rsi),%r12 movq %rdi,16(%rsp) movq 32(%rsi),%rdi movq 40(%rsi),%rbp movq %r8,(%rsp) movq %r9,24(%rsp) movq 0(%r9),%xmm2 leaq -128(%rcx),%rcx .Loop_sqrx_383: movd %r10d,%xmm1 leaq -128(%rbx),%rsi mulxq %rdx,%r8,%r9 call __mulx_mont_383_nonred movd %xmm1,%r10d decl %r10d jnz .Loop_sqrx_383 movq %rdx,%r14 .byte 102,72,15,126,210 leaq -128(%rbx),%rsi movq 24(%rsp),%rbx mulxq %r14,%r8,%r9 call __mulx_mont_384 movq 40(%rsp),%r15 movq 48(%rsp),%r14 movq 56(%rsp),%r13 movq 64(%rsp),%r12 movq 72(%rsp),%rbx movq 80(%rsp),%rbp leaq 88(%rsp),%rsp .LSEH_epilogue_sqrx_n_mul_mont_383: mov 8(%rsp),%rdi mov 16(%rsp),%rsi #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .LSEH_end_sqrx_n_mul_mont_383: .def __mulx_mont_383_nonred; .scl 3; .type 32; .endef .p2align 5 __mulx_mont_383_nonred: .byte 0xf3,0x0f,0x1e,0xfa mulxq %r15,%r14,%r10 mulxq %rax,%r15,%r11 addq %r14,%r9 mulxq %r12,%rax,%r12 adcq %r15,%r10 mulxq %rdi,%rdi,%r13 adcq %rax,%r11 mulxq %rbp,%rbp,%r14 movq 8(%rbx),%rdx adcq %rdi,%r12 adcq %rbp,%r13 adcq $0,%r14 movq %r8,%rax imulq 8(%rsp),%r8 xorq %r15,%r15 mulxq 0+128(%rsi),%rdi,%rbp adoxq %rdi,%r9 adcxq %rbp,%r10 mulxq 8+128(%rsi),%rdi,%rbp adoxq %rdi,%r10 adcxq %rbp,%r11 mulxq 16+128(%rsi),%rdi,%rbp adoxq %rdi,%r11 adcxq %rbp,%r12 mulxq 24+128(%rsi),%rdi,%rbp adoxq %rdi,%r12 adcxq %rbp,%r13 mulxq 32+128(%rsi),%rdi,%rbp adoxq %rdi,%r13 adcxq %rbp,%r14 mulxq 40+128(%rsi),%rdi,%rbp movq %r8,%rdx adoxq %rdi,%r14 adcxq %r15,%rbp adoxq %rbp,%r15 xorq %r8,%r8 mulxq 0+128(%rcx),%rdi,%rbp adcxq %rdi,%rax adoxq %rbp,%r9 mulxq 8+128(%rcx),%rdi,%rbp adcxq %rdi,%r9 adoxq %rbp,%r10 mulxq 16+128(%rcx),%rdi,%rbp adcxq %rdi,%r10 adoxq %rbp,%r11 mulxq 24+128(%rcx),%rdi,%rbp adcxq %rdi,%r11 adoxq %rbp,%r12 mulxq 32+128(%rcx),%rdi,%rbp adcxq %rdi,%r12 adoxq %rbp,%r13 mulxq 40+128(%rcx),%rdi,%rbp movq 16(%rbx),%rdx adcxq %rdi,%r13 adoxq %rbp,%r14 adcxq %rax,%r14 adoxq %rax,%r15 adcxq %rax,%r15 movq %r9,%r8 imulq 8(%rsp),%r9 xorq %rax,%rax mulxq 0+128(%rsi),%rdi,%rbp adoxq %rdi,%r10 adcxq %rbp,%r11 mulxq 8+128(%rsi),%rdi,%rbp adoxq %rdi,%r11 adcxq %rbp,%r12 mulxq 16+128(%rsi),%rdi,%rbp adoxq %rdi,%r12 adcxq %rbp,%r13 mulxq 24+128(%rsi),%rdi,%rbp adoxq %rdi,%r13 adcxq %rbp,%r14 mulxq 32+128(%rsi),%rdi,%rbp adoxq %rdi,%r14 adcxq %rbp,%r15 mulxq 40+128(%rsi),%rdi,%rbp movq %r9,%rdx adoxq %rdi,%r15 adcxq %rax,%rbp adoxq %rbp,%rax xorq %r9,%r9 mulxq 0+128(%rcx),%rdi,%rbp adcxq %rdi,%r8 adoxq %rbp,%r10 mulxq 8+128(%rcx),%rdi,%rbp adcxq %rdi,%r10 adoxq %rbp,%r11 mulxq 16+128(%rcx),%rdi,%rbp adcxq %rdi,%r11 adoxq %rbp,%r12 mulxq 24+128(%rcx),%rdi,%rbp adcxq %rdi,%r12 adoxq %rbp,%r13 mulxq 32+128(%rcx),%rdi,%rbp adcxq %rdi,%r13 adoxq %rbp,%r14 mulxq 40+128(%rcx),%rdi,%rbp movq 24(%rbx),%rdx adcxq %rdi,%r14 adoxq %rbp,%r15 adcxq %r8,%r15 adoxq %r8,%rax adcxq %r8,%rax movq %r10,%r9 imulq 8(%rsp),%r10 xorq %r8,%r8 mulxq 0+128(%rsi),%rdi,%rbp adoxq %rdi,%r11 adcxq %rbp,%r12 mulxq 8+128(%rsi),%rdi,%rbp adoxq %rdi,%r12 adcxq %rbp,%r13 mulxq 16+128(%rsi),%rdi,%rbp adoxq %rdi,%r13 adcxq %rbp,%r14 mulxq 24+128(%rsi),%rdi,%rbp adoxq %rdi,%r14 adcxq %rbp,%r15 mulxq 32+128(%rsi),%rdi,%rbp adoxq %rdi,%r15 adcxq %rbp,%rax mulxq 40+128(%rsi),%rdi,%rbp movq %r10,%rdx adoxq %rdi,%rax adcxq %r8,%rbp adoxq %rbp,%r8 xorq %r10,%r10 mulxq 
0+128(%rcx),%rdi,%rbp adcxq %rdi,%r9 adoxq %rbp,%r11 mulxq 8+128(%rcx),%rdi,%rbp adcxq %rdi,%r11 adoxq %rbp,%r12 mulxq 16+128(%rcx),%rdi,%rbp adcxq %rdi,%r12 adoxq %rbp,%r13 mulxq 24+128(%rcx),%rdi,%rbp adcxq %rdi,%r13 adoxq %rbp,%r14 mulxq 32+128(%rcx),%rdi,%rbp adcxq %rdi,%r14 adoxq %rbp,%r15 mulxq 40+128(%rcx),%rdi,%rbp movq 32(%rbx),%rdx adcxq %rdi,%r15 adoxq %rbp,%rax adcxq %r9,%rax adoxq %r9,%r8 adcxq %r9,%r8 movq %r11,%r10 imulq 8(%rsp),%r11 xorq %r9,%r9 mulxq 0+128(%rsi),%rdi,%rbp adoxq %rdi,%r12 adcxq %rbp,%r13 mulxq 8+128(%rsi),%rdi,%rbp adoxq %rdi,%r13 adcxq %rbp,%r14 mulxq 16+128(%rsi),%rdi,%rbp adoxq %rdi,%r14 adcxq %rbp,%r15 mulxq 24+128(%rsi),%rdi,%rbp adoxq %rdi,%r15 adcxq %rbp,%rax mulxq 32+128(%rsi),%rdi,%rbp adoxq %rdi,%rax adcxq %rbp,%r8 mulxq 40+128(%rsi),%rdi,%rbp movq %r11,%rdx adoxq %rdi,%r8 adcxq %r9,%rbp adoxq %rbp,%r9 xorq %r11,%r11 mulxq 0+128(%rcx),%rdi,%rbp adcxq %rdi,%r10 adoxq %rbp,%r12 mulxq 8+128(%rcx),%rdi,%rbp adcxq %rdi,%r12 adoxq %rbp,%r13 mulxq 16+128(%rcx),%rdi,%rbp adcxq %rdi,%r13 adoxq %rbp,%r14 mulxq 24+128(%rcx),%rdi,%rbp adcxq %rdi,%r14 adoxq %rbp,%r15 mulxq 32+128(%rcx),%rdi,%rbp adcxq %rdi,%r15 adoxq %rbp,%rax mulxq 40+128(%rcx),%rdi,%rbp movq 40(%rbx),%rdx adcxq %rdi,%rax adoxq %rbp,%r8 adcxq %r10,%r8 adoxq %r10,%r9 adcxq %r10,%r9 movq %r12,%r11 imulq 8(%rsp),%r12 xorq %r10,%r10 mulxq 0+128(%rsi),%rdi,%rbp adoxq %rdi,%r13 adcxq %rbp,%r14 mulxq 8+128(%rsi),%rdi,%rbp adoxq %rdi,%r14 adcxq %rbp,%r15 mulxq 16+128(%rsi),%rdi,%rbp adoxq %rdi,%r15 adcxq %rbp,%rax mulxq 24+128(%rsi),%rdi,%rbp adoxq %rdi,%rax adcxq %rbp,%r8 mulxq 32+128(%rsi),%rdi,%rbp adoxq %rdi,%r8 adcxq %rbp,%r9 mulxq 40+128(%rsi),%rdi,%rbp movq %r12,%rdx adoxq %rdi,%r9 adcxq %r10,%rbp adoxq %rbp,%r10 xorq %r12,%r12 mulxq 0+128(%rcx),%rdi,%rbp adcxq %rdi,%r11 adoxq %rbp,%r13 mulxq 8+128(%rcx),%rdi,%rbp adcxq %rdi,%r13 adoxq %rbp,%r14 mulxq 16+128(%rcx),%rdi,%rbp adcxq %rdi,%r14 adoxq %rbp,%r15 mulxq 24+128(%rcx),%rdi,%rbp adcxq %rdi,%r15 adoxq %rbp,%rax mulxq 32+128(%rcx),%rdi,%rbp adcxq %rdi,%rax adoxq %rbp,%r8 mulxq 40+128(%rcx),%rdi,%rbp movq %r13,%rdx adcxq %rdi,%r8 adoxq %rbp,%r9 adcxq %r11,%r9 adoxq %r11,%r10 adcxq %r11,%r10 imulq 8(%rsp),%rdx movq 24(%rsp),%rbx xorq %r12,%r12 mulxq 0+128(%rcx),%rdi,%rbp adcxq %rdi,%r13 adoxq %rbp,%r14 mulxq 8+128(%rcx),%rdi,%rbp adcxq %rdi,%r14 adoxq %rbp,%r15 mulxq 16+128(%rcx),%rdi,%rbp adcxq %rdi,%r15 adoxq %rbp,%rax mulxq 24+128(%rcx),%rdi,%rbp adcxq %rdi,%rax adoxq %rbp,%r8 mulxq 32+128(%rcx),%rdi,%rbp adcxq %rdi,%r8 adoxq %rbp,%r9 mulxq 40+128(%rcx),%rdi,%rbp movq %r14,%rdx adcxq %rdi,%r9 adoxq %rbp,%r10 adcq $0,%r10 movq %r8,%r12 movq %r14,0(%rbx) movq %r15,8(%rbx) movq %rax,16(%rbx) movq %r9,%rdi movq %r8,24(%rbx) movq %r9,32(%rbx) movq %r10,40(%rbx) movq %r10,%rbp #ifdef __SGX_LVI_HARDENING__ popq %rsi lfence jmpq *%rsi ud2 #else .byte 0xf3,0xc3 #endif .globl sqrx_mont_382x .def sqrx_mont_382x; .scl 2; .type 32; .endef .p2align 5 sqrx_mont_382x: .byte 0xf3,0x0f,0x1e,0xfa movq %rdi,8(%rsp) movq %rsi,16(%rsp) movq %rsp,%r11 .LSEH_begin_sqrx_mont_382x: movq %rcx,%rdi movq %rdx,%rsi movq %r8,%rdx movq %r9,%rcx sqr_mont_382x$1: pushq %rbp pushq %rbx pushq %r12 pushq %r13 pushq %r14 pushq %r15 subq $136,%rsp .LSEH_body_sqrx_mont_382x: movq %rcx,0(%rsp) movq %rdx,%rcx movq %rdi,16(%rsp) movq %rsi,24(%rsp) #ifdef __SGX_LVI_HARDENING__ lfence #endif movq 0(%rsi),%r8 movq 8(%rsi),%r9 movq 16(%rsi),%r10 movq 24(%rsi),%r11 movq 32(%rsi),%r12 movq 40(%rsi),%r13 movq %r8,%r14 addq 48(%rsi),%r8 movq %r9,%r15 adcq 56(%rsi),%r9 movq %r10,%rax adcq 
64(%rsi),%r10 movq %r11,%rdx adcq 72(%rsi),%r11 movq %r12,%rbx adcq 80(%rsi),%r12 movq %r13,%rbp adcq 88(%rsi),%r13 subq 48(%rsi),%r14 sbbq 56(%rsi),%r15 sbbq 64(%rsi),%rax sbbq 72(%rsi),%rdx sbbq 80(%rsi),%rbx sbbq 88(%rsi),%rbp sbbq %rdi,%rdi movq %r8,32+0(%rsp) movq %r9,32+8(%rsp) movq %r10,32+16(%rsp) movq %r11,32+24(%rsp) movq %r12,32+32(%rsp) movq %r13,32+40(%rsp) movq %r14,32+48(%rsp) movq %r15,32+56(%rsp) movq %rax,32+64(%rsp) movq %rdx,32+72(%rsp) movq %rbx,32+80(%rsp) movq %rbp,32+88(%rsp) movq %rdi,32+96(%rsp) leaq 48(%rsi),%rbx movq 48(%rsi),%rdx movq 0(%rsi),%r14 movq 8(%rsi),%r15 movq 16(%rsi),%rax movq 24(%rsi),%r12 movq 32(%rsi),%rdi movq 40(%rsi),%rbp leaq -128(%rsi),%rsi leaq -128(%rcx),%rcx mulxq %r14,%r8,%r9 call __mulx_mont_383_nonred addq %rdx,%rdx adcq %r15,%r15 adcq %rax,%rax adcq %r12,%r12 adcq %rdi,%rdi adcq %rbp,%rbp movq %rdx,48(%rbx) movq %r15,56(%rbx) movq %rax,64(%rbx) movq %r12,72(%rbx) movq %rdi,80(%rbx) movq %rbp,88(%rbx) leaq 32-128(%rsp),%rsi leaq 32+48(%rsp),%rbx movq 32+48(%rsp),%rdx movq 32+0(%rsp),%r14 movq 32+8(%rsp),%r15 movq 32+16(%rsp),%rax movq 32+24(%rsp),%r12 movq 32+32(%rsp),%rdi movq 32+40(%rsp),%rbp mulxq %r14,%r8,%r9 call __mulx_mont_383_nonred movq 32+96(%rsp),%r14 leaq 128(%rcx),%rcx movq 32+0(%rsp),%r8 andq %r14,%r8 movq 32+8(%rsp),%r9 andq %r14,%r9 movq 32+16(%rsp),%r10 andq %r14,%r10 movq 32+24(%rsp),%r11 andq %r14,%r11 movq 32+32(%rsp),%r13 andq %r14,%r13 andq 32+40(%rsp),%r14 subq %r8,%rdx movq 0(%rcx),%r8 sbbq %r9,%r15 movq 8(%rcx),%r9 sbbq %r10,%rax movq 16(%rcx),%r10 sbbq %r11,%r12 movq 24(%rcx),%r11 sbbq %r13,%rdi movq 32(%rcx),%r13 sbbq %r14,%rbp sbbq %r14,%r14 andq %r14,%r8 andq %r14,%r9 andq %r14,%r10 andq %r14,%r11 andq %r14,%r13 andq 40(%rcx),%r14 addq %r8,%rdx adcq %r9,%r15 adcq %r10,%rax adcq %r11,%r12 adcq %r13,%rdi adcq %r14,%rbp movq %rdx,0(%rbx) movq %r15,8(%rbx) movq %rax,16(%rbx) movq %r12,24(%rbx) movq %rdi,32(%rbx) movq %rbp,40(%rbx) leaq 136(%rsp),%r8 movq 0(%r8),%r15 movq 8(%r8),%r14 movq 16(%r8),%r13 movq 24(%r8),%r12 movq 32(%r8),%rbx movq 40(%r8),%rbp leaq 48(%r8),%rsp .LSEH_epilogue_sqrx_mont_382x: mov 8(%rsp),%rdi mov 16(%rsp),%rsi #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .LSEH_end_sqrx_mont_382x: .section .pdata .p2align 2 .rva .LSEH_begin_mulx_mont_384x .rva .LSEH_body_mulx_mont_384x .rva .LSEH_info_mulx_mont_384x_prologue .rva .LSEH_body_mulx_mont_384x .rva .LSEH_epilogue_mulx_mont_384x .rva .LSEH_info_mulx_mont_384x_body .rva .LSEH_epilogue_mulx_mont_384x .rva .LSEH_end_mulx_mont_384x .rva .LSEH_info_mulx_mont_384x_epilogue .rva .LSEH_begin_sqrx_mont_384x .rva .LSEH_body_sqrx_mont_384x .rva .LSEH_info_sqrx_mont_384x_prologue .rva .LSEH_body_sqrx_mont_384x .rva .LSEH_epilogue_sqrx_mont_384x .rva .LSEH_info_sqrx_mont_384x_body .rva .LSEH_epilogue_sqrx_mont_384x .rva .LSEH_end_sqrx_mont_384x .rva .LSEH_info_sqrx_mont_384x_epilogue .rva .LSEH_begin_mulx_382x .rva .LSEH_body_mulx_382x .rva .LSEH_info_mulx_382x_prologue .rva .LSEH_body_mulx_382x .rva .LSEH_epilogue_mulx_382x .rva .LSEH_info_mulx_382x_body .rva .LSEH_epilogue_mulx_382x .rva .LSEH_end_mulx_382x .rva .LSEH_info_mulx_382x_epilogue .rva .LSEH_begin_sqrx_382x .rva .LSEH_body_sqrx_382x .rva .LSEH_info_sqrx_382x_prologue .rva .LSEH_body_sqrx_382x .rva .LSEH_epilogue_sqrx_382x .rva .LSEH_info_sqrx_382x_body .rva .LSEH_epilogue_sqrx_382x .rva .LSEH_end_sqrx_382x .rva .LSEH_info_sqrx_382x_epilogue .rva .LSEH_begin_mulx_384 .rva .LSEH_body_mulx_384 .rva .LSEH_info_mulx_384_prologue .rva .LSEH_body_mulx_384 
.rva .LSEH_epilogue_mulx_384 .rva .LSEH_info_mulx_384_body .rva .LSEH_epilogue_mulx_384 .rva .LSEH_end_mulx_384 .rva .LSEH_info_mulx_384_epilogue .rva .LSEH_begin_sqrx_384 .rva .LSEH_body_sqrx_384 .rva .LSEH_info_sqrx_384_prologue .rva .LSEH_body_sqrx_384 .rva .LSEH_epilogue_sqrx_384 .rva .LSEH_info_sqrx_384_body .rva .LSEH_epilogue_sqrx_384 .rva .LSEH_end_sqrx_384 .rva .LSEH_info_sqrx_384_epilogue .rva .LSEH_begin_redcx_mont_384 .rva .LSEH_body_redcx_mont_384 .rva .LSEH_info_redcx_mont_384_prologue .rva .LSEH_body_redcx_mont_384 .rva .LSEH_epilogue_redcx_mont_384 .rva .LSEH_info_redcx_mont_384_body .rva .LSEH_epilogue_redcx_mont_384 .rva .LSEH_end_redcx_mont_384 .rva .LSEH_info_redcx_mont_384_epilogue .rva .LSEH_begin_fromx_mont_384 .rva .LSEH_body_fromx_mont_384 .rva .LSEH_info_fromx_mont_384_prologue .rva .LSEH_body_fromx_mont_384 .rva .LSEH_epilogue_fromx_mont_384 .rva .LSEH_info_fromx_mont_384_body .rva .LSEH_epilogue_fromx_mont_384 .rva .LSEH_end_fromx_mont_384 .rva .LSEH_info_fromx_mont_384_epilogue .rva .LSEH_begin_sgn0x_pty_mont_384 .rva .LSEH_body_sgn0x_pty_mont_384 .rva .LSEH_info_sgn0x_pty_mont_384_prologue .rva .LSEH_body_sgn0x_pty_mont_384 .rva .LSEH_epilogue_sgn0x_pty_mont_384 .rva .LSEH_info_sgn0x_pty_mont_384_body .rva .LSEH_epilogue_sgn0x_pty_mont_384 .rva .LSEH_end_sgn0x_pty_mont_384 .rva .LSEH_info_sgn0x_pty_mont_384_epilogue .rva .LSEH_begin_sgn0x_pty_mont_384x .rva .LSEH_body_sgn0x_pty_mont_384x .rva .LSEH_info_sgn0x_pty_mont_384x_prologue .rva .LSEH_body_sgn0x_pty_mont_384x .rva .LSEH_epilogue_sgn0x_pty_mont_384x .rva .LSEH_info_sgn0x_pty_mont_384x_body .rva .LSEH_epilogue_sgn0x_pty_mont_384x .rva .LSEH_end_sgn0x_pty_mont_384x .rva .LSEH_info_sgn0x_pty_mont_384x_epilogue .rva .LSEH_begin_mulx_mont_384 .rva .LSEH_body_mulx_mont_384 .rva .LSEH_info_mulx_mont_384_prologue .rva .LSEH_body_mulx_mont_384 .rva .LSEH_epilogue_mulx_mont_384 .rva .LSEH_info_mulx_mont_384_body .rva .LSEH_epilogue_mulx_mont_384 .rva .LSEH_end_mulx_mont_384 .rva .LSEH_info_mulx_mont_384_epilogue .rva .LSEH_begin_sqrx_mont_384 .rva .LSEH_body_sqrx_mont_384 .rva .LSEH_info_sqrx_mont_384_prologue .rva .LSEH_body_sqrx_mont_384 .rva .LSEH_epilogue_sqrx_mont_384 .rva .LSEH_info_sqrx_mont_384_body .rva .LSEH_epilogue_sqrx_mont_384 .rva .LSEH_end_sqrx_mont_384 .rva .LSEH_info_sqrx_mont_384_epilogue .rva .LSEH_begin_sqrx_n_mul_mont_384 .rva .LSEH_body_sqrx_n_mul_mont_384 .rva .LSEH_info_sqrx_n_mul_mont_384_prologue .rva .LSEH_body_sqrx_n_mul_mont_384 .rva .LSEH_epilogue_sqrx_n_mul_mont_384 .rva .LSEH_info_sqrx_n_mul_mont_384_body .rva .LSEH_epilogue_sqrx_n_mul_mont_384 .rva .LSEH_end_sqrx_n_mul_mont_384 .rva .LSEH_info_sqrx_n_mul_mont_384_epilogue .rva .LSEH_begin_sqrx_n_mul_mont_383 .rva .LSEH_body_sqrx_n_mul_mont_383 .rva .LSEH_info_sqrx_n_mul_mont_383_prologue .rva .LSEH_body_sqrx_n_mul_mont_383 .rva .LSEH_epilogue_sqrx_n_mul_mont_383 .rva .LSEH_info_sqrx_n_mul_mont_383_body .rva .LSEH_epilogue_sqrx_n_mul_mont_383 .rva .LSEH_end_sqrx_n_mul_mont_383 .rva .LSEH_info_sqrx_n_mul_mont_383_epilogue .rva .LSEH_begin_sqrx_mont_382x .rva .LSEH_body_sqrx_mont_382x .rva .LSEH_info_sqrx_mont_382x_prologue .rva .LSEH_body_sqrx_mont_382x .rva .LSEH_epilogue_sqrx_mont_382x .rva .LSEH_info_sqrx_mont_382x_body .rva .LSEH_epilogue_sqrx_mont_382x .rva .LSEH_end_sqrx_mont_382x .rva .LSEH_info_sqrx_mont_382x_epilogue .section .xdata .p2align 3 .LSEH_info_mulx_mont_384x_prologue: .byte 1,0,5,0x0b .byte 0,0x74,1,0 .byte 0,0x64,2,0 .byte 0,0xb3 .byte 0,0 .long 0,0 .LSEH_info_mulx_mont_384x_body: .byte 1,0,18,0 .byte 
0x00,0xf4,0x29,0x00 .byte 0x00,0xe4,0x2a,0x00 .byte 0x00,0xd4,0x2b,0x00 .byte 0x00,0xc4,0x2c,0x00 .byte 0x00,0x34,0x2d,0x00 .byte 0x00,0x54,0x2e,0x00 .byte 0x00,0x74,0x30,0x00 .byte 0x00,0x64,0x31,0x00 .byte 0x00,0x01,0x2f,0x00 .byte 0x00,0x00,0x00,0x00 .byte 0x00,0x00,0x00,0x00 .LSEH_info_mulx_mont_384x_epilogue: .byte 1,0,4,0 .byte 0x00,0x74,0x01,0x00 .byte 0x00,0x64,0x02,0x00 .byte 0x00,0x00,0x00,0x00 .LSEH_info_sqrx_mont_384x_prologue: .byte 1,0,5,0x0b .byte 0,0x74,1,0 .byte 0,0x64,2,0 .byte 0,0xb3 .byte 0,0 .long 0,0 .LSEH_info_sqrx_mont_384x_body: .byte 1,0,18,0 .byte 0x00,0xf4,0x11,0x00 .byte 0x00,0xe4,0x12,0x00 .byte 0x00,0xd4,0x13,0x00 .byte 0x00,0xc4,0x14,0x00 .byte 0x00,0x34,0x15,0x00 .byte 0x00,0x54,0x16,0x00 .byte 0x00,0x74,0x18,0x00 .byte 0x00,0x64,0x19,0x00 .byte 0x00,0x01,0x17,0x00 .byte 0x00,0x00,0x00,0x00 .byte 0x00,0x00,0x00,0x00 .LSEH_info_sqrx_mont_384x_epilogue: .byte 1,0,4,0 .byte 0x00,0x74,0x01,0x00 .byte 0x00,0x64,0x02,0x00 .byte 0x00,0x00,0x00,0x00 .LSEH_info_mulx_382x_prologue: .byte 1,0,5,0x0b .byte 0,0x74,1,0 .byte 0,0x64,2,0 .byte 0,0xb3 .byte 0,0 .long 0,0 .LSEH_info_mulx_382x_body: .byte 1,0,18,0 .byte 0x00,0xf4,0x11,0x00 .byte 0x00,0xe4,0x12,0x00 .byte 0x00,0xd4,0x13,0x00 .byte 0x00,0xc4,0x14,0x00 .byte 0x00,0x34,0x15,0x00 .byte 0x00,0x54,0x16,0x00 .byte 0x00,0x74,0x18,0x00 .byte 0x00,0x64,0x19,0x00 .byte 0x00,0x01,0x17,0x00 .byte 0x00,0x00,0x00,0x00 .byte 0x00,0x00,0x00,0x00 .LSEH_info_mulx_382x_epilogue: .byte 1,0,4,0 .byte 0x00,0x74,0x01,0x00 .byte 0x00,0x64,0x02,0x00 .byte 0x00,0x00,0x00,0x00 .LSEH_info_sqrx_382x_prologue: .byte 1,0,5,0x0b .byte 0,0x74,1,0 .byte 0,0x64,2,0 .byte 0,0xb3 .byte 0,0 .long 0,0 .LSEH_info_sqrx_382x_body: .byte 1,0,17,0 .byte 0x00,0xf4,0x01,0x00 .byte 0x00,0xe4,0x02,0x00 .byte 0x00,0xd4,0x03,0x00 .byte 0x00,0xc4,0x04,0x00 .byte 0x00,0x34,0x05,0x00 .byte 0x00,0x54,0x06,0x00 .byte 0x00,0x74,0x08,0x00 .byte 0x00,0x64,0x09,0x00 .byte 0x00,0x62 .byte 0x00,0x00,0x00,0x00,0x00,0x00 .byte 0x00,0x00,0x00,0x00 .LSEH_info_sqrx_382x_epilogue: .byte 1,0,4,0 .byte 0x00,0x74,0x01,0x00 .byte 0x00,0x64,0x02,0x00 .byte 0x00,0x00,0x00,0x00 .LSEH_info_mulx_384_prologue: .byte 1,0,5,0x0b .byte 0,0x74,1,0 .byte 0,0x64,2,0 .byte 0,0xb3 .byte 0,0 .long 0,0 .LSEH_info_mulx_384_body: .byte 1,0,17,0 .byte 0x00,0xf4,0x00,0x00 .byte 0x00,0xe4,0x01,0x00 .byte 0x00,0xd4,0x02,0x00 .byte 0x00,0xc4,0x03,0x00 .byte 0x00,0x34,0x04,0x00 .byte 0x00,0x54,0x05,0x00 .byte 0x00,0x74,0x07,0x00 .byte 0x00,0x64,0x08,0x00 .byte 0x00,0x52 .byte 0x00,0x00,0x00,0x00,0x00,0x00 .byte 0x00,0x00,0x00,0x00 .LSEH_info_mulx_384_epilogue: .byte 1,0,4,0 .byte 0x00,0x74,0x01,0x00 .byte 0x00,0x64,0x02,0x00 .byte 0x00,0x00,0x00,0x00 .LSEH_info_sqrx_384_prologue: .byte 1,0,5,0x0b .byte 0,0x74,1,0 .byte 0,0x64,2,0 .byte 0,0xb3 .byte 0,0 .long 0,0 .LSEH_info_sqrx_384_body: .byte 1,0,17,0 .byte 0x00,0xf4,0x01,0x00 .byte 0x00,0xe4,0x02,0x00 .byte 0x00,0xd4,0x03,0x00 .byte 0x00,0xc4,0x04,0x00 .byte 0x00,0x34,0x05,0x00 .byte 0x00,0x54,0x06,0x00 .byte 0x00,0x74,0x08,0x00 .byte 0x00,0x64,0x09,0x00 .byte 0x00,0x62 .byte 0x00,0x00,0x00,0x00,0x00,0x00 .byte 0x00,0x00,0x00,0x00 .LSEH_info_sqrx_384_epilogue: .byte 1,0,4,0 .byte 0x00,0x74,0x01,0x00 .byte 0x00,0x64,0x02,0x00 .byte 0x00,0x00,0x00,0x00 .LSEH_info_redcx_mont_384_prologue: .byte 1,0,5,0x0b .byte 0,0x74,1,0 .byte 0,0x64,2,0 .byte 0,0xb3 .byte 0,0 .long 0,0 .LSEH_info_redcx_mont_384_body: .byte 1,0,17,0 .byte 0x00,0xf4,0x01,0x00 .byte 0x00,0xe4,0x02,0x00 .byte 0x00,0xd4,0x03,0x00 .byte 0x00,0xc4,0x04,0x00 .byte 0x00,0x34,0x05,0x00 .byte 
0x00,0x54,0x06,0x00 .byte 0x00,0x74,0x08,0x00 .byte 0x00,0x64,0x09,0x00 .byte 0x00,0x62 .byte 0x00,0x00,0x00,0x00,0x00,0x00 .byte 0x00,0x00,0x00,0x00 .LSEH_info_redcx_mont_384_epilogue: .byte 1,0,4,0 .byte 0x00,0x74,0x01,0x00 .byte 0x00,0x64,0x02,0x00 .byte 0x00,0x00,0x00,0x00 .LSEH_info_fromx_mont_384_prologue: .byte 1,0,5,0x0b .byte 0,0x74,1,0 .byte 0,0x64,2,0 .byte 0,0xb3 .byte 0,0 .long 0,0 .LSEH_info_fromx_mont_384_body: .byte 1,0,17,0 .byte 0x00,0xf4,0x01,0x00 .byte 0x00,0xe4,0x02,0x00 .byte 0x00,0xd4,0x03,0x00 .byte 0x00,0xc4,0x04,0x00 .byte 0x00,0x34,0x05,0x00 .byte 0x00,0x54,0x06,0x00 .byte 0x00,0x74,0x08,0x00 .byte 0x00,0x64,0x09,0x00 .byte 0x00,0x62 .byte 0x00,0x00,0x00,0x00,0x00,0x00 .byte 0x00,0x00,0x00,0x00 .LSEH_info_fromx_mont_384_epilogue: .byte 1,0,4,0 .byte 0x00,0x74,0x01,0x00 .byte 0x00,0x64,0x02,0x00 .byte 0x00,0x00,0x00,0x00 .LSEH_info_sgn0x_pty_mont_384_prologue: .byte 1,0,5,0x0b .byte 0,0x74,1,0 .byte 0,0x64,2,0 .byte 0,0xb3 .byte 0,0 .long 0,0 .LSEH_info_sgn0x_pty_mont_384_body: .byte 1,0,17,0 .byte 0x00,0xf4,0x01,0x00 .byte 0x00,0xe4,0x02,0x00 .byte 0x00,0xd4,0x03,0x00 .byte 0x00,0xc4,0x04,0x00 .byte 0x00,0x34,0x05,0x00 .byte 0x00,0x54,0x06,0x00 .byte 0x00,0x74,0x08,0x00 .byte 0x00,0x64,0x09,0x00 .byte 0x00,0x62 .byte 0x00,0x00,0x00,0x00,0x00,0x00 .byte 0x00,0x00,0x00,0x00 .LSEH_info_sgn0x_pty_mont_384_epilogue: .byte 1,0,4,0 .byte 0x00,0x74,0x01,0x00 .byte 0x00,0x64,0x02,0x00 .byte 0x00,0x00,0x00,0x00 .LSEH_info_sgn0x_pty_mont_384x_prologue: .byte 1,0,5,0x0b .byte 0,0x74,1,0 .byte 0,0x64,2,0 .byte 0,0xb3 .byte 0,0 .long 0,0 .LSEH_info_sgn0x_pty_mont_384x_body: .byte 1,0,17,0 .byte 0x00,0xf4,0x01,0x00 .byte 0x00,0xe4,0x02,0x00 .byte 0x00,0xd4,0x03,0x00 .byte 0x00,0xc4,0x04,0x00 .byte 0x00,0x34,0x05,0x00 .byte 0x00,0x54,0x06,0x00 .byte 0x00,0x74,0x08,0x00 .byte 0x00,0x64,0x09,0x00 .byte 0x00,0x62 .byte 0x00,0x00,0x00,0x00,0x00,0x00 .byte 0x00,0x00,0x00,0x00 .LSEH_info_sgn0x_pty_mont_384x_epilogue: .byte 1,0,4,0 .byte 0x00,0x74,0x01,0x00 .byte 0x00,0x64,0x02,0x00 .byte 0x00,0x00,0x00,0x00 .LSEH_info_mulx_mont_384_prologue: .byte 1,0,5,0x0b .byte 0,0x74,1,0 .byte 0,0x64,2,0 .byte 0,0xb3 .byte 0,0 .long 0,0 .LSEH_info_mulx_mont_384_body: .byte 1,0,17,0 .byte 0x00,0xf4,0x03,0x00 .byte 0x00,0xe4,0x04,0x00 .byte 0x00,0xd4,0x05,0x00 .byte 0x00,0xc4,0x06,0x00 .byte 0x00,0x34,0x07,0x00 .byte 0x00,0x54,0x08,0x00 .byte 0x00,0x74,0x0a,0x00 .byte 0x00,0x64,0x0b,0x00 .byte 0x00,0x82 .byte 0x00,0x00,0x00,0x00,0x00,0x00 .byte 0x00,0x00,0x00,0x00 .LSEH_info_mulx_mont_384_epilogue: .byte 1,0,4,0 .byte 0x00,0x74,0x01,0x00 .byte 0x00,0x64,0x02,0x00 .byte 0x00,0x00,0x00,0x00 .LSEH_info_sqrx_mont_384_prologue: .byte 1,0,5,0x0b .byte 0,0x74,1,0 .byte 0,0x64,2,0 .byte 0,0xb3 .byte 0,0 .long 0,0 .LSEH_info_sqrx_mont_384_body: .byte 1,0,17,0 .byte 0x00,0xf4,0x03,0x00 .byte 0x00,0xe4,0x04,0x00 .byte 0x00,0xd4,0x05,0x00 .byte 0x00,0xc4,0x06,0x00 .byte 0x00,0x34,0x07,0x00 .byte 0x00,0x54,0x08,0x00 .byte 0x00,0x74,0x0a,0x00 .byte 0x00,0x64,0x0b,0x00 .byte 0x00,0x82 .byte 0x00,0x00,0x00,0x00,0x00,0x00 .byte 0x00,0x00,0x00,0x00 .LSEH_info_sqrx_mont_384_epilogue: .byte 1,0,4,0 .byte 0x00,0x74,0x01,0x00 .byte 0x00,0x64,0x02,0x00 .byte 0x00,0x00,0x00,0x00 .LSEH_info_sqrx_n_mul_mont_384_prologue: .byte 1,0,5,0x0b .byte 0,0x74,1,0 .byte 0,0x64,2,0 .byte 0,0xb3 .byte 0,0 .long 0,0 .LSEH_info_sqrx_n_mul_mont_384_body: .byte 1,0,17,0 .byte 0x00,0xf4,0x05,0x00 .byte 0x00,0xe4,0x06,0x00 .byte 0x00,0xd4,0x07,0x00 .byte 0x00,0xc4,0x08,0x00 .byte 0x00,0x34,0x09,0x00 .byte 0x00,0x54,0x0a,0x00 .byte 
0x00,0x74,0x0c,0x00
.byte 0x00,0x64,0x0d,0x00
.byte 0x00,0xa2
.byte 0x00,0x00,0x00,0x00,0x00,0x00
.byte 0x00,0x00,0x00,0x00
.LSEH_info_sqrx_n_mul_mont_384_epilogue:
.byte 1,0,4,0
.byte 0x00,0x74,0x01,0x00
.byte 0x00,0x64,0x02,0x00
.byte 0x00,0x00,0x00,0x00
.LSEH_info_sqrx_n_mul_mont_383_prologue:
.byte 1,0,5,0x0b
.byte 0,0x74,1,0
.byte 0,0x64,2,0
.byte 0,0xb3
.byte 0,0
.long 0,0
.LSEH_info_sqrx_n_mul_mont_383_body:
.byte 1,0,17,0
.byte 0x00,0xf4,0x05,0x00
.byte 0x00,0xe4,0x06,0x00
.byte 0x00,0xd4,0x07,0x00
.byte 0x00,0xc4,0x08,0x00
.byte 0x00,0x34,0x09,0x00
.byte 0x00,0x54,0x0a,0x00
.byte 0x00,0x74,0x0c,0x00
.byte 0x00,0x64,0x0d,0x00
.byte 0x00,0xa2
.byte 0x00,0x00,0x00,0x00,0x00,0x00
.byte 0x00,0x00,0x00,0x00
.LSEH_info_sqrx_n_mul_mont_383_epilogue:
.byte 1,0,4,0
.byte 0x00,0x74,0x01,0x00
.byte 0x00,0x64,0x02,0x00
.byte 0x00,0x00,0x00,0x00
.LSEH_info_sqrx_mont_382x_prologue:
.byte 1,0,5,0x0b
.byte 0,0x74,1,0
.byte 0,0x64,2,0
.byte 0,0xb3
.byte 0,0
.long 0,0
.LSEH_info_sqrx_mont_382x_body:
.byte 1,0,18,0
.byte 0x00,0xf4,0x11,0x00
.byte 0x00,0xe4,0x12,0x00
.byte 0x00,0xd4,0x13,0x00
.byte 0x00,0xc4,0x14,0x00
.byte 0x00,0x34,0x15,0x00
.byte 0x00,0x54,0x16,0x00
.byte 0x00,0x74,0x18,0x00
.byte 0x00,0x64,0x19,0x00
.byte 0x00,0x01,0x17,0x00
.byte 0x00,0x00,0x00,0x00
.byte 0x00,0x00,0x00,0x00
.LSEH_info_sqrx_mont_382x_epilogue:
.byte 1,0,4,0
.byte 0x00,0x74,0x01,0x00
.byte 0x00,0x64,0x02,0x00
.byte 0x00,0x00,0x00,0x00

================================================
FILE: build/coff/sha256-armv8.S
================================================
//
// Copyright Supranational LLC
// Licensed under the Apache License, Version 2.0, see LICENSE for details.
// SPDX-License-Identifier: Apache-2.0
//
// ====================================================================
// Written by Andy Polyakov, @dot-asm, initially for the OpenSSL
// project.
// ====================================================================
//
// sha256_block procedure for ARMv8.
//
// This module is stripped of scalar code paths, with rationale that all
// known processors are NEON-capable.
//
// See original module at CRYPTOGAMS for further details.

.comm __blst_platform_cap,4
.text

.p2align 6
.LK256:
.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
.long 0 //terminator

.byte 83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,64,100,111,116,45,97,115,109,0
.align 2

.p2align 2
.globl blst_sha256_block_armv8
.def blst_sha256_block_armv8; .type 32; .endef
.p2align 6
blst_sha256_block_armv8:
hint #34
.Lv8_entry:
stp x29,x30,[sp,#-2*__SIZEOF_POINTER__]!
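// Hardware SHA-256 instruction path (sha256h/sha256h2/sha256su0/sha256su1,
// emitted as .long encodings below): x0 points at the eight 32-bit state
// words, x1 at the 64-byte input blocks, x2 holds the block count; v0/v1
// carry the running state and .LK256 feeds the round constants sixteen
// bytes at a time.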
add x29,sp,#0 ld1 {v0.4s,v1.4s},[x0] adr x3,.LK256 .Loop_hw: ld1 {v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64 sub x2,x2,#1 ld1 {v16.4s},[x3],#16 rev32 v4.16b,v4.16b rev32 v5.16b,v5.16b rev32 v6.16b,v6.16b rev32 v7.16b,v7.16b orr v18.16b,v0.16b,v0.16b // offload orr v19.16b,v1.16b,v1.16b ld1 {v17.4s},[x3],#16 add v16.4s,v16.4s,v4.4s .long 0x5e2828a4 //sha256su0 v4.16b,v5.16b orr v2.16b,v0.16b,v0.16b .long 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s .long 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s .long 0x5e0760c4 //sha256su1 v4.16b,v6.16b,v7.16b ld1 {v16.4s},[x3],#16 add v17.4s,v17.4s,v5.4s .long 0x5e2828c5 //sha256su0 v5.16b,v6.16b orr v2.16b,v0.16b,v0.16b .long 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s .long 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s .long 0x5e0460e5 //sha256su1 v5.16b,v7.16b,v4.16b ld1 {v17.4s},[x3],#16 add v16.4s,v16.4s,v6.4s .long 0x5e2828e6 //sha256su0 v6.16b,v7.16b orr v2.16b,v0.16b,v0.16b .long 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s .long 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s .long 0x5e056086 //sha256su1 v6.16b,v4.16b,v5.16b ld1 {v16.4s},[x3],#16 add v17.4s,v17.4s,v7.4s .long 0x5e282887 //sha256su0 v7.16b,v4.16b orr v2.16b,v0.16b,v0.16b .long 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s .long 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s .long 0x5e0660a7 //sha256su1 v7.16b,v5.16b,v6.16b ld1 {v17.4s},[x3],#16 add v16.4s,v16.4s,v4.4s .long 0x5e2828a4 //sha256su0 v4.16b,v5.16b orr v2.16b,v0.16b,v0.16b .long 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s .long 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s .long 0x5e0760c4 //sha256su1 v4.16b,v6.16b,v7.16b ld1 {v16.4s},[x3],#16 add v17.4s,v17.4s,v5.4s .long 0x5e2828c5 //sha256su0 v5.16b,v6.16b orr v2.16b,v0.16b,v0.16b .long 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s .long 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s .long 0x5e0460e5 //sha256su1 v5.16b,v7.16b,v4.16b ld1 {v17.4s},[x3],#16 add v16.4s,v16.4s,v6.4s .long 0x5e2828e6 //sha256su0 v6.16b,v7.16b orr v2.16b,v0.16b,v0.16b .long 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s .long 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s .long 0x5e056086 //sha256su1 v6.16b,v4.16b,v5.16b ld1 {v16.4s},[x3],#16 add v17.4s,v17.4s,v7.4s .long 0x5e282887 //sha256su0 v7.16b,v4.16b orr v2.16b,v0.16b,v0.16b .long 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s .long 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s .long 0x5e0660a7 //sha256su1 v7.16b,v5.16b,v6.16b ld1 {v17.4s},[x3],#16 add v16.4s,v16.4s,v4.4s .long 0x5e2828a4 //sha256su0 v4.16b,v5.16b orr v2.16b,v0.16b,v0.16b .long 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s .long 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s .long 0x5e0760c4 //sha256su1 v4.16b,v6.16b,v7.16b ld1 {v16.4s},[x3],#16 add v17.4s,v17.4s,v5.4s .long 0x5e2828c5 //sha256su0 v5.16b,v6.16b orr v2.16b,v0.16b,v0.16b .long 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s .long 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s .long 0x5e0460e5 //sha256su1 v5.16b,v7.16b,v4.16b ld1 {v17.4s},[x3],#16 add v16.4s,v16.4s,v6.4s .long 0x5e2828e6 //sha256su0 v6.16b,v7.16b orr v2.16b,v0.16b,v0.16b .long 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s .long 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s .long 0x5e056086 //sha256su1 v6.16b,v4.16b,v5.16b ld1 {v16.4s},[x3],#16 add v17.4s,v17.4s,v7.4s .long 0x5e282887 //sha256su0 v7.16b,v4.16b orr v2.16b,v0.16b,v0.16b .long 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s .long 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s .long 0x5e0660a7 //sha256su1 v7.16b,v5.16b,v6.16b ld1 {v17.4s},[x3],#16 add v16.4s,v16.4s,v4.4s orr v2.16b,v0.16b,v0.16b .long 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s .long 0x5e105041 
//sha256h2 v1.16b,v2.16b,v16.4s ld1 {v16.4s},[x3],#16 add v17.4s,v17.4s,v5.4s orr v2.16b,v0.16b,v0.16b .long 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s .long 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s ld1 {v17.4s},[x3] add v16.4s,v16.4s,v6.4s sub x3,x3,#64*4-16 // rewind orr v2.16b,v0.16b,v0.16b .long 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s .long 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s add v17.4s,v17.4s,v7.4s orr v2.16b,v0.16b,v0.16b .long 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s .long 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s add v0.4s,v0.4s,v18.4s add v1.4s,v1.4s,v19.4s cbnz x2,.Loop_hw st1 {v0.4s,v1.4s},[x0] ldr x29,[sp],#2*__SIZEOF_POINTER__ ret .globl blst_sha256_block_data_order .def blst_sha256_block_data_order; .type 32; .endef .p2align 4 blst_sha256_block_data_order: hint #34 adrp x16,__blst_platform_cap ldr w16,[x16,#:lo12:__blst_platform_cap] tst w16,#1 b.ne .Lv8_entry stp x29, x30, [sp, #-2*__SIZEOF_POINTER__]! mov x29, sp sub sp,sp,#16*4 adr x16,.LK256 add x2,x1,x2,lsl#6 // len to point at the end of inp ld1 {v0.16b},[x1], #16 ld1 {v1.16b},[x1], #16 ld1 {v2.16b},[x1], #16 ld1 {v3.16b},[x1], #16 ld1 {v4.4s},[x16], #16 ld1 {v5.4s},[x16], #16 ld1 {v6.4s},[x16], #16 ld1 {v7.4s},[x16], #16 rev32 v0.16b,v0.16b // yes, even on rev32 v1.16b,v1.16b // big-endian rev32 v2.16b,v2.16b rev32 v3.16b,v3.16b mov x17,sp add v4.4s,v4.4s,v0.4s add v5.4s,v5.4s,v1.4s add v6.4s,v6.4s,v2.4s st1 {v4.4s,v5.4s},[x17], #32 add v7.4s,v7.4s,v3.4s st1 {v6.4s,v7.4s},[x17] sub x17,x17,#32 ldp w3,w4,[x0] ldp w5,w6,[x0,#8] ldp w7,w8,[x0,#16] ldp w9,w10,[x0,#24] ldr w12,[sp,#0] mov w13,wzr eor w14,w4,w5 mov w15,wzr b .L_00_48 .p2align 4 .L_00_48: ext v4.16b,v0.16b,v1.16b,#4 add w10,w10,w12 add w3,w3,w15 and w12,w8,w7 bic w15,w9,w7 ext v7.16b,v2.16b,v3.16b,#4 eor w11,w7,w7,ror#5 add w3,w3,w13 mov d19,v3.d[1] orr w12,w12,w15 eor w11,w11,w7,ror#19 ushr v6.4s,v4.4s,#7 eor w15,w3,w3,ror#11 ushr v5.4s,v4.4s,#3 add w10,w10,w12 add v0.4s,v0.4s,v7.4s ror w11,w11,#6 sli v6.4s,v4.4s,#25 eor w13,w3,w4 eor w15,w15,w3,ror#20 ushr v7.4s,v4.4s,#18 add w10,w10,w11 ldr w12,[sp,#4] and w14,w14,w13 eor v5.16b,v5.16b,v6.16b ror w15,w15,#2 add w6,w6,w10 sli v7.4s,v4.4s,#14 eor w14,w14,w4 ushr v16.4s,v19.4s,#17 add w9,w9,w12 add w10,w10,w15 and w12,w7,w6 eor v5.16b,v5.16b,v7.16b bic w15,w8,w6 eor w11,w6,w6,ror#5 sli v16.4s,v19.4s,#15 add w10,w10,w14 orr w12,w12,w15 ushr v17.4s,v19.4s,#10 eor w11,w11,w6,ror#19 eor w15,w10,w10,ror#11 ushr v7.4s,v19.4s,#19 add w9,w9,w12 ror w11,w11,#6 add v0.4s,v0.4s,v5.4s eor w14,w10,w3 eor w15,w15,w10,ror#20 sli v7.4s,v19.4s,#13 add w9,w9,w11 ldr w12,[sp,#8] and w13,w13,w14 eor v17.16b,v17.16b,v16.16b ror w15,w15,#2 add w5,w5,w9 eor w13,w13,w3 eor v17.16b,v17.16b,v7.16b add w8,w8,w12 add w9,w9,w15 and w12,w6,w5 add v0.4s,v0.4s,v17.4s bic w15,w7,w5 eor w11,w5,w5,ror#5 add w9,w9,w13 ushr v18.4s,v0.4s,#17 orr w12,w12,w15 ushr v19.4s,v0.4s,#10 eor w11,w11,w5,ror#19 eor w15,w9,w9,ror#11 sli v18.4s,v0.4s,#15 add w8,w8,w12 ushr v17.4s,v0.4s,#19 ror w11,w11,#6 eor w13,w9,w10 eor v19.16b,v19.16b,v18.16b eor w15,w15,w9,ror#20 add w8,w8,w11 sli v17.4s,v0.4s,#13 ldr w12,[sp,#12] and w14,w14,w13 ror w15,w15,#2 ld1 {v4.4s},[x16], #16 add w4,w4,w8 eor v19.16b,v19.16b,v17.16b eor w14,w14,w10 eor v17.16b,v17.16b,v17.16b add w7,w7,w12 add w8,w8,w15 and w12,w5,w4 mov v17.d[1],v19.d[0] bic w15,w6,w4 eor w11,w4,w4,ror#5 add w8,w8,w14 add v0.4s,v0.4s,v17.4s orr w12,w12,w15 eor w11,w11,w4,ror#19 eor w15,w8,w8,ror#11 add v4.4s,v4.4s,v0.4s add w7,w7,w12 ror w11,w11,#6 eor w14,w8,w9 eor w15,w15,w8,ror#20 add w7,w7,w11 
ldr w12,[sp,#16] and w13,w13,w14 ror w15,w15,#2 add w3,w3,w7 eor w13,w13,w9 st1 {v4.4s},[x17], #16 ext v4.16b,v1.16b,v2.16b,#4 add w6,w6,w12 add w7,w7,w15 and w12,w4,w3 bic w15,w5,w3 ext v7.16b,v3.16b,v0.16b,#4 eor w11,w3,w3,ror#5 add w7,w7,w13 mov d19,v0.d[1] orr w12,w12,w15 eor w11,w11,w3,ror#19 ushr v6.4s,v4.4s,#7 eor w15,w7,w7,ror#11 ushr v5.4s,v4.4s,#3 add w6,w6,w12 add v1.4s,v1.4s,v7.4s ror w11,w11,#6 sli v6.4s,v4.4s,#25 eor w13,w7,w8 eor w15,w15,w7,ror#20 ushr v7.4s,v4.4s,#18 add w6,w6,w11 ldr w12,[sp,#20] and w14,w14,w13 eor v5.16b,v5.16b,v6.16b ror w15,w15,#2 add w10,w10,w6 sli v7.4s,v4.4s,#14 eor w14,w14,w8 ushr v16.4s,v19.4s,#17 add w5,w5,w12 add w6,w6,w15 and w12,w3,w10 eor v5.16b,v5.16b,v7.16b bic w15,w4,w10 eor w11,w10,w10,ror#5 sli v16.4s,v19.4s,#15 add w6,w6,w14 orr w12,w12,w15 ushr v17.4s,v19.4s,#10 eor w11,w11,w10,ror#19 eor w15,w6,w6,ror#11 ushr v7.4s,v19.4s,#19 add w5,w5,w12 ror w11,w11,#6 add v1.4s,v1.4s,v5.4s eor w14,w6,w7 eor w15,w15,w6,ror#20 sli v7.4s,v19.4s,#13 add w5,w5,w11 ldr w12,[sp,#24] and w13,w13,w14 eor v17.16b,v17.16b,v16.16b ror w15,w15,#2 add w9,w9,w5 eor w13,w13,w7 eor v17.16b,v17.16b,v7.16b add w4,w4,w12 add w5,w5,w15 and w12,w10,w9 add v1.4s,v1.4s,v17.4s bic w15,w3,w9 eor w11,w9,w9,ror#5 add w5,w5,w13 ushr v18.4s,v1.4s,#17 orr w12,w12,w15 ushr v19.4s,v1.4s,#10 eor w11,w11,w9,ror#19 eor w15,w5,w5,ror#11 sli v18.4s,v1.4s,#15 add w4,w4,w12 ushr v17.4s,v1.4s,#19 ror w11,w11,#6 eor w13,w5,w6 eor v19.16b,v19.16b,v18.16b eor w15,w15,w5,ror#20 add w4,w4,w11 sli v17.4s,v1.4s,#13 ldr w12,[sp,#28] and w14,w14,w13 ror w15,w15,#2 ld1 {v4.4s},[x16], #16 add w8,w8,w4 eor v19.16b,v19.16b,v17.16b eor w14,w14,w6 eor v17.16b,v17.16b,v17.16b add w3,w3,w12 add w4,w4,w15 and w12,w9,w8 mov v17.d[1],v19.d[0] bic w15,w10,w8 eor w11,w8,w8,ror#5 add w4,w4,w14 add v1.4s,v1.4s,v17.4s orr w12,w12,w15 eor w11,w11,w8,ror#19 eor w15,w4,w4,ror#11 add v4.4s,v4.4s,v1.4s add w3,w3,w12 ror w11,w11,#6 eor w14,w4,w5 eor w15,w15,w4,ror#20 add w3,w3,w11 ldr w12,[sp,#32] and w13,w13,w14 ror w15,w15,#2 add w7,w7,w3 eor w13,w13,w5 st1 {v4.4s},[x17], #16 ext v4.16b,v2.16b,v3.16b,#4 add w10,w10,w12 add w3,w3,w15 and w12,w8,w7 bic w15,w9,w7 ext v7.16b,v0.16b,v1.16b,#4 eor w11,w7,w7,ror#5 add w3,w3,w13 mov d19,v1.d[1] orr w12,w12,w15 eor w11,w11,w7,ror#19 ushr v6.4s,v4.4s,#7 eor w15,w3,w3,ror#11 ushr v5.4s,v4.4s,#3 add w10,w10,w12 add v2.4s,v2.4s,v7.4s ror w11,w11,#6 sli v6.4s,v4.4s,#25 eor w13,w3,w4 eor w15,w15,w3,ror#20 ushr v7.4s,v4.4s,#18 add w10,w10,w11 ldr w12,[sp,#36] and w14,w14,w13 eor v5.16b,v5.16b,v6.16b ror w15,w15,#2 add w6,w6,w10 sli v7.4s,v4.4s,#14 eor w14,w14,w4 ushr v16.4s,v19.4s,#17 add w9,w9,w12 add w10,w10,w15 and w12,w7,w6 eor v5.16b,v5.16b,v7.16b bic w15,w8,w6 eor w11,w6,w6,ror#5 sli v16.4s,v19.4s,#15 add w10,w10,w14 orr w12,w12,w15 ushr v17.4s,v19.4s,#10 eor w11,w11,w6,ror#19 eor w15,w10,w10,ror#11 ushr v7.4s,v19.4s,#19 add w9,w9,w12 ror w11,w11,#6 add v2.4s,v2.4s,v5.4s eor w14,w10,w3 eor w15,w15,w10,ror#20 sli v7.4s,v19.4s,#13 add w9,w9,w11 ldr w12,[sp,#40] and w13,w13,w14 eor v17.16b,v17.16b,v16.16b ror w15,w15,#2 add w5,w5,w9 eor w13,w13,w3 eor v17.16b,v17.16b,v7.16b add w8,w8,w12 add w9,w9,w15 and w12,w6,w5 add v2.4s,v2.4s,v17.4s bic w15,w7,w5 eor w11,w5,w5,ror#5 add w9,w9,w13 ushr v18.4s,v2.4s,#17 orr w12,w12,w15 ushr v19.4s,v2.4s,#10 eor w11,w11,w5,ror#19 eor w15,w9,w9,ror#11 sli v18.4s,v2.4s,#15 add w8,w8,w12 ushr v17.4s,v2.4s,#19 ror w11,w11,#6 eor w13,w9,w10 eor v19.16b,v19.16b,v18.16b eor w15,w15,w9,ror#20 add w8,w8,w11 sli v17.4s,v2.4s,#13 ldr w12,[sp,#44] and 
w14,w14,w13 ror w15,w15,#2 ld1 {v4.4s},[x16], #16 add w4,w4,w8 eor v19.16b,v19.16b,v17.16b eor w14,w14,w10 eor v17.16b,v17.16b,v17.16b add w7,w7,w12 add w8,w8,w15 and w12,w5,w4 mov v17.d[1],v19.d[0] bic w15,w6,w4 eor w11,w4,w4,ror#5 add w8,w8,w14 add v2.4s,v2.4s,v17.4s orr w12,w12,w15 eor w11,w11,w4,ror#19 eor w15,w8,w8,ror#11 add v4.4s,v4.4s,v2.4s add w7,w7,w12 ror w11,w11,#6 eor w14,w8,w9 eor w15,w15,w8,ror#20 add w7,w7,w11 ldr w12,[sp,#48] and w13,w13,w14 ror w15,w15,#2 add w3,w3,w7 eor w13,w13,w9 st1 {v4.4s},[x17], #16 ext v4.16b,v3.16b,v0.16b,#4 add w6,w6,w12 add w7,w7,w15 and w12,w4,w3 bic w15,w5,w3 ext v7.16b,v1.16b,v2.16b,#4 eor w11,w3,w3,ror#5 add w7,w7,w13 mov d19,v2.d[1] orr w12,w12,w15 eor w11,w11,w3,ror#19 ushr v6.4s,v4.4s,#7 eor w15,w7,w7,ror#11 ushr v5.4s,v4.4s,#3 add w6,w6,w12 add v3.4s,v3.4s,v7.4s ror w11,w11,#6 sli v6.4s,v4.4s,#25 eor w13,w7,w8 eor w15,w15,w7,ror#20 ushr v7.4s,v4.4s,#18 add w6,w6,w11 ldr w12,[sp,#52] and w14,w14,w13 eor v5.16b,v5.16b,v6.16b ror w15,w15,#2 add w10,w10,w6 sli v7.4s,v4.4s,#14 eor w14,w14,w8 ushr v16.4s,v19.4s,#17 add w5,w5,w12 add w6,w6,w15 and w12,w3,w10 eor v5.16b,v5.16b,v7.16b bic w15,w4,w10 eor w11,w10,w10,ror#5 sli v16.4s,v19.4s,#15 add w6,w6,w14 orr w12,w12,w15 ushr v17.4s,v19.4s,#10 eor w11,w11,w10,ror#19 eor w15,w6,w6,ror#11 ushr v7.4s,v19.4s,#19 add w5,w5,w12 ror w11,w11,#6 add v3.4s,v3.4s,v5.4s eor w14,w6,w7 eor w15,w15,w6,ror#20 sli v7.4s,v19.4s,#13 add w5,w5,w11 ldr w12,[sp,#56] and w13,w13,w14 eor v17.16b,v17.16b,v16.16b ror w15,w15,#2 add w9,w9,w5 eor w13,w13,w7 eor v17.16b,v17.16b,v7.16b add w4,w4,w12 add w5,w5,w15 and w12,w10,w9 add v3.4s,v3.4s,v17.4s bic w15,w3,w9 eor w11,w9,w9,ror#5 add w5,w5,w13 ushr v18.4s,v3.4s,#17 orr w12,w12,w15 ushr v19.4s,v3.4s,#10 eor w11,w11,w9,ror#19 eor w15,w5,w5,ror#11 sli v18.4s,v3.4s,#15 add w4,w4,w12 ushr v17.4s,v3.4s,#19 ror w11,w11,#6 eor w13,w5,w6 eor v19.16b,v19.16b,v18.16b eor w15,w15,w5,ror#20 add w4,w4,w11 sli v17.4s,v3.4s,#13 ldr w12,[sp,#60] and w14,w14,w13 ror w15,w15,#2 ld1 {v4.4s},[x16], #16 add w8,w8,w4 eor v19.16b,v19.16b,v17.16b eor w14,w14,w6 eor v17.16b,v17.16b,v17.16b add w3,w3,w12 add w4,w4,w15 and w12,w9,w8 mov v17.d[1],v19.d[0] bic w15,w10,w8 eor w11,w8,w8,ror#5 add w4,w4,w14 add v3.4s,v3.4s,v17.4s orr w12,w12,w15 eor w11,w11,w8,ror#19 eor w15,w4,w4,ror#11 add v4.4s,v4.4s,v3.4s add w3,w3,w12 ror w11,w11,#6 eor w14,w4,w5 eor w15,w15,w4,ror#20 add w3,w3,w11 ldr w12,[x16] and w13,w13,w14 ror w15,w15,#2 add w7,w7,w3 eor w13,w13,w5 st1 {v4.4s},[x17], #16 cmp w12,#0 // check for K256 terminator ldr w12,[sp,#0] sub x17,x17,#64 bne .L_00_48 sub x16,x16,#256 cmp x1,x2 mov x17, #-64 csel x17, x17, xzr, eq add x1,x1,x17 mov x17,sp add w10,w10,w12 add w3,w3,w15 and w12,w8,w7 ld1 {v0.16b},[x1],#16 bic w15,w9,w7 eor w11,w7,w7,ror#5 ld1 {v4.4s},[x16],#16 add w3,w3,w13 orr w12,w12,w15 eor w11,w11,w7,ror#19 eor w15,w3,w3,ror#11 rev32 v0.16b,v0.16b add w10,w10,w12 ror w11,w11,#6 eor w13,w3,w4 eor w15,w15,w3,ror#20 add v4.4s,v4.4s,v0.4s add w10,w10,w11 ldr w12,[sp,#4] and w14,w14,w13 ror w15,w15,#2 add w6,w6,w10 eor w14,w14,w4 add w9,w9,w12 add w10,w10,w15 and w12,w7,w6 bic w15,w8,w6 eor w11,w6,w6,ror#5 add w10,w10,w14 orr w12,w12,w15 eor w11,w11,w6,ror#19 eor w15,w10,w10,ror#11 add w9,w9,w12 ror w11,w11,#6 eor w14,w10,w3 eor w15,w15,w10,ror#20 add w9,w9,w11 ldr w12,[sp,#8] and w13,w13,w14 ror w15,w15,#2 add w5,w5,w9 eor w13,w13,w3 add w8,w8,w12 add w9,w9,w15 and w12,w6,w5 bic w15,w7,w5 eor w11,w5,w5,ror#5 add w9,w9,w13 orr w12,w12,w15 eor w11,w11,w5,ror#19 eor w15,w9,w9,ror#11 add 
w8,w8,w12 ror w11,w11,#6 eor w13,w9,w10 eor w15,w15,w9,ror#20 add w8,w8,w11 ldr w12,[sp,#12] and w14,w14,w13 ror w15,w15,#2 add w4,w4,w8 eor w14,w14,w10 add w7,w7,w12 add w8,w8,w15 and w12,w5,w4 bic w15,w6,w4 eor w11,w4,w4,ror#5 add w8,w8,w14 orr w12,w12,w15 eor w11,w11,w4,ror#19 eor w15,w8,w8,ror#11 add w7,w7,w12 ror w11,w11,#6 eor w14,w8,w9 eor w15,w15,w8,ror#20 add w7,w7,w11 ldr w12,[sp,#16] and w13,w13,w14 ror w15,w15,#2 add w3,w3,w7 eor w13,w13,w9 st1 {v4.4s},[x17], #16 add w6,w6,w12 add w7,w7,w15 and w12,w4,w3 ld1 {v1.16b},[x1],#16 bic w15,w5,w3 eor w11,w3,w3,ror#5 ld1 {v4.4s},[x16],#16 add w7,w7,w13 orr w12,w12,w15 eor w11,w11,w3,ror#19 eor w15,w7,w7,ror#11 rev32 v1.16b,v1.16b add w6,w6,w12 ror w11,w11,#6 eor w13,w7,w8 eor w15,w15,w7,ror#20 add v4.4s,v4.4s,v1.4s add w6,w6,w11 ldr w12,[sp,#20] and w14,w14,w13 ror w15,w15,#2 add w10,w10,w6 eor w14,w14,w8 add w5,w5,w12 add w6,w6,w15 and w12,w3,w10 bic w15,w4,w10 eor w11,w10,w10,ror#5 add w6,w6,w14 orr w12,w12,w15 eor w11,w11,w10,ror#19 eor w15,w6,w6,ror#11 add w5,w5,w12 ror w11,w11,#6 eor w14,w6,w7 eor w15,w15,w6,ror#20 add w5,w5,w11 ldr w12,[sp,#24] and w13,w13,w14 ror w15,w15,#2 add w9,w9,w5 eor w13,w13,w7 add w4,w4,w12 add w5,w5,w15 and w12,w10,w9 bic w15,w3,w9 eor w11,w9,w9,ror#5 add w5,w5,w13 orr w12,w12,w15 eor w11,w11,w9,ror#19 eor w15,w5,w5,ror#11 add w4,w4,w12 ror w11,w11,#6 eor w13,w5,w6 eor w15,w15,w5,ror#20 add w4,w4,w11 ldr w12,[sp,#28] and w14,w14,w13 ror w15,w15,#2 add w8,w8,w4 eor w14,w14,w6 add w3,w3,w12 add w4,w4,w15 and w12,w9,w8 bic w15,w10,w8 eor w11,w8,w8,ror#5 add w4,w4,w14 orr w12,w12,w15 eor w11,w11,w8,ror#19 eor w15,w4,w4,ror#11 add w3,w3,w12 ror w11,w11,#6 eor w14,w4,w5 eor w15,w15,w4,ror#20 add w3,w3,w11 ldr w12,[sp,#32] and w13,w13,w14 ror w15,w15,#2 add w7,w7,w3 eor w13,w13,w5 st1 {v4.4s},[x17], #16 add w10,w10,w12 add w3,w3,w15 and w12,w8,w7 ld1 {v2.16b},[x1],#16 bic w15,w9,w7 eor w11,w7,w7,ror#5 ld1 {v4.4s},[x16],#16 add w3,w3,w13 orr w12,w12,w15 eor w11,w11,w7,ror#19 eor w15,w3,w3,ror#11 rev32 v2.16b,v2.16b add w10,w10,w12 ror w11,w11,#6 eor w13,w3,w4 eor w15,w15,w3,ror#20 add v4.4s,v4.4s,v2.4s add w10,w10,w11 ldr w12,[sp,#36] and w14,w14,w13 ror w15,w15,#2 add w6,w6,w10 eor w14,w14,w4 add w9,w9,w12 add w10,w10,w15 and w12,w7,w6 bic w15,w8,w6 eor w11,w6,w6,ror#5 add w10,w10,w14 orr w12,w12,w15 eor w11,w11,w6,ror#19 eor w15,w10,w10,ror#11 add w9,w9,w12 ror w11,w11,#6 eor w14,w10,w3 eor w15,w15,w10,ror#20 add w9,w9,w11 ldr w12,[sp,#40] and w13,w13,w14 ror w15,w15,#2 add w5,w5,w9 eor w13,w13,w3 add w8,w8,w12 add w9,w9,w15 and w12,w6,w5 bic w15,w7,w5 eor w11,w5,w5,ror#5 add w9,w9,w13 orr w12,w12,w15 eor w11,w11,w5,ror#19 eor w15,w9,w9,ror#11 add w8,w8,w12 ror w11,w11,#6 eor w13,w9,w10 eor w15,w15,w9,ror#20 add w8,w8,w11 ldr w12,[sp,#44] and w14,w14,w13 ror w15,w15,#2 add w4,w4,w8 eor w14,w14,w10 add w7,w7,w12 add w8,w8,w15 and w12,w5,w4 bic w15,w6,w4 eor w11,w4,w4,ror#5 add w8,w8,w14 orr w12,w12,w15 eor w11,w11,w4,ror#19 eor w15,w8,w8,ror#11 add w7,w7,w12 ror w11,w11,#6 eor w14,w8,w9 eor w15,w15,w8,ror#20 add w7,w7,w11 ldr w12,[sp,#48] and w13,w13,w14 ror w15,w15,#2 add w3,w3,w7 eor w13,w13,w9 st1 {v4.4s},[x17], #16 add w6,w6,w12 add w7,w7,w15 and w12,w4,w3 ld1 {v3.16b},[x1],#16 bic w15,w5,w3 eor w11,w3,w3,ror#5 ld1 {v4.4s},[x16],#16 add w7,w7,w13 orr w12,w12,w15 eor w11,w11,w3,ror#19 eor w15,w7,w7,ror#11 rev32 v3.16b,v3.16b add w6,w6,w12 ror w11,w11,#6 eor w13,w7,w8 eor w15,w15,w7,ror#20 add v4.4s,v4.4s,v3.4s add w6,w6,w11 ldr w12,[sp,#52] and w14,w14,w13 ror w15,w15,#2 add w10,w10,w6 eor w14,w14,w8 add 
w5,w5,w12 add w6,w6,w15 and w12,w3,w10 bic w15,w4,w10 eor w11,w10,w10,ror#5 add w6,w6,w14 orr w12,w12,w15 eor w11,w11,w10,ror#19 eor w15,w6,w6,ror#11 add w5,w5,w12 ror w11,w11,#6 eor w14,w6,w7 eor w15,w15,w6,ror#20 add w5,w5,w11 ldr w12,[sp,#56] and w13,w13,w14 ror w15,w15,#2 add w9,w9,w5 eor w13,w13,w7 add w4,w4,w12 add w5,w5,w15 and w12,w10,w9 bic w15,w3,w9 eor w11,w9,w9,ror#5 add w5,w5,w13 orr w12,w12,w15 eor w11,w11,w9,ror#19 eor w15,w5,w5,ror#11 add w4,w4,w12 ror w11,w11,#6 eor w13,w5,w6 eor w15,w15,w5,ror#20 add w4,w4,w11 ldr w12,[sp,#60] and w14,w14,w13 ror w15,w15,#2 add w8,w8,w4 eor w14,w14,w6 add w3,w3,w12 add w4,w4,w15 and w12,w9,w8 bic w15,w10,w8 eor w11,w8,w8,ror#5 add w4,w4,w14 orr w12,w12,w15 eor w11,w11,w8,ror#19 eor w15,w4,w4,ror#11 add w3,w3,w12 ror w11,w11,#6 eor w14,w4,w5 eor w15,w15,w4,ror#20 add w3,w3,w11 and w13,w13,w14 ror w15,w15,#2 add w7,w7,w3 eor w13,w13,w5 st1 {v4.4s},[x17], #16 add w3,w3,w15 // h+=Sigma0(a) from the past ldp w11,w12,[x0,#0] add w3,w3,w13 // h+=Maj(a,b,c) from the past ldp w13,w14,[x0,#8] add w3,w3,w11 // accumulate add w4,w4,w12 ldp w11,w12,[x0,#16] add w5,w5,w13 add w6,w6,w14 ldp w13,w14,[x0,#24] add w7,w7,w11 add w8,w8,w12 ldr w12,[sp,#0] stp w3,w4,[x0,#0] add w9,w9,w13 mov w13,wzr stp w5,w6,[x0,#8] add w10,w10,w14 stp w7,w8,[x0,#16] eor w14,w4,w5 stp w9,w10,[x0,#24] mov w15,wzr mov x17,sp b.ne .L_00_48 ldr x29,[x29] add sp,sp,#16*4+2*__SIZEOF_POINTER__ ret .globl blst_sha256_emit .def blst_sha256_emit; .type 32; .endef .p2align 4 blst_sha256_emit: hint #34 ldp x4,x5,[x1] ldp x6,x7,[x1,#16] #ifndef __AARCH64EB__ rev x4,x4 rev x5,x5 rev x6,x6 rev x7,x7 #endif str w4,[x0,#4] lsr x4,x4,#32 str w5,[x0,#12] lsr x5,x5,#32 str w6,[x0,#20] lsr x6,x6,#32 str w7,[x0,#28] lsr x7,x7,#32 str w4,[x0,#0] str w5,[x0,#8] str w6,[x0,#16] str w7,[x0,#24] ret .globl blst_sha256_bcopy .def blst_sha256_bcopy; .type 32; .endef .p2align 4 blst_sha256_bcopy: hint #34 .Loop_bcopy: ldrb w3,[x1],#1 sub x2,x2,#1 strb w3,[x0],#1 cbnz x2,.Loop_bcopy ret .globl blst_sha256_hcopy .def blst_sha256_hcopy; .type 32; .endef .p2align 4 blst_sha256_hcopy: hint #34 ldp x4,x5,[x1] ldp x6,x7,[x1,#16] stp x4,x5,[x0] stp x6,x7,[x0,#16] ret ================================================ FILE: build/coff/sha256-portable-x86_64.s ================================================ .comm __blst_platform_cap,4 .text .globl blst_sha256_block_data_order .def blst_sha256_block_data_order; .scl 2; .type 32; .endef .p2align 4 blst_sha256_block_data_order: .byte 0xf3,0x0f,0x1e,0xfa movq %rdi,8(%rsp) movq %rsi,16(%rsp) movq %rsp,%r11 .LSEH_begin_blst_sha256_block_data_order: pushq %rbp movq %rsp,%rbp movq %rcx,%rdi movq %rdx,%rsi movq %r8,%rdx #ifdef __BLST_PORTABLE__ testl $2,__blst_platform_cap(%rip) jnz .Lblst_sha256_block_data_order$2 #endif pushq %rbx pushq %r12 pushq %r13 pushq %r14 pushq %r15 shlq $4,%rdx subq $64+24,%rsp .LSEH_body_blst_sha256_block_data_order: leaq (%rsi,%rdx,4),%rdx movq %rdi,64+0(%rsp) movq %rsi,64+8(%rsp) movq %rdx,64+16(%rsp) movl 0(%rdi),%eax movl 4(%rdi),%ebx movl 8(%rdi),%ecx movl 12(%rdi),%edx movl 16(%rdi),%r8d movl 20(%rdi),%r9d movl 24(%rdi),%r10d movl 28(%rdi),%r11d jmp .Lloop .p2align 4 .Lloop: movl %ebx,%edi leaq K256(%rip),%rbp xorl %ecx,%edi movl 0(%rsi),%r12d movl %r8d,%r13d movl %eax,%r14d bswapl %r12d rorl $14,%r13d movl %r9d,%r15d xorl %r8d,%r13d rorl $9,%r14d xorl %r10d,%r15d movl %r12d,0(%rsp) xorl %eax,%r14d andl %r8d,%r15d rorl $5,%r13d addl %r11d,%r12d xorl %r10d,%r15d rorl $11,%r14d xorl %r8d,%r13d addl %r15d,%r12d movl %eax,%r15d addl 
0(%rbp),%r12d xorl %eax,%r14d xorl %ebx,%r15d rorl $6,%r13d movl %ebx,%r11d andl %r15d,%edi rorl $2,%r14d addl %r13d,%r12d xorl %edi,%r11d addl %r12d,%edx addl %r12d,%r11d addl %r14d,%r11d movl 4(%rsi),%r12d movl %edx,%r13d movl %r11d,%r14d bswapl %r12d rorl $14,%r13d movl %r8d,%edi xorl %edx,%r13d rorl $9,%r14d xorl %r9d,%edi movl %r12d,4(%rsp) xorl %r11d,%r14d andl %edx,%edi rorl $5,%r13d addl %r10d,%r12d xorl %r9d,%edi rorl $11,%r14d xorl %edx,%r13d addl %edi,%r12d movl %r11d,%edi addl 4(%rbp),%r12d xorl %r11d,%r14d xorl %eax,%edi rorl $6,%r13d movl %eax,%r10d andl %edi,%r15d rorl $2,%r14d addl %r13d,%r12d xorl %r15d,%r10d addl %r12d,%ecx addl %r12d,%r10d addl %r14d,%r10d movl 8(%rsi),%r12d movl %ecx,%r13d movl %r10d,%r14d bswapl %r12d rorl $14,%r13d movl %edx,%r15d xorl %ecx,%r13d rorl $9,%r14d xorl %r8d,%r15d movl %r12d,8(%rsp) xorl %r10d,%r14d andl %ecx,%r15d rorl $5,%r13d addl %r9d,%r12d xorl %r8d,%r15d rorl $11,%r14d xorl %ecx,%r13d addl %r15d,%r12d movl %r10d,%r15d addl 8(%rbp),%r12d xorl %r10d,%r14d xorl %r11d,%r15d rorl $6,%r13d movl %r11d,%r9d andl %r15d,%edi rorl $2,%r14d addl %r13d,%r12d xorl %edi,%r9d addl %r12d,%ebx addl %r12d,%r9d addl %r14d,%r9d movl 12(%rsi),%r12d movl %ebx,%r13d movl %r9d,%r14d bswapl %r12d rorl $14,%r13d movl %ecx,%edi xorl %ebx,%r13d rorl $9,%r14d xorl %edx,%edi movl %r12d,12(%rsp) xorl %r9d,%r14d andl %ebx,%edi rorl $5,%r13d addl %r8d,%r12d xorl %edx,%edi rorl $11,%r14d xorl %ebx,%r13d addl %edi,%r12d movl %r9d,%edi addl 12(%rbp),%r12d xorl %r9d,%r14d xorl %r10d,%edi rorl $6,%r13d movl %r10d,%r8d andl %edi,%r15d rorl $2,%r14d addl %r13d,%r12d xorl %r15d,%r8d addl %r12d,%eax addl %r12d,%r8d addl %r14d,%r8d movl 16(%rsi),%r12d movl %eax,%r13d movl %r8d,%r14d bswapl %r12d rorl $14,%r13d movl %ebx,%r15d xorl %eax,%r13d rorl $9,%r14d xorl %ecx,%r15d movl %r12d,16(%rsp) xorl %r8d,%r14d andl %eax,%r15d rorl $5,%r13d addl %edx,%r12d xorl %ecx,%r15d rorl $11,%r14d xorl %eax,%r13d addl %r15d,%r12d movl %r8d,%r15d addl 16(%rbp),%r12d xorl %r8d,%r14d xorl %r9d,%r15d rorl $6,%r13d movl %r9d,%edx andl %r15d,%edi rorl $2,%r14d addl %r13d,%r12d xorl %edi,%edx addl %r12d,%r11d addl %r12d,%edx addl %r14d,%edx movl 20(%rsi),%r12d movl %r11d,%r13d movl %edx,%r14d bswapl %r12d rorl $14,%r13d movl %eax,%edi xorl %r11d,%r13d rorl $9,%r14d xorl %ebx,%edi movl %r12d,20(%rsp) xorl %edx,%r14d andl %r11d,%edi rorl $5,%r13d addl %ecx,%r12d xorl %ebx,%edi rorl $11,%r14d xorl %r11d,%r13d addl %edi,%r12d movl %edx,%edi addl 20(%rbp),%r12d xorl %edx,%r14d xorl %r8d,%edi rorl $6,%r13d movl %r8d,%ecx andl %edi,%r15d rorl $2,%r14d addl %r13d,%r12d xorl %r15d,%ecx addl %r12d,%r10d addl %r12d,%ecx addl %r14d,%ecx movl 24(%rsi),%r12d movl %r10d,%r13d movl %ecx,%r14d bswapl %r12d rorl $14,%r13d movl %r11d,%r15d xorl %r10d,%r13d rorl $9,%r14d xorl %eax,%r15d movl %r12d,24(%rsp) xorl %ecx,%r14d andl %r10d,%r15d rorl $5,%r13d addl %ebx,%r12d xorl %eax,%r15d rorl $11,%r14d xorl %r10d,%r13d addl %r15d,%r12d movl %ecx,%r15d addl 24(%rbp),%r12d xorl %ecx,%r14d xorl %edx,%r15d rorl $6,%r13d movl %edx,%ebx andl %r15d,%edi rorl $2,%r14d addl %r13d,%r12d xorl %edi,%ebx addl %r12d,%r9d addl %r12d,%ebx addl %r14d,%ebx movl 28(%rsi),%r12d movl %r9d,%r13d movl %ebx,%r14d bswapl %r12d rorl $14,%r13d movl %r10d,%edi xorl %r9d,%r13d rorl $9,%r14d xorl %r11d,%edi movl %r12d,28(%rsp) xorl %ebx,%r14d andl %r9d,%edi rorl $5,%r13d addl %eax,%r12d xorl %r11d,%edi rorl $11,%r14d xorl %r9d,%r13d addl %edi,%r12d movl %ebx,%edi addl 28(%rbp),%r12d xorl %ebx,%r14d xorl %ecx,%edi rorl $6,%r13d movl %ecx,%eax andl 
%edi,%r15d rorl $2,%r14d addl %r13d,%r12d xorl %r15d,%eax addl %r12d,%r8d addl %r12d,%eax addl %r14d,%eax movl 32(%rsi),%r12d movl %r8d,%r13d movl %eax,%r14d bswapl %r12d rorl $14,%r13d movl %r9d,%r15d xorl %r8d,%r13d rorl $9,%r14d xorl %r10d,%r15d movl %r12d,32(%rsp) xorl %eax,%r14d andl %r8d,%r15d rorl $5,%r13d addl %r11d,%r12d xorl %r10d,%r15d rorl $11,%r14d xorl %r8d,%r13d addl %r15d,%r12d movl %eax,%r15d addl 32(%rbp),%r12d xorl %eax,%r14d xorl %ebx,%r15d rorl $6,%r13d movl %ebx,%r11d andl %r15d,%edi rorl $2,%r14d addl %r13d,%r12d xorl %edi,%r11d addl %r12d,%edx addl %r12d,%r11d addl %r14d,%r11d movl 36(%rsi),%r12d movl %edx,%r13d movl %r11d,%r14d bswapl %r12d rorl $14,%r13d movl %r8d,%edi xorl %edx,%r13d rorl $9,%r14d xorl %r9d,%edi movl %r12d,36(%rsp) xorl %r11d,%r14d andl %edx,%edi rorl $5,%r13d addl %r10d,%r12d xorl %r9d,%edi rorl $11,%r14d xorl %edx,%r13d addl %edi,%r12d movl %r11d,%edi addl 36(%rbp),%r12d xorl %r11d,%r14d xorl %eax,%edi rorl $6,%r13d movl %eax,%r10d andl %edi,%r15d rorl $2,%r14d addl %r13d,%r12d xorl %r15d,%r10d addl %r12d,%ecx addl %r12d,%r10d addl %r14d,%r10d movl 40(%rsi),%r12d movl %ecx,%r13d movl %r10d,%r14d bswapl %r12d rorl $14,%r13d movl %edx,%r15d xorl %ecx,%r13d rorl $9,%r14d xorl %r8d,%r15d movl %r12d,40(%rsp) xorl %r10d,%r14d andl %ecx,%r15d rorl $5,%r13d addl %r9d,%r12d xorl %r8d,%r15d rorl $11,%r14d xorl %ecx,%r13d addl %r15d,%r12d movl %r10d,%r15d addl 40(%rbp),%r12d xorl %r10d,%r14d xorl %r11d,%r15d rorl $6,%r13d movl %r11d,%r9d andl %r15d,%edi rorl $2,%r14d addl %r13d,%r12d xorl %edi,%r9d addl %r12d,%ebx addl %r12d,%r9d addl %r14d,%r9d movl 44(%rsi),%r12d movl %ebx,%r13d movl %r9d,%r14d bswapl %r12d rorl $14,%r13d movl %ecx,%edi xorl %ebx,%r13d rorl $9,%r14d xorl %edx,%edi movl %r12d,44(%rsp) xorl %r9d,%r14d andl %ebx,%edi rorl $5,%r13d addl %r8d,%r12d xorl %edx,%edi rorl $11,%r14d xorl %ebx,%r13d addl %edi,%r12d movl %r9d,%edi addl 44(%rbp),%r12d xorl %r9d,%r14d xorl %r10d,%edi rorl $6,%r13d movl %r10d,%r8d andl %edi,%r15d rorl $2,%r14d addl %r13d,%r12d xorl %r15d,%r8d addl %r12d,%eax addl %r12d,%r8d addl %r14d,%r8d movl 48(%rsi),%r12d movl %eax,%r13d movl %r8d,%r14d bswapl %r12d rorl $14,%r13d movl %ebx,%r15d xorl %eax,%r13d rorl $9,%r14d xorl %ecx,%r15d movl %r12d,48(%rsp) xorl %r8d,%r14d andl %eax,%r15d rorl $5,%r13d addl %edx,%r12d xorl %ecx,%r15d rorl $11,%r14d xorl %eax,%r13d addl %r15d,%r12d movl %r8d,%r15d addl 48(%rbp),%r12d xorl %r8d,%r14d xorl %r9d,%r15d rorl $6,%r13d movl %r9d,%edx andl %r15d,%edi rorl $2,%r14d addl %r13d,%r12d xorl %edi,%edx addl %r12d,%r11d addl %r12d,%edx addl %r14d,%edx movl 52(%rsi),%r12d movl %r11d,%r13d movl %edx,%r14d bswapl %r12d rorl $14,%r13d movl %eax,%edi xorl %r11d,%r13d rorl $9,%r14d xorl %ebx,%edi movl %r12d,52(%rsp) xorl %edx,%r14d andl %r11d,%edi rorl $5,%r13d addl %ecx,%r12d xorl %ebx,%edi rorl $11,%r14d xorl %r11d,%r13d addl %edi,%r12d movl %edx,%edi addl 52(%rbp),%r12d xorl %edx,%r14d xorl %r8d,%edi rorl $6,%r13d movl %r8d,%ecx andl %edi,%r15d rorl $2,%r14d addl %r13d,%r12d xorl %r15d,%ecx addl %r12d,%r10d addl %r12d,%ecx addl %r14d,%ecx movl 56(%rsi),%r12d movl %r10d,%r13d movl %ecx,%r14d bswapl %r12d rorl $14,%r13d movl %r11d,%r15d xorl %r10d,%r13d rorl $9,%r14d xorl %eax,%r15d movl %r12d,56(%rsp) xorl %ecx,%r14d andl %r10d,%r15d rorl $5,%r13d addl %ebx,%r12d xorl %eax,%r15d rorl $11,%r14d xorl %r10d,%r13d addl %r15d,%r12d movl %ecx,%r15d addl 56(%rbp),%r12d xorl %ecx,%r14d xorl %edx,%r15d rorl $6,%r13d movl %edx,%ebx andl %r15d,%edi rorl $2,%r14d addl %r13d,%r12d xorl %edi,%ebx addl 
%r12d,%r9d addl %r12d,%ebx addl %r14d,%ebx movl 60(%rsi),%r12d movl %r9d,%r13d movl %ebx,%r14d bswapl %r12d rorl $14,%r13d movl %r10d,%edi xorl %r9d,%r13d rorl $9,%r14d xorl %r11d,%edi movl %r12d,60(%rsp) xorl %ebx,%r14d andl %r9d,%edi rorl $5,%r13d addl %eax,%r12d xorl %r11d,%edi rorl $11,%r14d xorl %r9d,%r13d addl %edi,%r12d movl %ebx,%edi addl 60(%rbp),%r12d xorl %ebx,%r14d xorl %ecx,%edi rorl $6,%r13d movl %ecx,%eax andl %edi,%r15d rorl $2,%r14d addl %r13d,%r12d xorl %r15d,%eax addl %r12d,%r8d addl %r12d,%eax jmp .Lrounds_16_xx .p2align 4 .Lrounds_16_xx: movl 4(%rsp),%r13d movl 56(%rsp),%r15d movl %r13d,%r12d rorl $11,%r13d addl %r14d,%eax movl %r15d,%r14d rorl $2,%r15d xorl %r12d,%r13d shrl $3,%r12d rorl $7,%r13d xorl %r14d,%r15d shrl $10,%r14d rorl $17,%r15d xorl %r13d,%r12d xorl %r14d,%r15d addl 36(%rsp),%r12d addl 0(%rsp),%r12d movl %r8d,%r13d addl %r15d,%r12d movl %eax,%r14d rorl $14,%r13d movl %r9d,%r15d xorl %r8d,%r13d rorl $9,%r14d xorl %r10d,%r15d movl %r12d,0(%rsp) xorl %eax,%r14d andl %r8d,%r15d rorl $5,%r13d addl %r11d,%r12d xorl %r10d,%r15d rorl $11,%r14d xorl %r8d,%r13d addl %r15d,%r12d movl %eax,%r15d addl 64(%rbp),%r12d xorl %eax,%r14d xorl %ebx,%r15d rorl $6,%r13d movl %ebx,%r11d andl %r15d,%edi rorl $2,%r14d addl %r13d,%r12d xorl %edi,%r11d addl %r12d,%edx addl %r12d,%r11d movl 8(%rsp),%r13d movl 60(%rsp),%edi movl %r13d,%r12d rorl $11,%r13d addl %r14d,%r11d movl %edi,%r14d rorl $2,%edi xorl %r12d,%r13d shrl $3,%r12d rorl $7,%r13d xorl %r14d,%edi shrl $10,%r14d rorl $17,%edi xorl %r13d,%r12d xorl %r14d,%edi addl 40(%rsp),%r12d addl 4(%rsp),%r12d movl %edx,%r13d addl %edi,%r12d movl %r11d,%r14d rorl $14,%r13d movl %r8d,%edi xorl %edx,%r13d rorl $9,%r14d xorl %r9d,%edi movl %r12d,4(%rsp) xorl %r11d,%r14d andl %edx,%edi rorl $5,%r13d addl %r10d,%r12d xorl %r9d,%edi rorl $11,%r14d xorl %edx,%r13d addl %edi,%r12d movl %r11d,%edi addl 68(%rbp),%r12d xorl %r11d,%r14d xorl %eax,%edi rorl $6,%r13d movl %eax,%r10d andl %edi,%r15d rorl $2,%r14d addl %r13d,%r12d xorl %r15d,%r10d addl %r12d,%ecx addl %r12d,%r10d movl 12(%rsp),%r13d movl 0(%rsp),%r15d movl %r13d,%r12d rorl $11,%r13d addl %r14d,%r10d movl %r15d,%r14d rorl $2,%r15d xorl %r12d,%r13d shrl $3,%r12d rorl $7,%r13d xorl %r14d,%r15d shrl $10,%r14d rorl $17,%r15d xorl %r13d,%r12d xorl %r14d,%r15d addl 44(%rsp),%r12d addl 8(%rsp),%r12d movl %ecx,%r13d addl %r15d,%r12d movl %r10d,%r14d rorl $14,%r13d movl %edx,%r15d xorl %ecx,%r13d rorl $9,%r14d xorl %r8d,%r15d movl %r12d,8(%rsp) xorl %r10d,%r14d andl %ecx,%r15d rorl $5,%r13d addl %r9d,%r12d xorl %r8d,%r15d rorl $11,%r14d xorl %ecx,%r13d addl %r15d,%r12d movl %r10d,%r15d addl 72(%rbp),%r12d xorl %r10d,%r14d xorl %r11d,%r15d rorl $6,%r13d movl %r11d,%r9d andl %r15d,%edi rorl $2,%r14d addl %r13d,%r12d xorl %edi,%r9d addl %r12d,%ebx addl %r12d,%r9d movl 16(%rsp),%r13d movl 4(%rsp),%edi movl %r13d,%r12d rorl $11,%r13d addl %r14d,%r9d movl %edi,%r14d rorl $2,%edi xorl %r12d,%r13d shrl $3,%r12d rorl $7,%r13d xorl %r14d,%edi shrl $10,%r14d rorl $17,%edi xorl %r13d,%r12d xorl %r14d,%edi addl 48(%rsp),%r12d addl 12(%rsp),%r12d movl %ebx,%r13d addl %edi,%r12d movl %r9d,%r14d rorl $14,%r13d movl %ecx,%edi xorl %ebx,%r13d rorl $9,%r14d xorl %edx,%edi movl %r12d,12(%rsp) xorl %r9d,%r14d andl %ebx,%edi rorl $5,%r13d addl %r8d,%r12d xorl %edx,%edi rorl $11,%r14d xorl %ebx,%r13d addl %edi,%r12d movl %r9d,%edi addl 76(%rbp),%r12d xorl %r9d,%r14d xorl %r10d,%edi rorl $6,%r13d movl %r10d,%r8d andl %edi,%r15d rorl $2,%r14d addl %r13d,%r12d xorl %r15d,%r8d addl %r12d,%eax addl %r12d,%r8d movl 
20(%rsp),%r13d movl 8(%rsp),%r15d movl %r13d,%r12d rorl $11,%r13d addl %r14d,%r8d movl %r15d,%r14d rorl $2,%r15d xorl %r12d,%r13d shrl $3,%r12d rorl $7,%r13d xorl %r14d,%r15d shrl $10,%r14d rorl $17,%r15d xorl %r13d,%r12d xorl %r14d,%r15d addl 52(%rsp),%r12d addl 16(%rsp),%r12d movl %eax,%r13d addl %r15d,%r12d movl %r8d,%r14d rorl $14,%r13d movl %ebx,%r15d xorl %eax,%r13d rorl $9,%r14d xorl %ecx,%r15d movl %r12d,16(%rsp) xorl %r8d,%r14d andl %eax,%r15d rorl $5,%r13d addl %edx,%r12d xorl %ecx,%r15d rorl $11,%r14d xorl %eax,%r13d addl %r15d,%r12d movl %r8d,%r15d addl 80(%rbp),%r12d xorl %r8d,%r14d xorl %r9d,%r15d rorl $6,%r13d movl %r9d,%edx andl %r15d,%edi rorl $2,%r14d addl %r13d,%r12d xorl %edi,%edx addl %r12d,%r11d addl %r12d,%edx movl 24(%rsp),%r13d movl 12(%rsp),%edi movl %r13d,%r12d rorl $11,%r13d addl %r14d,%edx movl %edi,%r14d rorl $2,%edi xorl %r12d,%r13d shrl $3,%r12d rorl $7,%r13d xorl %r14d,%edi shrl $10,%r14d rorl $17,%edi xorl %r13d,%r12d xorl %r14d,%edi addl 56(%rsp),%r12d addl 20(%rsp),%r12d movl %r11d,%r13d addl %edi,%r12d movl %edx,%r14d rorl $14,%r13d movl %eax,%edi xorl %r11d,%r13d rorl $9,%r14d xorl %ebx,%edi movl %r12d,20(%rsp) xorl %edx,%r14d andl %r11d,%edi rorl $5,%r13d addl %ecx,%r12d xorl %ebx,%edi rorl $11,%r14d xorl %r11d,%r13d addl %edi,%r12d movl %edx,%edi addl 84(%rbp),%r12d xorl %edx,%r14d xorl %r8d,%edi rorl $6,%r13d movl %r8d,%ecx andl %edi,%r15d rorl $2,%r14d addl %r13d,%r12d xorl %r15d,%ecx addl %r12d,%r10d addl %r12d,%ecx movl 28(%rsp),%r13d movl 16(%rsp),%r15d movl %r13d,%r12d rorl $11,%r13d addl %r14d,%ecx movl %r15d,%r14d rorl $2,%r15d xorl %r12d,%r13d shrl $3,%r12d rorl $7,%r13d xorl %r14d,%r15d shrl $10,%r14d rorl $17,%r15d xorl %r13d,%r12d xorl %r14d,%r15d addl 60(%rsp),%r12d addl 24(%rsp),%r12d movl %r10d,%r13d addl %r15d,%r12d movl %ecx,%r14d rorl $14,%r13d movl %r11d,%r15d xorl %r10d,%r13d rorl $9,%r14d xorl %eax,%r15d movl %r12d,24(%rsp) xorl %ecx,%r14d andl %r10d,%r15d rorl $5,%r13d addl %ebx,%r12d xorl %eax,%r15d rorl $11,%r14d xorl %r10d,%r13d addl %r15d,%r12d movl %ecx,%r15d addl 88(%rbp),%r12d xorl %ecx,%r14d xorl %edx,%r15d rorl $6,%r13d movl %edx,%ebx andl %r15d,%edi rorl $2,%r14d addl %r13d,%r12d xorl %edi,%ebx addl %r12d,%r9d addl %r12d,%ebx movl 32(%rsp),%r13d movl 20(%rsp),%edi movl %r13d,%r12d rorl $11,%r13d addl %r14d,%ebx movl %edi,%r14d rorl $2,%edi xorl %r12d,%r13d shrl $3,%r12d rorl $7,%r13d xorl %r14d,%edi shrl $10,%r14d rorl $17,%edi xorl %r13d,%r12d xorl %r14d,%edi addl 0(%rsp),%r12d addl 28(%rsp),%r12d movl %r9d,%r13d addl %edi,%r12d movl %ebx,%r14d rorl $14,%r13d movl %r10d,%edi xorl %r9d,%r13d rorl $9,%r14d xorl %r11d,%edi movl %r12d,28(%rsp) xorl %ebx,%r14d andl %r9d,%edi rorl $5,%r13d addl %eax,%r12d xorl %r11d,%edi rorl $11,%r14d xorl %r9d,%r13d addl %edi,%r12d movl %ebx,%edi addl 92(%rbp),%r12d xorl %ebx,%r14d xorl %ecx,%edi rorl $6,%r13d movl %ecx,%eax andl %edi,%r15d rorl $2,%r14d addl %r13d,%r12d xorl %r15d,%eax addl %r12d,%r8d addl %r12d,%eax movl 36(%rsp),%r13d movl 24(%rsp),%r15d movl %r13d,%r12d rorl $11,%r13d addl %r14d,%eax movl %r15d,%r14d rorl $2,%r15d xorl %r12d,%r13d shrl $3,%r12d rorl $7,%r13d xorl %r14d,%r15d shrl $10,%r14d rorl $17,%r15d xorl %r13d,%r12d xorl %r14d,%r15d addl 4(%rsp),%r12d addl 32(%rsp),%r12d movl %r8d,%r13d addl %r15d,%r12d movl %eax,%r14d rorl $14,%r13d movl %r9d,%r15d xorl %r8d,%r13d rorl $9,%r14d xorl %r10d,%r15d movl %r12d,32(%rsp) xorl %eax,%r14d andl %r8d,%r15d rorl $5,%r13d addl %r11d,%r12d xorl %r10d,%r15d rorl $11,%r14d xorl %r8d,%r13d addl %r15d,%r12d movl %eax,%r15d addl 
96(%rbp),%r12d xorl %eax,%r14d xorl %ebx,%r15d rorl $6,%r13d movl %ebx,%r11d andl %r15d,%edi rorl $2,%r14d addl %r13d,%r12d xorl %edi,%r11d addl %r12d,%edx addl %r12d,%r11d movl 40(%rsp),%r13d movl 28(%rsp),%edi movl %r13d,%r12d rorl $11,%r13d addl %r14d,%r11d movl %edi,%r14d rorl $2,%edi xorl %r12d,%r13d shrl $3,%r12d rorl $7,%r13d xorl %r14d,%edi shrl $10,%r14d rorl $17,%edi xorl %r13d,%r12d xorl %r14d,%edi addl 8(%rsp),%r12d addl 36(%rsp),%r12d movl %edx,%r13d addl %edi,%r12d movl %r11d,%r14d rorl $14,%r13d movl %r8d,%edi xorl %edx,%r13d rorl $9,%r14d xorl %r9d,%edi movl %r12d,36(%rsp) xorl %r11d,%r14d andl %edx,%edi rorl $5,%r13d addl %r10d,%r12d xorl %r9d,%edi rorl $11,%r14d xorl %edx,%r13d addl %edi,%r12d movl %r11d,%edi addl 100(%rbp),%r12d xorl %r11d,%r14d xorl %eax,%edi rorl $6,%r13d movl %eax,%r10d andl %edi,%r15d rorl $2,%r14d addl %r13d,%r12d xorl %r15d,%r10d addl %r12d,%ecx addl %r12d,%r10d movl 44(%rsp),%r13d movl 32(%rsp),%r15d movl %r13d,%r12d rorl $11,%r13d addl %r14d,%r10d movl %r15d,%r14d rorl $2,%r15d xorl %r12d,%r13d shrl $3,%r12d rorl $7,%r13d xorl %r14d,%r15d shrl $10,%r14d rorl $17,%r15d xorl %r13d,%r12d xorl %r14d,%r15d addl 12(%rsp),%r12d addl 40(%rsp),%r12d movl %ecx,%r13d addl %r15d,%r12d movl %r10d,%r14d rorl $14,%r13d movl %edx,%r15d xorl %ecx,%r13d rorl $9,%r14d xorl %r8d,%r15d movl %r12d,40(%rsp) xorl %r10d,%r14d andl %ecx,%r15d rorl $5,%r13d addl %r9d,%r12d xorl %r8d,%r15d rorl $11,%r14d xorl %ecx,%r13d addl %r15d,%r12d movl %r10d,%r15d addl 104(%rbp),%r12d xorl %r10d,%r14d xorl %r11d,%r15d rorl $6,%r13d movl %r11d,%r9d andl %r15d,%edi rorl $2,%r14d addl %r13d,%r12d xorl %edi,%r9d addl %r12d,%ebx addl %r12d,%r9d movl 48(%rsp),%r13d movl 36(%rsp),%edi movl %r13d,%r12d rorl $11,%r13d addl %r14d,%r9d movl %edi,%r14d rorl $2,%edi xorl %r12d,%r13d shrl $3,%r12d rorl $7,%r13d xorl %r14d,%edi shrl $10,%r14d rorl $17,%edi xorl %r13d,%r12d xorl %r14d,%edi addl 16(%rsp),%r12d addl 44(%rsp),%r12d movl %ebx,%r13d addl %edi,%r12d movl %r9d,%r14d rorl $14,%r13d movl %ecx,%edi xorl %ebx,%r13d rorl $9,%r14d xorl %edx,%edi movl %r12d,44(%rsp) xorl %r9d,%r14d andl %ebx,%edi rorl $5,%r13d addl %r8d,%r12d xorl %edx,%edi rorl $11,%r14d xorl %ebx,%r13d addl %edi,%r12d movl %r9d,%edi addl 108(%rbp),%r12d xorl %r9d,%r14d xorl %r10d,%edi rorl $6,%r13d movl %r10d,%r8d andl %edi,%r15d rorl $2,%r14d addl %r13d,%r12d xorl %r15d,%r8d addl %r12d,%eax addl %r12d,%r8d movl 52(%rsp),%r13d movl 40(%rsp),%r15d movl %r13d,%r12d rorl $11,%r13d addl %r14d,%r8d movl %r15d,%r14d rorl $2,%r15d xorl %r12d,%r13d shrl $3,%r12d rorl $7,%r13d xorl %r14d,%r15d shrl $10,%r14d rorl $17,%r15d xorl %r13d,%r12d xorl %r14d,%r15d addl 20(%rsp),%r12d addl 48(%rsp),%r12d movl %eax,%r13d addl %r15d,%r12d movl %r8d,%r14d rorl $14,%r13d movl %ebx,%r15d xorl %eax,%r13d rorl $9,%r14d xorl %ecx,%r15d movl %r12d,48(%rsp) xorl %r8d,%r14d andl %eax,%r15d rorl $5,%r13d addl %edx,%r12d xorl %ecx,%r15d rorl $11,%r14d xorl %eax,%r13d addl %r15d,%r12d movl %r8d,%r15d addl 112(%rbp),%r12d xorl %r8d,%r14d xorl %r9d,%r15d rorl $6,%r13d movl %r9d,%edx andl %r15d,%edi rorl $2,%r14d addl %r13d,%r12d xorl %edi,%edx addl %r12d,%r11d addl %r12d,%edx movl 56(%rsp),%r13d movl 44(%rsp),%edi movl %r13d,%r12d rorl $11,%r13d addl %r14d,%edx movl %edi,%r14d rorl $2,%edi xorl %r12d,%r13d shrl $3,%r12d rorl $7,%r13d xorl %r14d,%edi shrl $10,%r14d rorl $17,%edi xorl %r13d,%r12d xorl %r14d,%edi addl 24(%rsp),%r12d addl 52(%rsp),%r12d movl %r11d,%r13d addl %edi,%r12d movl %edx,%r14d rorl $14,%r13d movl %eax,%edi xorl %r11d,%r13d rorl $9,%r14d xorl 
%ebx,%edi movl %r12d,52(%rsp) xorl %edx,%r14d andl %r11d,%edi rorl $5,%r13d addl %ecx,%r12d xorl %ebx,%edi rorl $11,%r14d xorl %r11d,%r13d addl %edi,%r12d movl %edx,%edi addl 116(%rbp),%r12d xorl %edx,%r14d xorl %r8d,%edi rorl $6,%r13d movl %r8d,%ecx andl %edi,%r15d rorl $2,%r14d addl %r13d,%r12d xorl %r15d,%ecx addl %r12d,%r10d addl %r12d,%ecx movl 60(%rsp),%r13d movl 48(%rsp),%r15d movl %r13d,%r12d rorl $11,%r13d addl %r14d,%ecx movl %r15d,%r14d rorl $2,%r15d xorl %r12d,%r13d shrl $3,%r12d rorl $7,%r13d xorl %r14d,%r15d shrl $10,%r14d rorl $17,%r15d xorl %r13d,%r12d xorl %r14d,%r15d addl 28(%rsp),%r12d addl 56(%rsp),%r12d movl %r10d,%r13d addl %r15d,%r12d movl %ecx,%r14d rorl $14,%r13d movl %r11d,%r15d xorl %r10d,%r13d rorl $9,%r14d xorl %eax,%r15d movl %r12d,56(%rsp) xorl %ecx,%r14d andl %r10d,%r15d rorl $5,%r13d addl %ebx,%r12d xorl %eax,%r15d rorl $11,%r14d xorl %r10d,%r13d addl %r15d,%r12d movl %ecx,%r15d addl 120(%rbp),%r12d xorl %ecx,%r14d xorl %edx,%r15d rorl $6,%r13d movl %edx,%ebx andl %r15d,%edi rorl $2,%r14d addl %r13d,%r12d xorl %edi,%ebx addl %r12d,%r9d addl %r12d,%ebx movl 0(%rsp),%r13d movl 52(%rsp),%edi movl %r13d,%r12d rorl $11,%r13d addl %r14d,%ebx movl %edi,%r14d rorl $2,%edi xorl %r12d,%r13d shrl $3,%r12d rorl $7,%r13d xorl %r14d,%edi shrl $10,%r14d rorl $17,%edi xorl %r13d,%r12d xorl %r14d,%edi addl 32(%rsp),%r12d addl 60(%rsp),%r12d movl %r9d,%r13d addl %edi,%r12d movl %ebx,%r14d rorl $14,%r13d movl %r10d,%edi xorl %r9d,%r13d rorl $9,%r14d xorl %r11d,%edi movl %r12d,60(%rsp) xorl %ebx,%r14d andl %r9d,%edi rorl $5,%r13d addl %eax,%r12d xorl %r11d,%edi rorl $11,%r14d xorl %r9d,%r13d addl %edi,%r12d movl %ebx,%edi addl 124(%rbp),%r12d xorl %ebx,%r14d xorl %ecx,%edi rorl $6,%r13d movl %ecx,%eax andl %edi,%r15d rorl $2,%r14d addl %r13d,%r12d xorl %r15d,%eax addl %r12d,%r8d addl %r12d,%eax leaq 64(%rbp),%rbp cmpb $0x19,3(%rbp) jnz .Lrounds_16_xx movq 64+0(%rsp),%rdi addl %r14d,%eax leaq 64(%rsi),%rsi addl 0(%rdi),%eax addl 4(%rdi),%ebx addl 8(%rdi),%ecx addl 12(%rdi),%edx addl 16(%rdi),%r8d addl 20(%rdi),%r9d addl 24(%rdi),%r10d addl 28(%rdi),%r11d cmpq 64+16(%rsp),%rsi movl %eax,0(%rdi) movl %ebx,4(%rdi) movl %ecx,8(%rdi) movl %edx,12(%rdi) movl %r8d,16(%rdi) movl %r9d,20(%rdi) movl %r10d,24(%rdi) movl %r11d,28(%rdi) jb .Lloop leaq 64+24+48(%rsp),%r11 movq 64+24(%rsp),%r15 movq -40(%r11),%r14 movq -32(%r11),%r13 movq -24(%r11),%r12 movq -16(%r11),%rbx movq -8(%r11),%rbp .LSEH_epilogue_blst_sha256_block_data_order: mov 8(%r11),%rdi mov 16(%r11),%rsi leaq (%r11),%rsp #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .LSEH_end_blst_sha256_block_data_order: #ifndef __BLST_PORTABLE__ .section .rdata .p2align 6 K256: .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 .byte 
83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,64,100,111,116,45,97,115,109,0 .globl blst_sha256_emit .def blst_sha256_emit; .scl 2; .type 32; .endef .p2align 4 blst_sha256_emit: .byte 0xf3,0x0f,0x1e,0xfa movq 0(%rdx),%r8 movq 8(%rdx),%r9 movq 16(%rdx),%r10 bswapq %r8 movq 24(%rdx),%r11 bswapq %r9 movl %r8d,4(%rcx) bswapq %r10 movl %r9d,12(%rcx) bswapq %r11 movl %r10d,20(%rcx) shrq $32,%r8 movl %r11d,28(%rcx) shrq $32,%r9 movl %r8d,0(%rcx) shrq $32,%r10 movl %r9d,8(%rcx) shrq $32,%r11 movl %r10d,16(%rcx) movl %r11d,24(%rcx) #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .globl blst_sha256_bcopy .def blst_sha256_bcopy; .scl 2; .type 32; .endef .p2align 4 blst_sha256_bcopy: .byte 0xf3,0x0f,0x1e,0xfa subq %rdx,%rcx .Loop_bcopy: movzbl (%rdx),%eax leaq 1(%rdx),%rdx movb %al,-1(%rcx,%rdx,1) decq %r8 jnz .Loop_bcopy #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .globl blst_sha256_hcopy .def blst_sha256_hcopy; .scl 2; .type 32; .endef .p2align 4 blst_sha256_hcopy: .byte 0xf3,0x0f,0x1e,0xfa movq 0(%rdx),%r8 movq 8(%rdx),%r9 movq 16(%rdx),%r10 movq 24(%rdx),%r11 movq %r8,0(%rcx) movq %r9,8(%rcx) movq %r10,16(%rcx) movq %r11,24(%rcx) #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif #endif .section .pdata .p2align 2 .rva .LSEH_begin_blst_sha256_block_data_order .rva .LSEH_body_blst_sha256_block_data_order .rva .LSEH_info_blst_sha256_block_data_order_prologue .rva .LSEH_body_blst_sha256_block_data_order .rva .LSEH_epilogue_blst_sha256_block_data_order .rva .LSEH_info_blst_sha256_block_data_order_body .rva .LSEH_epilogue_blst_sha256_block_data_order .rva .LSEH_end_blst_sha256_block_data_order .rva .LSEH_info_blst_sha256_block_data_order_epilogue .section .xdata .p2align 3 .LSEH_info_blst_sha256_block_data_order_prologue: .byte 1,4,6,0x05 .byte 4,0x74,2,0 .byte 4,0x64,3,0 .byte 4,0x53 .byte 1,0x50 .long 0,0 .LSEH_info_blst_sha256_block_data_order_body: .byte 1,0,18,0 .byte 0x00,0xf4,0x0b,0x00 .byte 0x00,0xe4,0x0c,0x00 .byte 0x00,0xd4,0x0d,0x00 .byte 0x00,0xc4,0x0e,0x00 .byte 0x00,0x34,0x0f,0x00 .byte 0x00,0x54,0x10,0x00 .byte 0x00,0x74,0x12,0x00 .byte 0x00,0x64,0x13,0x00 .byte 0x00,0x01,0x11,0x00 .byte 0x00,0x00,0x00,0x00 .byte 0x00,0x00,0x00,0x00 .LSEH_info_blst_sha256_block_data_order_epilogue: .byte 1,0,5,11 .byte 0x00,0x74,0x01,0x00 .byte 0x00,0x64,0x02,0x00 .byte 0x00,0xb3 .byte 0x00,0x00,0x00,0x00,0x00,0x00 .byte 0x00,0x00,0x00,0x00 ================================================ FILE: build/coff/sha256-x86_64.s ================================================ .comm __blst_platform_cap,4 .section .rdata .p2align 6 K256: .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 
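/*
 * Added note: the .long rows above and below are the standard SHA-256
 * round constants (K256); the three rows that follow them are pshufb
 * byte-shuffle masks used by the SSSE3 and SHA-NI code paths to load
 * big-endian message words. In the SHA-NI routine further down, the
 * .byte sequences 15,56,203,... / 15,56,204,... / 15,56,205,... are
 * raw encodings of sha256rnds2/sha256msg1/sha256msg2, presumably
 * emitted as bytes so that older assemblers can still build the file.
 */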
.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f .long 0x03020100,0x0b0a0908,0xffffffff,0xffffffff .long 0xffffffff,0xffffffff,0x03020100,0x0b0a0908 .byte 83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,64,100,111,116,45,97,115,109,0 .text .globl blst_sha256_block_data_order_shaext .def blst_sha256_block_data_order_shaext; .scl 2; .type 32; .endef .p2align 6 blst_sha256_block_data_order_shaext: .byte 0xf3,0x0f,0x1e,0xfa movq %rdi,8(%rsp) movq %rsi,16(%rsp) movq %rsp,%r11 .LSEH_begin_blst_sha256_block_data_order_shaext: pushq %rbp movq %rsp,%rbp movq %rcx,%rdi movq %rdx,%rsi movq %r8,%rdx .Lblst_sha256_block_data_order$2: subq $0x50,%rsp movaps %xmm6,-80(%rbp) movaps %xmm7,-64(%rbp) movaps %xmm8,-48(%rbp) movaps %xmm9,-32(%rbp) movaps %xmm10,-16(%rbp) .LSEH_body_blst_sha256_block_data_order_shaext: #ifdef __SGX_LVI_HARDENING__ lfence #endif leaq K256+128(%rip),%rcx movdqu (%rdi),%xmm1 movdqu 16(%rdi),%xmm2 movdqa 256-128(%rcx),%xmm7 pshufd $0x1b,%xmm1,%xmm0 pshufd $0xb1,%xmm1,%xmm1 pshufd $0x1b,%xmm2,%xmm2 movdqa %xmm7,%xmm8 .byte 102,15,58,15,202,8 punpcklqdq %xmm0,%xmm2 jmp .Loop_shaext .p2align 4 .Loop_shaext: movdqu (%rsi),%xmm3 movdqu 16(%rsi),%xmm4 movdqu 32(%rsi),%xmm5 .byte 102,15,56,0,223 movdqu 48(%rsi),%xmm6 movdqa 0-128(%rcx),%xmm0 paddd %xmm3,%xmm0 .byte 102,15,56,0,231 movdqa %xmm2,%xmm10 .byte 15,56,203,209 pshufd $0x0e,%xmm0,%xmm0 nop movdqa %xmm1,%xmm9 .byte 15,56,203,202 movdqa 16-128(%rcx),%xmm0 paddd %xmm4,%xmm0 .byte 102,15,56,0,239 .byte 15,56,203,209 pshufd $0x0e,%xmm0,%xmm0 leaq 64(%rsi),%rsi .byte 15,56,204,220 .byte 15,56,203,202 movdqa 32-128(%rcx),%xmm0 paddd %xmm5,%xmm0 .byte 102,15,56,0,247 .byte 15,56,203,209 pshufd $0x0e,%xmm0,%xmm0 movdqa %xmm6,%xmm7 .byte 102,15,58,15,253,4 nop paddd %xmm7,%xmm3 .byte 15,56,204,229 .byte 15,56,203,202 movdqa 48-128(%rcx),%xmm0 paddd %xmm6,%xmm0 .byte 15,56,205,222 .byte 15,56,203,209 pshufd $0x0e,%xmm0,%xmm0 movdqa %xmm3,%xmm7 .byte 102,15,58,15,254,4 nop paddd %xmm7,%xmm4 .byte 15,56,204,238 .byte 15,56,203,202 movdqa 64-128(%rcx),%xmm0 paddd %xmm3,%xmm0 .byte 15,56,205,227 .byte 15,56,203,209 pshufd $0x0e,%xmm0,%xmm0 movdqa %xmm4,%xmm7 .byte 102,15,58,15,251,4 nop paddd %xmm7,%xmm5 .byte 15,56,204,243 .byte 15,56,203,202 movdqa 80-128(%rcx),%xmm0 paddd %xmm4,%xmm0 .byte 15,56,205,236 .byte 15,56,203,209 pshufd $0x0e,%xmm0,%xmm0 movdqa %xmm5,%xmm7 .byte 102,15,58,15,252,4 nop paddd %xmm7,%xmm6 .byte 15,56,204,220 .byte 15,56,203,202 movdqa 96-128(%rcx),%xmm0 paddd %xmm5,%xmm0 .byte 15,56,205,245 .byte 15,56,203,209 pshufd $0x0e,%xmm0,%xmm0 movdqa %xmm6,%xmm7 .byte 102,15,58,15,253,4 nop paddd %xmm7,%xmm3 .byte 15,56,204,229 .byte 15,56,203,202 movdqa 112-128(%rcx),%xmm0 paddd %xmm6,%xmm0 .byte 15,56,205,222 .byte 15,56,203,209 pshufd $0x0e,%xmm0,%xmm0 movdqa %xmm3,%xmm7 .byte 102,15,58,15,254,4 nop paddd %xmm7,%xmm4 .byte 15,56,204,238 .byte 15,56,203,202 movdqa 128-128(%rcx),%xmm0 paddd %xmm3,%xmm0 .byte 15,56,205,227 .byte 15,56,203,209 pshufd $0x0e,%xmm0,%xmm0 movdqa %xmm4,%xmm7 .byte 102,15,58,15,251,4 nop paddd %xmm7,%xmm5 .byte 15,56,204,243 .byte 15,56,203,202 movdqa 144-128(%rcx),%xmm0 paddd %xmm4,%xmm0 .byte 15,56,205,236 .byte 15,56,203,209 pshufd $0x0e,%xmm0,%xmm0 movdqa %xmm5,%xmm7 .byte 102,15,58,15,252,4 nop paddd %xmm7,%xmm6 .byte 15,56,204,220 .byte 15,56,203,202 movdqa 160-128(%rcx),%xmm0 paddd %xmm5,%xmm0 .byte 15,56,205,245 .byte 
15,56,203,209 pshufd $0x0e,%xmm0,%xmm0 movdqa %xmm6,%xmm7 .byte 102,15,58,15,253,4 nop paddd %xmm7,%xmm3 .byte 15,56,204,229 .byte 15,56,203,202 movdqa 176-128(%rcx),%xmm0 paddd %xmm6,%xmm0 .byte 15,56,205,222 .byte 15,56,203,209 pshufd $0x0e,%xmm0,%xmm0 movdqa %xmm3,%xmm7 .byte 102,15,58,15,254,4 nop paddd %xmm7,%xmm4 .byte 15,56,204,238 .byte 15,56,203,202 movdqa 192-128(%rcx),%xmm0 paddd %xmm3,%xmm0 .byte 15,56,205,227 .byte 15,56,203,209 pshufd $0x0e,%xmm0,%xmm0 movdqa %xmm4,%xmm7 .byte 102,15,58,15,251,4 nop paddd %xmm7,%xmm5 .byte 15,56,204,243 .byte 15,56,203,202 movdqa 208-128(%rcx),%xmm0 paddd %xmm4,%xmm0 .byte 15,56,205,236 .byte 15,56,203,209 pshufd $0x0e,%xmm0,%xmm0 movdqa %xmm5,%xmm7 .byte 102,15,58,15,252,4 .byte 15,56,203,202 paddd %xmm7,%xmm6 movdqa 224-128(%rcx),%xmm0 paddd %xmm5,%xmm0 .byte 15,56,203,209 pshufd $0x0e,%xmm0,%xmm0 .byte 15,56,205,245 movdqa %xmm8,%xmm7 .byte 15,56,203,202 movdqa 240-128(%rcx),%xmm0 paddd %xmm6,%xmm0 nop .byte 15,56,203,209 pshufd $0x0e,%xmm0,%xmm0 decq %rdx nop .byte 15,56,203,202 paddd %xmm10,%xmm2 paddd %xmm9,%xmm1 jnz .Loop_shaext pshufd $0xb1,%xmm2,%xmm2 pshufd $0x1b,%xmm1,%xmm7 pshufd $0xb1,%xmm1,%xmm1 punpckhqdq %xmm2,%xmm1 .byte 102,15,58,15,215,8 movdqu %xmm1,(%rdi) movdqu %xmm2,16(%rdi) movaps -80(%rbp),%xmm6 movaps -64(%rbp),%xmm7 movaps -48(%rbp),%xmm8 movaps -32(%rbp),%xmm9 movaps -16(%rbp),%xmm10 movq %rbp,%rsp popq %rbp .LSEH_epilogue_blst_sha256_block_data_order_shaext: mov 8(%rsp),%rdi mov 16(%rsp),%rsi #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .LSEH_end_blst_sha256_block_data_order_shaext: .globl blst_sha256_block_data_order .def blst_sha256_block_data_order; .scl 2; .type 32; .endef .p2align 6 blst_sha256_block_data_order: .byte 0xf3,0x0f,0x1e,0xfa movq %rdi,8(%rsp) movq %rsi,16(%rsp) movq %rsp,%r11 .LSEH_begin_blst_sha256_block_data_order: pushq %rbp movq %rsp,%rbp movq %rcx,%rdi movq %rdx,%rsi movq %r8,%rdx #ifndef __SGX_LVI_HARDENING__ testl $2,__blst_platform_cap(%rip) jnz .Lblst_sha256_block_data_order$2 #endif pushq %rbx pushq %r12 pushq %r13 pushq %r14 pushq %r15 shlq $4,%rdx subq $88,%rsp leaq (%rsi,%rdx,4),%rdx movq %rdi,-64(%rbp) movq %rdx,-48(%rbp) movaps %xmm6,-128(%rbp) movaps %xmm7,-112(%rbp) movaps %xmm8,-96(%rbp) movaps %xmm9,-80(%rbp) .LSEH_body_blst_sha256_block_data_order: leaq -64(%rsp),%rsp #ifdef __SGX_LVI_HARDENING__ lfence #endif movl 0(%rdi),%eax andq $-64,%rsp movl 4(%rdi),%ebx movl 8(%rdi),%ecx movl 12(%rdi),%edx movl 16(%rdi),%r8d movl 20(%rdi),%r9d movl 24(%rdi),%r10d movl 28(%rdi),%r11d jmp .Lloop_ssse3 .p2align 4 .Lloop_ssse3: movdqa K256+256(%rip),%xmm7 movq %rsi,-56(%rbp) movdqu 0(%rsi),%xmm0 movdqu 16(%rsi),%xmm1 movdqu 32(%rsi),%xmm2 .byte 102,15,56,0,199 movdqu 48(%rsi),%xmm3 leaq K256(%rip),%rsi .byte 102,15,56,0,207 movdqa 0(%rsi),%xmm4 movdqa 16(%rsi),%xmm5 .byte 102,15,56,0,215 paddd %xmm0,%xmm4 movdqa 32(%rsi),%xmm6 .byte 102,15,56,0,223 movdqa 48(%rsi),%xmm7 paddd %xmm1,%xmm5 paddd %xmm2,%xmm6 paddd %xmm3,%xmm7 movdqa %xmm4,0(%rsp) movl %eax,%r14d movdqa %xmm5,16(%rsp) movl %ebx,%edi movdqa %xmm6,32(%rsp) xorl %ecx,%edi movdqa %xmm7,48(%rsp) movl %r8d,%r13d jmp .Lssse3_00_47 .p2align 4 .Lssse3_00_47: subq $-64,%rsi rorl $14,%r13d movdqa %xmm1,%xmm4 movl %r14d,%eax movl %r9d,%r12d movdqa %xmm3,%xmm7 rorl $9,%r14d xorl %r8d,%r13d xorl %r10d,%r12d rorl $5,%r13d xorl %eax,%r14d .byte 102,15,58,15,224,4 andl %r8d,%r12d xorl %r8d,%r13d .byte 102,15,58,15,250,4 addl 0(%rsp),%r11d movl %eax,%r15d xorl %r10d,%r12d rorl $11,%r14d movdqa 
%xmm4,%xmm5 xorl %ebx,%r15d addl %r12d,%r11d movdqa %xmm4,%xmm6 rorl $6,%r13d andl %r15d,%edi psrld $3,%xmm4 xorl %eax,%r14d addl %r13d,%r11d xorl %ebx,%edi paddd %xmm7,%xmm0 rorl $2,%r14d addl %r11d,%edx psrld $7,%xmm6 addl %edi,%r11d movl %edx,%r13d pshufd $250,%xmm3,%xmm7 addl %r11d,%r14d rorl $14,%r13d pslld $14,%xmm5 movl %r14d,%r11d movl %r8d,%r12d pxor %xmm6,%xmm4 rorl $9,%r14d xorl %edx,%r13d xorl %r9d,%r12d rorl $5,%r13d psrld $11,%xmm6 xorl %r11d,%r14d pxor %xmm5,%xmm4 andl %edx,%r12d xorl %edx,%r13d pslld $11,%xmm5 addl 4(%rsp),%r10d movl %r11d,%edi pxor %xmm6,%xmm4 xorl %r9d,%r12d rorl $11,%r14d movdqa %xmm7,%xmm6 xorl %eax,%edi addl %r12d,%r10d pxor %xmm5,%xmm4 rorl $6,%r13d andl %edi,%r15d xorl %r11d,%r14d psrld $10,%xmm7 addl %r13d,%r10d xorl %eax,%r15d paddd %xmm4,%xmm0 rorl $2,%r14d addl %r10d,%ecx psrlq $17,%xmm6 addl %r15d,%r10d movl %ecx,%r13d addl %r10d,%r14d pxor %xmm6,%xmm7 rorl $14,%r13d movl %r14d,%r10d movl %edx,%r12d rorl $9,%r14d psrlq $2,%xmm6 xorl %ecx,%r13d xorl %r8d,%r12d pxor %xmm6,%xmm7 rorl $5,%r13d xorl %r10d,%r14d andl %ecx,%r12d pshufd $128,%xmm7,%xmm7 xorl %ecx,%r13d addl 8(%rsp),%r9d movl %r10d,%r15d psrldq $8,%xmm7 xorl %r8d,%r12d rorl $11,%r14d xorl %r11d,%r15d addl %r12d,%r9d rorl $6,%r13d paddd %xmm7,%xmm0 andl %r15d,%edi xorl %r10d,%r14d addl %r13d,%r9d pshufd $80,%xmm0,%xmm7 xorl %r11d,%edi rorl $2,%r14d addl %r9d,%ebx movdqa %xmm7,%xmm6 addl %edi,%r9d movl %ebx,%r13d psrld $10,%xmm7 addl %r9d,%r14d rorl $14,%r13d psrlq $17,%xmm6 movl %r14d,%r9d movl %ecx,%r12d pxor %xmm6,%xmm7 rorl $9,%r14d xorl %ebx,%r13d xorl %edx,%r12d rorl $5,%r13d xorl %r9d,%r14d psrlq $2,%xmm6 andl %ebx,%r12d xorl %ebx,%r13d addl 12(%rsp),%r8d pxor %xmm6,%xmm7 movl %r9d,%edi xorl %edx,%r12d rorl $11,%r14d pshufd $8,%xmm7,%xmm7 xorl %r10d,%edi addl %r12d,%r8d movdqa 0(%rsi),%xmm6 rorl $6,%r13d andl %edi,%r15d pslldq $8,%xmm7 xorl %r9d,%r14d addl %r13d,%r8d xorl %r10d,%r15d paddd %xmm7,%xmm0 rorl $2,%r14d addl %r8d,%eax addl %r15d,%r8d paddd %xmm0,%xmm6 movl %eax,%r13d addl %r8d,%r14d movdqa %xmm6,0(%rsp) rorl $14,%r13d movdqa %xmm2,%xmm4 movl %r14d,%r8d movl %ebx,%r12d movdqa %xmm0,%xmm7 rorl $9,%r14d xorl %eax,%r13d xorl %ecx,%r12d rorl $5,%r13d xorl %r8d,%r14d .byte 102,15,58,15,225,4 andl %eax,%r12d xorl %eax,%r13d .byte 102,15,58,15,251,4 addl 16(%rsp),%edx movl %r8d,%r15d xorl %ecx,%r12d rorl $11,%r14d movdqa %xmm4,%xmm5 xorl %r9d,%r15d addl %r12d,%edx movdqa %xmm4,%xmm6 rorl $6,%r13d andl %r15d,%edi psrld $3,%xmm4 xorl %r8d,%r14d addl %r13d,%edx xorl %r9d,%edi paddd %xmm7,%xmm1 rorl $2,%r14d addl %edx,%r11d psrld $7,%xmm6 addl %edi,%edx movl %r11d,%r13d pshufd $250,%xmm0,%xmm7 addl %edx,%r14d rorl $14,%r13d pslld $14,%xmm5 movl %r14d,%edx movl %eax,%r12d pxor %xmm6,%xmm4 rorl $9,%r14d xorl %r11d,%r13d xorl %ebx,%r12d rorl $5,%r13d psrld $11,%xmm6 xorl %edx,%r14d pxor %xmm5,%xmm4 andl %r11d,%r12d xorl %r11d,%r13d pslld $11,%xmm5 addl 20(%rsp),%ecx movl %edx,%edi pxor %xmm6,%xmm4 xorl %ebx,%r12d rorl $11,%r14d movdqa %xmm7,%xmm6 xorl %r8d,%edi addl %r12d,%ecx pxor %xmm5,%xmm4 rorl $6,%r13d andl %edi,%r15d xorl %edx,%r14d psrld $10,%xmm7 addl %r13d,%ecx xorl %r8d,%r15d paddd %xmm4,%xmm1 rorl $2,%r14d addl %ecx,%r10d psrlq $17,%xmm6 addl %r15d,%ecx movl %r10d,%r13d addl %ecx,%r14d pxor %xmm6,%xmm7 rorl $14,%r13d movl %r14d,%ecx movl %r11d,%r12d rorl $9,%r14d psrlq $2,%xmm6 xorl %r10d,%r13d xorl %eax,%r12d pxor %xmm6,%xmm7 rorl $5,%r13d xorl %ecx,%r14d andl %r10d,%r12d pshufd $128,%xmm7,%xmm7 xorl %r10d,%r13d addl 24(%rsp),%ebx movl %ecx,%r15d psrldq $8,%xmm7 xorl 
%eax,%r12d rorl $11,%r14d xorl %edx,%r15d addl %r12d,%ebx rorl $6,%r13d paddd %xmm7,%xmm1 andl %r15d,%edi xorl %ecx,%r14d addl %r13d,%ebx pshufd $80,%xmm1,%xmm7 xorl %edx,%edi rorl $2,%r14d addl %ebx,%r9d movdqa %xmm7,%xmm6 addl %edi,%ebx movl %r9d,%r13d psrld $10,%xmm7 addl %ebx,%r14d rorl $14,%r13d psrlq $17,%xmm6 movl %r14d,%ebx movl %r10d,%r12d pxor %xmm6,%xmm7 rorl $9,%r14d xorl %r9d,%r13d xorl %r11d,%r12d rorl $5,%r13d xorl %ebx,%r14d psrlq $2,%xmm6 andl %r9d,%r12d xorl %r9d,%r13d addl 28(%rsp),%eax pxor %xmm6,%xmm7 movl %ebx,%edi xorl %r11d,%r12d rorl $11,%r14d pshufd $8,%xmm7,%xmm7 xorl %ecx,%edi addl %r12d,%eax movdqa 16(%rsi),%xmm6 rorl $6,%r13d andl %edi,%r15d pslldq $8,%xmm7 xorl %ebx,%r14d addl %r13d,%eax xorl %ecx,%r15d paddd %xmm7,%xmm1 rorl $2,%r14d addl %eax,%r8d addl %r15d,%eax paddd %xmm1,%xmm6 movl %r8d,%r13d addl %eax,%r14d movdqa %xmm6,16(%rsp) rorl $14,%r13d movdqa %xmm3,%xmm4 movl %r14d,%eax movl %r9d,%r12d movdqa %xmm1,%xmm7 rorl $9,%r14d xorl %r8d,%r13d xorl %r10d,%r12d rorl $5,%r13d xorl %eax,%r14d .byte 102,15,58,15,226,4 andl %r8d,%r12d xorl %r8d,%r13d .byte 102,15,58,15,248,4 addl 32(%rsp),%r11d movl %eax,%r15d xorl %r10d,%r12d rorl $11,%r14d movdqa %xmm4,%xmm5 xorl %ebx,%r15d addl %r12d,%r11d movdqa %xmm4,%xmm6 rorl $6,%r13d andl %r15d,%edi psrld $3,%xmm4 xorl %eax,%r14d addl %r13d,%r11d xorl %ebx,%edi paddd %xmm7,%xmm2 rorl $2,%r14d addl %r11d,%edx psrld $7,%xmm6 addl %edi,%r11d movl %edx,%r13d pshufd $250,%xmm1,%xmm7 addl %r11d,%r14d rorl $14,%r13d pslld $14,%xmm5 movl %r14d,%r11d movl %r8d,%r12d pxor %xmm6,%xmm4 rorl $9,%r14d xorl %edx,%r13d xorl %r9d,%r12d rorl $5,%r13d psrld $11,%xmm6 xorl %r11d,%r14d pxor %xmm5,%xmm4 andl %edx,%r12d xorl %edx,%r13d pslld $11,%xmm5 addl 36(%rsp),%r10d movl %r11d,%edi pxor %xmm6,%xmm4 xorl %r9d,%r12d rorl $11,%r14d movdqa %xmm7,%xmm6 xorl %eax,%edi addl %r12d,%r10d pxor %xmm5,%xmm4 rorl $6,%r13d andl %edi,%r15d xorl %r11d,%r14d psrld $10,%xmm7 addl %r13d,%r10d xorl %eax,%r15d paddd %xmm4,%xmm2 rorl $2,%r14d addl %r10d,%ecx psrlq $17,%xmm6 addl %r15d,%r10d movl %ecx,%r13d addl %r10d,%r14d pxor %xmm6,%xmm7 rorl $14,%r13d movl %r14d,%r10d movl %edx,%r12d rorl $9,%r14d psrlq $2,%xmm6 xorl %ecx,%r13d xorl %r8d,%r12d pxor %xmm6,%xmm7 rorl $5,%r13d xorl %r10d,%r14d andl %ecx,%r12d pshufd $128,%xmm7,%xmm7 xorl %ecx,%r13d addl 40(%rsp),%r9d movl %r10d,%r15d psrldq $8,%xmm7 xorl %r8d,%r12d rorl $11,%r14d xorl %r11d,%r15d addl %r12d,%r9d rorl $6,%r13d paddd %xmm7,%xmm2 andl %r15d,%edi xorl %r10d,%r14d addl %r13d,%r9d pshufd $80,%xmm2,%xmm7 xorl %r11d,%edi rorl $2,%r14d addl %r9d,%ebx movdqa %xmm7,%xmm6 addl %edi,%r9d movl %ebx,%r13d psrld $10,%xmm7 addl %r9d,%r14d rorl $14,%r13d psrlq $17,%xmm6 movl %r14d,%r9d movl %ecx,%r12d pxor %xmm6,%xmm7 rorl $9,%r14d xorl %ebx,%r13d xorl %edx,%r12d rorl $5,%r13d xorl %r9d,%r14d psrlq $2,%xmm6 andl %ebx,%r12d xorl %ebx,%r13d addl 44(%rsp),%r8d pxor %xmm6,%xmm7 movl %r9d,%edi xorl %edx,%r12d rorl $11,%r14d pshufd $8,%xmm7,%xmm7 xorl %r10d,%edi addl %r12d,%r8d movdqa 32(%rsi),%xmm6 rorl $6,%r13d andl %edi,%r15d pslldq $8,%xmm7 xorl %r9d,%r14d addl %r13d,%r8d xorl %r10d,%r15d paddd %xmm7,%xmm2 rorl $2,%r14d addl %r8d,%eax addl %r15d,%r8d paddd %xmm2,%xmm6 movl %eax,%r13d addl %r8d,%r14d movdqa %xmm6,32(%rsp) rorl $14,%r13d movdqa %xmm0,%xmm4 movl %r14d,%r8d movl %ebx,%r12d movdqa %xmm2,%xmm7 rorl $9,%r14d xorl %eax,%r13d xorl %ecx,%r12d rorl $5,%r13d xorl %r8d,%r14d .byte 102,15,58,15,227,4 andl %eax,%r12d xorl %eax,%r13d .byte 102,15,58,15,249,4 addl 48(%rsp),%edx movl %r8d,%r15d xorl %ecx,%r12d 
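/*
 * Added note: each block of rounds in .Lssse3_00_47 interleaves the
 * scalar SHA-256 round function (Sigma1 and Ch accumulating in %r13d
 * and %r12d, Sigma0 and Maj in %r14d and %edi/%r15d) with the SSSE3
 * message-schedule update that produces the next four W[t] words in
 * %xmm0-%xmm3, pre-adding the round constants reached through %rsi
 * and parking the sums at 0(%rsp)..48(%rsp) for the scalar half.
 */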
rorl $11,%r14d movdqa %xmm4,%xmm5 xorl %r9d,%r15d addl %r12d,%edx movdqa %xmm4,%xmm6 rorl $6,%r13d andl %r15d,%edi psrld $3,%xmm4 xorl %r8d,%r14d addl %r13d,%edx xorl %r9d,%edi paddd %xmm7,%xmm3 rorl $2,%r14d addl %edx,%r11d psrld $7,%xmm6 addl %edi,%edx movl %r11d,%r13d pshufd $250,%xmm2,%xmm7 addl %edx,%r14d rorl $14,%r13d pslld $14,%xmm5 movl %r14d,%edx movl %eax,%r12d pxor %xmm6,%xmm4 rorl $9,%r14d xorl %r11d,%r13d xorl %ebx,%r12d rorl $5,%r13d psrld $11,%xmm6 xorl %edx,%r14d pxor %xmm5,%xmm4 andl %r11d,%r12d xorl %r11d,%r13d pslld $11,%xmm5 addl 52(%rsp),%ecx movl %edx,%edi pxor %xmm6,%xmm4 xorl %ebx,%r12d rorl $11,%r14d movdqa %xmm7,%xmm6 xorl %r8d,%edi addl %r12d,%ecx pxor %xmm5,%xmm4 rorl $6,%r13d andl %edi,%r15d xorl %edx,%r14d psrld $10,%xmm7 addl %r13d,%ecx xorl %r8d,%r15d paddd %xmm4,%xmm3 rorl $2,%r14d addl %ecx,%r10d psrlq $17,%xmm6 addl %r15d,%ecx movl %r10d,%r13d addl %ecx,%r14d pxor %xmm6,%xmm7 rorl $14,%r13d movl %r14d,%ecx movl %r11d,%r12d rorl $9,%r14d psrlq $2,%xmm6 xorl %r10d,%r13d xorl %eax,%r12d pxor %xmm6,%xmm7 rorl $5,%r13d xorl %ecx,%r14d andl %r10d,%r12d pshufd $128,%xmm7,%xmm7 xorl %r10d,%r13d addl 56(%rsp),%ebx movl %ecx,%r15d psrldq $8,%xmm7 xorl %eax,%r12d rorl $11,%r14d xorl %edx,%r15d addl %r12d,%ebx rorl $6,%r13d paddd %xmm7,%xmm3 andl %r15d,%edi xorl %ecx,%r14d addl %r13d,%ebx pshufd $80,%xmm3,%xmm7 xorl %edx,%edi rorl $2,%r14d addl %ebx,%r9d movdqa %xmm7,%xmm6 addl %edi,%ebx movl %r9d,%r13d psrld $10,%xmm7 addl %ebx,%r14d rorl $14,%r13d psrlq $17,%xmm6 movl %r14d,%ebx movl %r10d,%r12d pxor %xmm6,%xmm7 rorl $9,%r14d xorl %r9d,%r13d xorl %r11d,%r12d rorl $5,%r13d xorl %ebx,%r14d psrlq $2,%xmm6 andl %r9d,%r12d xorl %r9d,%r13d addl 60(%rsp),%eax pxor %xmm6,%xmm7 movl %ebx,%edi xorl %r11d,%r12d rorl $11,%r14d pshufd $8,%xmm7,%xmm7 xorl %ecx,%edi addl %r12d,%eax movdqa 48(%rsi),%xmm6 rorl $6,%r13d andl %edi,%r15d pslldq $8,%xmm7 xorl %ebx,%r14d addl %r13d,%eax xorl %ecx,%r15d paddd %xmm7,%xmm3 rorl $2,%r14d addl %eax,%r8d addl %r15d,%eax paddd %xmm3,%xmm6 movl %r8d,%r13d addl %eax,%r14d movdqa %xmm6,48(%rsp) cmpb $0,67(%rsi) jne .Lssse3_00_47 rorl $14,%r13d movl %r14d,%eax movl %r9d,%r12d rorl $9,%r14d xorl %r8d,%r13d xorl %r10d,%r12d rorl $5,%r13d xorl %eax,%r14d andl %r8d,%r12d xorl %r8d,%r13d addl 0(%rsp),%r11d movl %eax,%r15d xorl %r10d,%r12d rorl $11,%r14d xorl %ebx,%r15d addl %r12d,%r11d rorl $6,%r13d andl %r15d,%edi xorl %eax,%r14d addl %r13d,%r11d xorl %ebx,%edi rorl $2,%r14d addl %r11d,%edx addl %edi,%r11d movl %edx,%r13d addl %r11d,%r14d rorl $14,%r13d movl %r14d,%r11d movl %r8d,%r12d rorl $9,%r14d xorl %edx,%r13d xorl %r9d,%r12d rorl $5,%r13d xorl %r11d,%r14d andl %edx,%r12d xorl %edx,%r13d addl 4(%rsp),%r10d movl %r11d,%edi xorl %r9d,%r12d rorl $11,%r14d xorl %eax,%edi addl %r12d,%r10d rorl $6,%r13d andl %edi,%r15d xorl %r11d,%r14d addl %r13d,%r10d xorl %eax,%r15d rorl $2,%r14d addl %r10d,%ecx addl %r15d,%r10d movl %ecx,%r13d addl %r10d,%r14d rorl $14,%r13d movl %r14d,%r10d movl %edx,%r12d rorl $9,%r14d xorl %ecx,%r13d xorl %r8d,%r12d rorl $5,%r13d xorl %r10d,%r14d andl %ecx,%r12d xorl %ecx,%r13d addl 8(%rsp),%r9d movl %r10d,%r15d xorl %r8d,%r12d rorl $11,%r14d xorl %r11d,%r15d addl %r12d,%r9d rorl $6,%r13d andl %r15d,%edi xorl %r10d,%r14d addl %r13d,%r9d xorl %r11d,%edi rorl $2,%r14d addl %r9d,%ebx addl %edi,%r9d movl %ebx,%r13d addl %r9d,%r14d rorl $14,%r13d movl %r14d,%r9d movl %ecx,%r12d rorl $9,%r14d xorl %ebx,%r13d xorl %edx,%r12d rorl $5,%r13d xorl %r9d,%r14d andl %ebx,%r12d xorl %ebx,%r13d addl 12(%rsp),%r8d movl %r9d,%edi xorl %edx,%r12d 
rorl $11,%r14d xorl %r10d,%edi addl %r12d,%r8d rorl $6,%r13d andl %edi,%r15d xorl %r9d,%r14d addl %r13d,%r8d xorl %r10d,%r15d rorl $2,%r14d addl %r8d,%eax addl %r15d,%r8d movl %eax,%r13d addl %r8d,%r14d rorl $14,%r13d movl %r14d,%r8d movl %ebx,%r12d rorl $9,%r14d xorl %eax,%r13d xorl %ecx,%r12d rorl $5,%r13d xorl %r8d,%r14d andl %eax,%r12d xorl %eax,%r13d addl 16(%rsp),%edx movl %r8d,%r15d xorl %ecx,%r12d rorl $11,%r14d xorl %r9d,%r15d addl %r12d,%edx rorl $6,%r13d andl %r15d,%edi xorl %r8d,%r14d addl %r13d,%edx xorl %r9d,%edi rorl $2,%r14d addl %edx,%r11d addl %edi,%edx movl %r11d,%r13d addl %edx,%r14d rorl $14,%r13d movl %r14d,%edx movl %eax,%r12d rorl $9,%r14d xorl %r11d,%r13d xorl %ebx,%r12d rorl $5,%r13d xorl %edx,%r14d andl %r11d,%r12d xorl %r11d,%r13d addl 20(%rsp),%ecx movl %edx,%edi xorl %ebx,%r12d rorl $11,%r14d xorl %r8d,%edi addl %r12d,%ecx rorl $6,%r13d andl %edi,%r15d xorl %edx,%r14d addl %r13d,%ecx xorl %r8d,%r15d rorl $2,%r14d addl %ecx,%r10d addl %r15d,%ecx movl %r10d,%r13d addl %ecx,%r14d rorl $14,%r13d movl %r14d,%ecx movl %r11d,%r12d rorl $9,%r14d xorl %r10d,%r13d xorl %eax,%r12d rorl $5,%r13d xorl %ecx,%r14d andl %r10d,%r12d xorl %r10d,%r13d addl 24(%rsp),%ebx movl %ecx,%r15d xorl %eax,%r12d rorl $11,%r14d xorl %edx,%r15d addl %r12d,%ebx rorl $6,%r13d andl %r15d,%edi xorl %ecx,%r14d addl %r13d,%ebx xorl %edx,%edi rorl $2,%r14d addl %ebx,%r9d addl %edi,%ebx movl %r9d,%r13d addl %ebx,%r14d rorl $14,%r13d movl %r14d,%ebx movl %r10d,%r12d rorl $9,%r14d xorl %r9d,%r13d xorl %r11d,%r12d rorl $5,%r13d xorl %ebx,%r14d andl %r9d,%r12d xorl %r9d,%r13d addl 28(%rsp),%eax movl %ebx,%edi xorl %r11d,%r12d rorl $11,%r14d xorl %ecx,%edi addl %r12d,%eax rorl $6,%r13d andl %edi,%r15d xorl %ebx,%r14d addl %r13d,%eax xorl %ecx,%r15d rorl $2,%r14d addl %eax,%r8d addl %r15d,%eax movl %r8d,%r13d addl %eax,%r14d rorl $14,%r13d movl %r14d,%eax movl %r9d,%r12d rorl $9,%r14d xorl %r8d,%r13d xorl %r10d,%r12d rorl $5,%r13d xorl %eax,%r14d andl %r8d,%r12d xorl %r8d,%r13d addl 32(%rsp),%r11d movl %eax,%r15d xorl %r10d,%r12d rorl $11,%r14d xorl %ebx,%r15d addl %r12d,%r11d rorl $6,%r13d andl %r15d,%edi xorl %eax,%r14d addl %r13d,%r11d xorl %ebx,%edi rorl $2,%r14d addl %r11d,%edx addl %edi,%r11d movl %edx,%r13d addl %r11d,%r14d rorl $14,%r13d movl %r14d,%r11d movl %r8d,%r12d rorl $9,%r14d xorl %edx,%r13d xorl %r9d,%r12d rorl $5,%r13d xorl %r11d,%r14d andl %edx,%r12d xorl %edx,%r13d addl 36(%rsp),%r10d movl %r11d,%edi xorl %r9d,%r12d rorl $11,%r14d xorl %eax,%edi addl %r12d,%r10d rorl $6,%r13d andl %edi,%r15d xorl %r11d,%r14d addl %r13d,%r10d xorl %eax,%r15d rorl $2,%r14d addl %r10d,%ecx addl %r15d,%r10d movl %ecx,%r13d addl %r10d,%r14d rorl $14,%r13d movl %r14d,%r10d movl %edx,%r12d rorl $9,%r14d xorl %ecx,%r13d xorl %r8d,%r12d rorl $5,%r13d xorl %r10d,%r14d andl %ecx,%r12d xorl %ecx,%r13d addl 40(%rsp),%r9d movl %r10d,%r15d xorl %r8d,%r12d rorl $11,%r14d xorl %r11d,%r15d addl %r12d,%r9d rorl $6,%r13d andl %r15d,%edi xorl %r10d,%r14d addl %r13d,%r9d xorl %r11d,%edi rorl $2,%r14d addl %r9d,%ebx addl %edi,%r9d movl %ebx,%r13d addl %r9d,%r14d rorl $14,%r13d movl %r14d,%r9d movl %ecx,%r12d rorl $9,%r14d xorl %ebx,%r13d xorl %edx,%r12d rorl $5,%r13d xorl %r9d,%r14d andl %ebx,%r12d xorl %ebx,%r13d addl 44(%rsp),%r8d movl %r9d,%edi xorl %edx,%r12d rorl $11,%r14d xorl %r10d,%edi addl %r12d,%r8d rorl $6,%r13d andl %edi,%r15d xorl %r9d,%r14d addl %r13d,%r8d xorl %r10d,%r15d rorl $2,%r14d addl %r8d,%eax addl %r15d,%r8d movl %eax,%r13d addl %r8d,%r14d rorl $14,%r13d movl %r14d,%r8d movl %ebx,%r12d rorl $9,%r14d 
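/*
 * Added note: these are the final sixteen rounds of the block; the
 * cmpb $0,67(%rsi) / jne .Lssse3_00_47 test above detects the end of
 * the constant table (the byte after K256 is the 0x00 of the shuffle
 * mask), after which the remaining rounds only consume the already
 * scheduled W[t] values at 0(%rsp)..60(%rsp) with no further xmm
 * message-schedule work.
 */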
xorl %eax,%r13d xorl %ecx,%r12d rorl $5,%r13d xorl %r8d,%r14d andl %eax,%r12d xorl %eax,%r13d addl 48(%rsp),%edx movl %r8d,%r15d xorl %ecx,%r12d rorl $11,%r14d xorl %r9d,%r15d addl %r12d,%edx rorl $6,%r13d andl %r15d,%edi xorl %r8d,%r14d addl %r13d,%edx xorl %r9d,%edi rorl $2,%r14d addl %edx,%r11d addl %edi,%edx movl %r11d,%r13d addl %edx,%r14d rorl $14,%r13d movl %r14d,%edx movl %eax,%r12d rorl $9,%r14d xorl %r11d,%r13d xorl %ebx,%r12d rorl $5,%r13d xorl %edx,%r14d andl %r11d,%r12d xorl %r11d,%r13d addl 52(%rsp),%ecx movl %edx,%edi xorl %ebx,%r12d rorl $11,%r14d xorl %r8d,%edi addl %r12d,%ecx rorl $6,%r13d andl %edi,%r15d xorl %edx,%r14d addl %r13d,%ecx xorl %r8d,%r15d rorl $2,%r14d addl %ecx,%r10d addl %r15d,%ecx movl %r10d,%r13d addl %ecx,%r14d rorl $14,%r13d movl %r14d,%ecx movl %r11d,%r12d rorl $9,%r14d xorl %r10d,%r13d xorl %eax,%r12d rorl $5,%r13d xorl %ecx,%r14d andl %r10d,%r12d xorl %r10d,%r13d addl 56(%rsp),%ebx movl %ecx,%r15d xorl %eax,%r12d rorl $11,%r14d xorl %edx,%r15d addl %r12d,%ebx rorl $6,%r13d andl %r15d,%edi xorl %ecx,%r14d addl %r13d,%ebx xorl %edx,%edi rorl $2,%r14d addl %ebx,%r9d addl %edi,%ebx movl %r9d,%r13d addl %ebx,%r14d rorl $14,%r13d movl %r14d,%ebx movl %r10d,%r12d rorl $9,%r14d xorl %r9d,%r13d xorl %r11d,%r12d rorl $5,%r13d xorl %ebx,%r14d andl %r9d,%r12d xorl %r9d,%r13d addl 60(%rsp),%eax movl %ebx,%edi xorl %r11d,%r12d rorl $11,%r14d xorl %ecx,%edi addl %r12d,%eax rorl $6,%r13d andl %edi,%r15d xorl %ebx,%r14d addl %r13d,%eax xorl %ecx,%r15d rorl $2,%r14d addl %eax,%r8d addl %r15d,%eax movl %r8d,%r13d addl %eax,%r14d movq -64(%rbp),%rdi movl %r14d,%eax movq -56(%rbp),%rsi #ifdef __SGX_LVI_HARDENING__ lfence #endif addl 0(%rdi),%eax addl 4(%rdi),%ebx addl 8(%rdi),%ecx addl 12(%rdi),%edx addl 16(%rdi),%r8d addl 20(%rdi),%r9d addl 24(%rdi),%r10d addl 28(%rdi),%r11d leaq 64(%rsi),%rsi cmpq -48(%rbp),%rsi movl %eax,0(%rdi) movl %ebx,4(%rdi) movl %ecx,8(%rdi) movl %edx,12(%rdi) movl %r8d,16(%rdi) movl %r9d,20(%rdi) movl %r10d,24(%rdi) movl %r11d,28(%rdi) jb .Lloop_ssse3 xorps %xmm0,%xmm0 movaps %xmm0,0(%rsp) movaps %xmm0,16(%rsp) movaps %xmm0,32(%rsp) movaps %xmm0,48(%rsp) movaps -128(%rbp),%xmm6 movaps -112(%rbp),%xmm7 movaps -96(%rbp),%xmm8 movaps -80(%rbp),%xmm9 movq -40(%rbp),%r15 movq -32(%rbp),%r14 movq -24(%rbp),%r13 movq -16(%rbp),%r12 movq -8(%rbp),%rbx movq %rbp,%rsp popq %rbp .LSEH_epilogue_blst_sha256_block_data_order: mov 8(%rsp),%rdi mov 16(%rsp),%rsi #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .LSEH_end_blst_sha256_block_data_order: .globl blst_sha256_emit .def blst_sha256_emit; .scl 2; .type 32; .endef .p2align 4 blst_sha256_emit: .byte 0xf3,0x0f,0x1e,0xfa #ifdef __SGX_LVI_HARDENING__ lfence #endif movq 0(%rdx),%r8 movq 8(%rdx),%r9 movq 16(%rdx),%r10 bswapq %r8 movq 24(%rdx),%r11 bswapq %r9 movl %r8d,4(%rcx) bswapq %r10 movl %r9d,12(%rcx) bswapq %r11 movl %r10d,20(%rcx) shrq $32,%r8 movl %r11d,28(%rcx) shrq $32,%r9 movl %r8d,0(%rcx) shrq $32,%r10 movl %r9d,8(%rcx) shrq $32,%r11 movl %r10d,16(%rcx) movl %r11d,24(%rcx) #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .globl blst_sha256_bcopy .def blst_sha256_bcopy; .scl 2; .type 32; .endef .p2align 4 blst_sha256_bcopy: .byte 0xf3,0x0f,0x1e,0xfa #ifdef __SGX_LVI_HARDENING__ lfence #endif subq %rdx,%rcx .Loop_bcopy: movzbl (%rdx),%eax leaq 1(%rdx),%rdx movb %al,-1(%rcx,%rdx,1) decq %r8 jnz .Loop_bcopy #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .globl blst_sha256_hcopy 
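/*
 * Added note: blst_sha256_emit(md, h) above stores the eight 32-bit
 * state words big-endian (64-bit loads, bswap, split 32-bit stores),
 * which a C sketch makes plainer (illustrative only; the real
 * prototype lives in the library's internal headers):
 *
 *   void sha256_emit_ref(unsigned char md[32], const unsigned int h[8])
 *   {
 *       for (int i = 0; i < 8; i++) {   // emit each word big-endian
 *           md[4*i+0] = (unsigned char)(h[i] >> 24);
 *           md[4*i+1] = (unsigned char)(h[i] >> 16);
 *           md[4*i+2] = (unsigned char)(h[i] >>  8);
 *           md[4*i+3] = (unsigned char)(h[i] >>  0);
 *       }
 *   }
 *
 * blst_sha256_bcopy is a byte-at-a-time copy (no libc memcpy), and
 * blst_sha256_hcopy, declared below, copies the 32-byte state as four
 * 64-bit words.
 */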
.def blst_sha256_hcopy; .scl 2; .type 32; .endef .p2align 4 blst_sha256_hcopy: .byte 0xf3,0x0f,0x1e,0xfa #ifdef __SGX_LVI_HARDENING__ lfence #endif movq 0(%rdx),%r8 movq 8(%rdx),%r9 movq 16(%rdx),%r10 movq 24(%rdx),%r11 movq %r8,0(%rcx) movq %r9,8(%rcx) movq %r10,16(%rcx) movq %r11,24(%rcx) #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .section .pdata .p2align 2 .rva .LSEH_begin_blst_sha256_block_data_order_shaext .rva .LSEH_body_blst_sha256_block_data_order_shaext .rva .LSEH_info_blst_sha256_block_data_order_shaext_prologue .rva .LSEH_body_blst_sha256_block_data_order_shaext .rva .LSEH_epilogue_blst_sha256_block_data_order_shaext .rva .LSEH_info_blst_sha256_block_data_order_shaext_body .rva .LSEH_epilogue_blst_sha256_block_data_order_shaext .rva .LSEH_end_blst_sha256_block_data_order_shaext .rva .LSEH_info_blst_sha256_block_data_order_shaext_epilogue .rva .LSEH_begin_blst_sha256_block_data_order .rva .LSEH_body_blst_sha256_block_data_order .rva .LSEH_info_blst_sha256_block_data_order_prologue .rva .LSEH_body_blst_sha256_block_data_order .rva .LSEH_epilogue_blst_sha256_block_data_order .rva .LSEH_info_blst_sha256_block_data_order_body .rva .LSEH_epilogue_blst_sha256_block_data_order .rva .LSEH_end_blst_sha256_block_data_order .rva .LSEH_info_blst_sha256_block_data_order_epilogue .section .xdata .p2align 3 .LSEH_info_blst_sha256_block_data_order_shaext_prologue: .byte 1,4,6,0x05 .byte 4,0x74,2,0 .byte 4,0x64,3,0 .byte 4,0x53 .byte 1,0x50 .long 0,0 .LSEH_info_blst_sha256_block_data_order_shaext_body: .byte 1,0,17,85 .byte 0x00,0x68,0x00,0x00 .byte 0x00,0x78,0x01,0x00 .byte 0x00,0x88,0x02,0x00 .byte 0x00,0x98,0x03,0x00 .byte 0x00,0xa8,0x04,0x00 .byte 0x00,0x74,0x0c,0x00 .byte 0x00,0x64,0x0d,0x00 .byte 0x00,0x53 .byte 0x00,0x92 .byte 0x00,0x50 .byte 0x00,0x00,0x00,0x00,0x00,0x00 .byte 0x00,0x00,0x00,0x00 .LSEH_info_blst_sha256_block_data_order_shaext_epilogue: .byte 1,0,4,0 .byte 0x00,0x74,0x01,0x00 .byte 0x00,0x64,0x02,0x00 .byte 0x00,0x00,0x00,0x00 .LSEH_info_blst_sha256_block_data_order_prologue: .byte 1,4,6,0x05 .byte 4,0x74,2,0 .byte 4,0x64,3,0 .byte 4,0x53 .byte 1,0x50 .long 0,0 .LSEH_info_blst_sha256_block_data_order_body: .byte 1,0,25,133 .byte 0x00,0x68,0x00,0x00 .byte 0x00,0x78,0x01,0x00 .byte 0x00,0x88,0x02,0x00 .byte 0x00,0x98,0x03,0x00 .byte 0x00,0xf4,0x0b,0x00 .byte 0x00,0xe4,0x0c,0x00 .byte 0x00,0xd4,0x0d,0x00 .byte 0x00,0xc4,0x0e,0x00 .byte 0x00,0x34,0x0f,0x00 .byte 0x00,0x74,0x12,0x00 .byte 0x00,0x64,0x13,0x00 .byte 0x00,0x53 .byte 0x00,0xf2 .byte 0x00,0x50 .byte 0x00,0x00,0x00,0x00,0x00,0x00 .byte 0x00,0x00,0x00,0x00 .LSEH_info_blst_sha256_block_data_order_epilogue: .byte 1,0,4,0 .byte 0x00,0x74,0x01,0x00 .byte 0x00,0x64,0x02,0x00 .byte 0x00,0x00,0x00,0x00 ================================================ FILE: build/elf/add_mod_256-armv8.S ================================================ #if defined(__ARM_FEATURE_PAC_DEFAULT) && __ARM_FEATURE_PAC_DEFAULT==2 # define PACI_HINT 27 # define AUTI_HINT 31 #else # define PACI_HINT 25 # define AUTI_HINT 29 #endif .text .globl add_mod_256 .hidden add_mod_256 .type add_mod_256,%function .align 5 add_mod_256: hint #34 ldp x8,x9,[x1] ldp x12,x13,[x2] ldp x10,x11,[x1,#16] adds x8,x8,x12 ldp x14,x15,[x2,#16] adcs x9,x9,x13 ldp x4,x5,[x3] adcs x10,x10,x14 ldp x6,x7,[x3,#16] adcs x11,x11,x15 adc x3,xzr,xzr subs x16,x8,x4 sbcs x17,x9,x5 sbcs x1,x10,x6 sbcs x2,x11,x7 sbcs xzr,x3,xzr csel x8,x8,x16,lo csel x9,x9,x17,lo csel x10,x10,x1,lo stp x8,x9,[x0] csel x11,x11,x2,lo stp x10,x11,[x0,#16] ret 
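/*
 * Added note: add_mod_256(ret, a, b, p) computes ret = (a + b) mod p
 * over four 64-bit limbs: a carried 256-bit add, a trial subtraction
 * of p, and "csel ..,lo" keeping the unreduced sum only when the
 * subtraction borrowed, i.e. branch-free selection. A rough C sketch
 * of the same pattern (illustrative only, not the library's API):
 *
 *   typedef unsigned long long limb_t;
 *   static void add_mod_256_ref(limb_t r[4], const limb_t a[4],
 *                               const limb_t b[4], const limb_t p[4])
 *   {
 *       limb_t t[4], s[4], borrow = 0;
 *       unsigned __int128 acc = 0;
 *       for (int i = 0; i < 4; i++) {           // 256-bit add
 *           acc += (unsigned __int128)a[i] + b[i];
 *           t[i] = (limb_t)acc; acc >>= 64;
 *       }
 *       limb_t carry = (limb_t)acc;
 *       for (int i = 0; i < 4; i++) {           // trial subtract p
 *           unsigned __int128 d = (unsigned __int128)t[i] - p[i] - borrow;
 *           s[i] = (limb_t)d; borrow = (limb_t)(d >> 64) & 1;
 *       }
 *       // reduce when the add overflowed 2^256 or the sum was >= p
 *       limb_t mask = (limb_t)0 - (carry | (borrow ^ 1));
 *       for (int i = 0; i < 4; i++)
 *           r[i] = (t[i] & ~mask) | (s[i] & mask);
 *   }
 */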
.size add_mod_256,.-add_mod_256 .globl mul_by_3_mod_256 .hidden mul_by_3_mod_256 .type mul_by_3_mod_256,%function .align 5 mul_by_3_mod_256: hint #34 ldp x12,x13,[x1] ldp x14,x15,[x1,#16] adds x8,x12,x12 ldp x4,x5,[x2] adcs x9,x13,x13 ldp x6,x7,[x2,#16] adcs x10,x14,x14 adcs x11,x15,x15 adc x3,xzr,xzr subs x16,x8,x4 sbcs x17,x9,x5 sbcs x1,x10,x6 sbcs x2,x11,x7 sbcs xzr,x3,xzr csel x8,x8,x16,lo csel x9,x9,x17,lo csel x10,x10,x1,lo csel x11,x11,x2,lo adds x8,x8,x12 adcs x9,x9,x13 adcs x10,x10,x14 adcs x11,x11,x15 adc x3,xzr,xzr subs x16,x8,x4 sbcs x17,x9,x5 sbcs x1,x10,x6 sbcs x2,x11,x7 sbcs xzr,x3,xzr csel x8,x8,x16,lo csel x9,x9,x17,lo csel x10,x10,x1,lo stp x8,x9,[x0] csel x11,x11,x2,lo stp x10,x11,[x0,#16] ret .size mul_by_3_mod_256,.-mul_by_3_mod_256 .globl lshift_mod_256 .hidden lshift_mod_256 .type lshift_mod_256,%function .align 5 lshift_mod_256: hint #34 ldp x8,x9,[x1] ldp x10,x11,[x1,#16] ldp x4,x5,[x3] ldp x6,x7,[x3,#16] .Loop_lshift_mod_256: adds x8,x8,x8 sub x2,x2,#1 adcs x9,x9,x9 adcs x10,x10,x10 adcs x11,x11,x11 adc x3,xzr,xzr subs x12,x8,x4 sbcs x13,x9,x5 sbcs x14,x10,x6 sbcs x15,x11,x7 sbcs xzr,x3,xzr csel x8,x8,x12,lo csel x9,x9,x13,lo csel x10,x10,x14,lo csel x11,x11,x15,lo cbnz x2,.Loop_lshift_mod_256 stp x8,x9,[x0] stp x10,x11,[x0,#16] ret .size lshift_mod_256,.-lshift_mod_256 .globl rshift_mod_256 .hidden rshift_mod_256 .type rshift_mod_256,%function .align 5 rshift_mod_256: hint #34 ldp x8,x9,[x1] ldp x10,x11,[x1,#16] ldp x4,x5,[x3] ldp x6,x7,[x3,#16] .Loop_rshift: adds x12,x8,x4 sub x2,x2,#1 adcs x13,x9,x5 adcs x14,x10,x6 adcs x15,x11,x7 adc x3,xzr,xzr tst x8,#1 csel x12,x12,x8,ne csel x13,x13,x9,ne csel x14,x14,x10,ne csel x15,x15,x11,ne csel x3,x3,xzr,ne extr x8,x13,x12,#1 extr x9,x14,x13,#1 extr x10,x15,x14,#1 extr x11,x3,x15,#1 cbnz x2,.Loop_rshift stp x8,x9,[x0] stp x10,x11,[x0,#16] ret .size rshift_mod_256,.-rshift_mod_256 .globl cneg_mod_256 .hidden cneg_mod_256 .type cneg_mod_256,%function .align 5 cneg_mod_256: ldp x8,x9,[x1] ldp x4,x5,[x3] ldp x10,x11,[x1,#16] subs x12,x4,x8 ldp x6,x7,[x3,#16] orr x4,x8,x9 sbcs x13,x5,x9 orr x5,x10,x11 sbcs x14,x6,x10 orr x3,x4,x5 sbc x15,x7,x11 cmp x3,#0 csetm x3,ne ands x2,x2,x3 csel x8,x8,x12,eq csel x9,x9,x13,eq csel x10,x10,x14,eq stp x8,x9,[x0] csel x11,x11,x15,eq stp x10,x11,[x0,#16] ret .size cneg_mod_256,.-cneg_mod_256 .globl sub_mod_256 .hidden sub_mod_256 .type sub_mod_256,%function .align 5 sub_mod_256: ldp x8,x9,[x1] ldp x12,x13,[x2] ldp x10,x11,[x1,#16] subs x8,x8,x12 ldp x14,x15,[x2,#16] sbcs x9,x9,x13 ldp x4,x5,[x3] sbcs x10,x10,x14 ldp x6,x7,[x3,#16] sbcs x11,x11,x15 sbc x3,xzr,xzr and x4,x4,x3 and x5,x5,x3 adds x8,x8,x4 and x6,x6,x3 adcs x9,x9,x5 and x7,x7,x3 adcs x10,x10,x6 stp x8,x9,[x0] adc x11,x11,x7 stp x10,x11,[x0,#16] ret .size sub_mod_256,.-sub_mod_256 .globl check_mod_256 .hidden check_mod_256 .type check_mod_256,%function .align 5 check_mod_256: ldp x8,x9,[x0] ldp x10,x11,[x0,#16] ldp x4,x5,[x1] ldp x6,x7,[x1,#16] #ifdef __AARCH64EB__ rev x8,x8 rev x9,x9 rev x10,x10 rev x11,x11 #endif subs xzr,x8,x4 sbcs xzr,x9,x5 orr x8,x8,x9 sbcs xzr,x10,x6 orr x8,x8,x10 sbcs xzr,x11,x7 orr x8,x8,x11 sbc x1,xzr,xzr cmp x8,#0 mov x0,#1 csel x0,x0,xzr,ne and x0,x0,x1 ret .size check_mod_256,.-check_mod_256 .globl add_n_check_mod_256 .hidden add_n_check_mod_256 .type add_n_check_mod_256,%function .align 5 add_n_check_mod_256: ldp x8,x9,[x1] ldp x12,x13,[x2] ldp x10,x11,[x1,#16] ldp x14,x15,[x2,#16] #ifdef __AARCH64EB__ rev x8,x8 rev x12,x12 rev x9,x9 rev x13,x13 rev x10,x10 rev x14,x14 rev x11,x11 rev x15,x15 
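/*
 * Added note: check_mod_256(a, p) returns 1 only when a is non-zero
 * and strictly less than p (the sbc produces the a < p flag, the orr
 * chain the non-zero flag). add_n_check_mod_256/sub_n_check_mod_256
 * do the modular add/sub on little-endian-encoded operands (hence the
 * rev byte-swaps guarded by __AARCH64EB__ on big-endian targets) and
 * likewise return whether the result is non-zero.
 */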
#endif adds x8,x8,x12 ldp x4,x5,[x3] adcs x9,x9,x13 ldp x6,x7,[x3,#16] adcs x10,x10,x14 adcs x11,x11,x15 adc x3,xzr,xzr subs x16,x8,x4 sbcs x17,x9,x5 sbcs x1,x10,x6 sbcs x2,x11,x7 sbcs xzr,x3,xzr csel x8,x8,x16,lo csel x9,x9,x17,lo csel x10,x10,x1,lo csel x11,x11,x2,lo orr x16, x8, x9 orr x17, x10, x11 orr x16, x16, x17 #ifdef __AARCH64EB__ rev x8,x8 rev x9,x9 rev x10,x10 rev x11,x11 #endif stp x8,x9,[x0] stp x10,x11,[x0,#16] mov x17, #1 cmp x16, #0 csel x0, x17, xzr, ne ret .size add_n_check_mod_256,.-add_n_check_mod_256 .globl sub_n_check_mod_256 .hidden sub_n_check_mod_256 .type sub_n_check_mod_256,%function .align 5 sub_n_check_mod_256: ldp x8,x9,[x1] ldp x12,x13,[x2] ldp x10,x11,[x1,#16] ldp x14,x15,[x2,#16] #ifdef __AARCH64EB__ rev x8,x8 rev x12,x12 rev x9,x9 rev x13,x13 rev x10,x10 rev x14,x14 rev x11,x11 rev x15,x15 #endif subs x8,x8,x12 sbcs x9,x9,x13 ldp x4,x5,[x3] sbcs x10,x10,x14 ldp x6,x7,[x3,#16] sbcs x11,x11,x15 sbc x3,xzr,xzr and x4,x4,x3 and x5,x5,x3 adds x8,x8,x4 and x6,x6,x3 adcs x9,x9,x5 and x7,x7,x3 adcs x10,x10,x6 adc x11,x11,x7 orr x16, x8, x9 orr x17, x10, x11 orr x16, x16, x17 #ifdef __AARCH64EB__ rev x8,x8 rev x9,x9 rev x10,x10 rev x11,x11 #endif stp x8,x9,[x0] stp x10,x11,[x0,#16] mov x17, #1 cmp x16, #0 csel x0, x17, xzr, ne ret .size sub_n_check_mod_256,.-sub_n_check_mod_256 #if defined(__ARM_FEATURE_BTI_DEFAULT) || defined(__ARM_FEATURE_PAC_DEFAULT) .section .note.GNU-stack,"",@progbits .section .note.gnu.property,"a",@note .long 4,2f-1f,5 .byte 0x47,0x4E,0x55,0 1: .long 0xc0000000,4,3 .align 3 2: #endif ================================================ FILE: build/elf/add_mod_256-x86_64.s ================================================ .text .globl add_mod_256 .hidden add_mod_256 .type add_mod_256,@function .align 32 add_mod_256: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 pushq %rbx .cfi_adjust_cfa_offset 8 .cfi_offset %rbx,-24 subq $8,%rsp .cfi_adjust_cfa_offset 8 #ifdef __SGX_LVI_HARDENING__ lfence #endif movq 0(%rsi),%r8 movq 8(%rsi),%r9 movq 16(%rsi),%r10 movq 24(%rsi),%r11 .Loaded_a_add_mod_256: addq 0(%rdx),%r8 adcq 8(%rdx),%r9 movq %r8,%rax adcq 16(%rdx),%r10 movq %r9,%rsi adcq 24(%rdx),%r11 sbbq %rdx,%rdx movq %r10,%rbx subq 0(%rcx),%r8 sbbq 8(%rcx),%r9 sbbq 16(%rcx),%r10 movq %r11,%rbp sbbq 24(%rcx),%r11 sbbq $0,%rdx cmovcq %rax,%r8 cmovcq %rsi,%r9 movq %r8,0(%rdi) cmovcq %rbx,%r10 movq %r9,8(%rdi) cmovcq %rbp,%r11 movq %r10,16(%rdi) movq %r11,24(%rdi) movq 8(%rsp),%rbx .cfi_restore %rbx movq 16(%rsp),%rbp .cfi_restore %rbp leaq 24(%rsp),%rsp .cfi_adjust_cfa_offset -24 #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .size add_mod_256,.-add_mod_256 .globl mul_by_3_mod_256 .hidden mul_by_3_mod_256 .type mul_by_3_mod_256,@function .align 32 mul_by_3_mod_256: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 pushq %rbx .cfi_adjust_cfa_offset 8 .cfi_offset %rbx,-24 pushq %r12 .cfi_adjust_cfa_offset 8 .cfi_offset %r12,-32 movq %rdx,%rcx #ifdef __SGX_LVI_HARDENING__ lfence #endif movq 0(%rsi),%r8 movq 8(%rsi),%r9 movq 16(%rsi),%r10 movq %rsi,%rdx movq 24(%rsi),%r11 call __lshift_mod_256 movq 0(%rsp),%r12 .cfi_restore %r12 jmp .Loaded_a_add_mod_256 movq 8(%rsp),%rbx .cfi_restore %rbx movq 16(%rsp),%rbp .cfi_restore %rbp leaq 24(%rsp),%rsp .cfi_adjust_cfa_offset -24 #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .size 
mul_by_3_mod_256,.-mul_by_3_mod_256 .type __lshift_mod_256,@function .align 32 __lshift_mod_256: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa addq %r8,%r8 adcq %r9,%r9 movq %r8,%rax adcq %r10,%r10 movq %r9,%rsi adcq %r11,%r11 sbbq %r12,%r12 movq %r10,%rbx subq 0(%rcx),%r8 sbbq 8(%rcx),%r9 sbbq 16(%rcx),%r10 movq %r11,%rbp sbbq 24(%rcx),%r11 sbbq $0,%r12 cmovcq %rax,%r8 cmovcq %rsi,%r9 cmovcq %rbx,%r10 cmovcq %rbp,%r11 #ifdef __SGX_LVI_HARDENING__ popq %rax lfence jmpq *%rax ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .size __lshift_mod_256,.-__lshift_mod_256 .globl lshift_mod_256 .hidden lshift_mod_256 .type lshift_mod_256,@function .align 32 lshift_mod_256: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 pushq %rbx .cfi_adjust_cfa_offset 8 .cfi_offset %rbx,-24 pushq %r12 .cfi_adjust_cfa_offset 8 .cfi_offset %r12,-32 #ifdef __SGX_LVI_HARDENING__ lfence #endif movq 0(%rsi),%r8 movq 8(%rsi),%r9 movq 16(%rsi),%r10 movq 24(%rsi),%r11 .Loop_lshift_mod_256: call __lshift_mod_256 decl %edx jnz .Loop_lshift_mod_256 movq %r8,0(%rdi) movq %r9,8(%rdi) movq %r10,16(%rdi) movq %r11,24(%rdi) movq 0(%rsp),%r12 .cfi_restore %r12 movq 8(%rsp),%rbx .cfi_restore %rbx movq 16(%rsp),%rbp .cfi_restore %rbp leaq 24(%rsp),%rsp .cfi_adjust_cfa_offset -24 #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .size lshift_mod_256,.-lshift_mod_256 .globl rshift_mod_256 .hidden rshift_mod_256 .type rshift_mod_256,@function .align 32 rshift_mod_256: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 pushq %rbx .cfi_adjust_cfa_offset 8 .cfi_offset %rbx,-24 subq $8,%rsp .cfi_adjust_cfa_offset 8 #ifdef __SGX_LVI_HARDENING__ lfence #endif movq 0(%rsi),%rbp movq 8(%rsi),%r9 movq 16(%rsi),%r10 movq 24(%rsi),%r11 .Loop_rshift_mod_256: movq %rbp,%r8 andq $1,%rbp movq 0(%rcx),%rax negq %rbp movq 8(%rcx),%rsi movq 16(%rcx),%rbx andq %rbp,%rax andq %rbp,%rsi andq %rbp,%rbx andq 24(%rcx),%rbp addq %rax,%r8 adcq %rsi,%r9 adcq %rbx,%r10 adcq %rbp,%r11 sbbq %rax,%rax shrq $1,%r8 movq %r9,%rbp shrq $1,%r9 movq %r10,%rbx shrq $1,%r10 movq %r11,%rsi shrq $1,%r11 shlq $63,%rbp shlq $63,%rbx orq %r8,%rbp shlq $63,%rsi orq %rbx,%r9 shlq $63,%rax orq %rsi,%r10 orq %rax,%r11 decl %edx jnz .Loop_rshift_mod_256 movq %rbp,0(%rdi) movq %r9,8(%rdi) movq %r10,16(%rdi) movq %r11,24(%rdi) movq 8(%rsp),%rbx .cfi_restore %rbx movq 16(%rsp),%rbp .cfi_restore %rbp leaq 24(%rsp),%rsp .cfi_adjust_cfa_offset -24 #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .size rshift_mod_256,.-rshift_mod_256 .globl cneg_mod_256 .hidden cneg_mod_256 .type cneg_mod_256,@function .align 32 cneg_mod_256: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 pushq %rbx .cfi_adjust_cfa_offset 8 .cfi_offset %rbx,-24 pushq %r12 .cfi_adjust_cfa_offset 8 .cfi_offset %r12,-32 #ifdef __SGX_LVI_HARDENING__ lfence #endif movq 0(%rsi),%r12 movq 8(%rsi),%r9 movq 16(%rsi),%r10 movq %r12,%r8 movq 24(%rsi),%r11 orq %r9,%r12 orq %r10,%r12 orq %r11,%r12 movq $-1,%rbp movq 0(%rcx),%rax cmovnzq %rbp,%r12 movq 8(%rcx),%rsi movq 16(%rcx),%rbx andq %r12,%rax movq 24(%rcx),%rbp andq %r12,%rsi andq %r12,%rbx andq %r12,%rbp subq %r8,%rax sbbq %r9,%rsi sbbq %r10,%rbx sbbq %r11,%rbp orq %rdx,%rdx cmovzq %r8,%rax cmovzq %r9,%rsi movq %rax,0(%rdi) cmovzq %r10,%rbx movq %rsi,8(%rdi) cmovzq %r11,%rbp movq %rbx,16(%rdi) movq %rbp,24(%rdi) movq 0(%rsp),%r12 
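/*
 * Added note: cneg_mod_256(ret, a, flag, p) above builds p - a behind
 * an all-ones mask that is set only when a != 0 (so -0 stays 0), then
 * cmovz keeps a itself when flag is zero: a constant-time conditional
 * negation mod p. rshift_mod_256 higher up halves mod p count times,
 * adding p before each shift whenever the low bit is set so that the
 * division by two stays exact.
 */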
.cfi_restore %r12 movq 8(%rsp),%rbx .cfi_restore %rbx movq 16(%rsp),%rbp .cfi_restore %rbp leaq 24(%rsp),%rsp .cfi_adjust_cfa_offset -24 #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .size cneg_mod_256,.-cneg_mod_256 .globl sub_mod_256 .hidden sub_mod_256 .type sub_mod_256,@function .align 32 sub_mod_256: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 pushq %rbx .cfi_adjust_cfa_offset 8 .cfi_offset %rbx,-24 subq $8,%rsp .cfi_adjust_cfa_offset 8 #ifdef __SGX_LVI_HARDENING__ lfence #endif movq 0(%rsi),%r8 movq 8(%rsi),%r9 movq 16(%rsi),%r10 movq 24(%rsi),%r11 subq 0(%rdx),%r8 movq 0(%rcx),%rax sbbq 8(%rdx),%r9 movq 8(%rcx),%rsi sbbq 16(%rdx),%r10 movq 16(%rcx),%rbx sbbq 24(%rdx),%r11 movq 24(%rcx),%rbp sbbq %rdx,%rdx andq %rdx,%rax andq %rdx,%rsi andq %rdx,%rbx andq %rdx,%rbp addq %rax,%r8 adcq %rsi,%r9 movq %r8,0(%rdi) adcq %rbx,%r10 movq %r9,8(%rdi) adcq %rbp,%r11 movq %r10,16(%rdi) movq %r11,24(%rdi) movq 8(%rsp),%rbx .cfi_restore %rbx movq 16(%rsp),%rbp .cfi_restore %rbp leaq 24(%rsp),%rsp .cfi_adjust_cfa_offset -24 #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .size sub_mod_256,.-sub_mod_256 .globl check_mod_256 .hidden check_mod_256 .type check_mod_256,@function .align 32 check_mod_256: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa #ifdef __SGX_LVI_HARDENING__ lfence #endif movq 0(%rdi),%rax movq 8(%rdi),%r9 movq 16(%rdi),%r10 movq 24(%rdi),%r11 movq %rax,%r8 orq %r9,%rax orq %r10,%rax orq %r11,%rax subq 0(%rsi),%r8 sbbq 8(%rsi),%r9 sbbq 16(%rsi),%r10 sbbq 24(%rsi),%r11 sbbq %rsi,%rsi movq $1,%rdx cmpq $0,%rax cmovneq %rdx,%rax andq %rsi,%rax #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .size check_mod_256,.-check_mod_256 .globl add_n_check_mod_256 .hidden add_n_check_mod_256 .type add_n_check_mod_256,@function .align 32 add_n_check_mod_256: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 pushq %rbx .cfi_adjust_cfa_offset 8 .cfi_offset %rbx,-24 subq $8,%rsp .cfi_adjust_cfa_offset 8 #ifdef __SGX_LVI_HARDENING__ lfence #endif movq 0(%rsi),%r8 movq 8(%rsi),%r9 movq 16(%rsi),%r10 movq 24(%rsi),%r11 addq 0(%rdx),%r8 adcq 8(%rdx),%r9 movq %r8,%rax adcq 16(%rdx),%r10 movq %r9,%rsi adcq 24(%rdx),%r11 sbbq %rdx,%rdx movq %r10,%rbx subq 0(%rcx),%r8 sbbq 8(%rcx),%r9 sbbq 16(%rcx),%r10 movq %r11,%rbp sbbq 24(%rcx),%r11 sbbq $0,%rdx cmovcq %rax,%r8 cmovcq %rsi,%r9 movq %r8,0(%rdi) cmovcq %rbx,%r10 movq %r9,8(%rdi) cmovcq %rbp,%r11 movq %r10,16(%rdi) movq %r11,24(%rdi) orq %r9,%r8 orq %r11,%r10 orq %r10,%r8 movq $1,%rax cmovzq %r8,%rax movq 8(%rsp),%rbx .cfi_restore %rbx movq 16(%rsp),%rbp .cfi_restore %rbp leaq 24(%rsp),%rsp .cfi_adjust_cfa_offset -24 #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .size add_n_check_mod_256,.-add_n_check_mod_256 .globl sub_n_check_mod_256 .hidden sub_n_check_mod_256 .type sub_n_check_mod_256,@function .align 32 sub_n_check_mod_256: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 pushq %rbx .cfi_adjust_cfa_offset 8 .cfi_offset %rbx,-24 subq $8,%rsp .cfi_adjust_cfa_offset 8 #ifdef __SGX_LVI_HARDENING__ lfence #endif movq 0(%rsi),%r8 movq 8(%rsi),%r9 movq 16(%rsi),%r10 movq 24(%rsi),%r11 subq 0(%rdx),%r8 movq 0(%rcx),%rax sbbq 8(%rdx),%r9 movq 8(%rcx),%rsi sbbq 16(%rdx),%r10 movq 16(%rcx),%rbx sbbq 
24(%rdx),%r11 movq 24(%rcx),%rbp sbbq %rdx,%rdx andq %rdx,%rax andq %rdx,%rsi andq %rdx,%rbx andq %rdx,%rbp addq %rax,%r8 adcq %rsi,%r9 movq %r8,0(%rdi) adcq %rbx,%r10 movq %r9,8(%rdi) adcq %rbp,%r11 movq %r10,16(%rdi) movq %r11,24(%rdi) orq %r9,%r8 orq %r11,%r10 orq %r10,%r8 movq $1,%rax cmovzq %r8,%rax movq 8(%rsp),%rbx .cfi_restore %rbx movq 16(%rsp),%rbp .cfi_restore %rbp leaq 24(%rsp),%rsp .cfi_adjust_cfa_offset -24 #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .size sub_n_check_mod_256,.-sub_n_check_mod_256 .section .note.GNU-stack,"",@progbits #ifndef __SGX_LVI_HARDENING__ .section .note.gnu.property,"a",@note .long 4,2f-1f,5 .byte 0x47,0x4E,0x55,0 1: .long 0xc0000002,4,3 .align 8 2: #endif ================================================ FILE: build/elf/add_mod_384-armv8.S ================================================ #if defined(__ARM_FEATURE_PAC_DEFAULT) && __ARM_FEATURE_PAC_DEFAULT==2 # define PACI_HINT 27 # define AUTI_HINT 31 #else # define PACI_HINT 25 # define AUTI_HINT 29 #endif .text .globl add_mod_384 .hidden add_mod_384 .type add_mod_384,%function .align 5 add_mod_384: hint #PACI_HINT stp x29,x30,[sp,#-6*__SIZEOF_POINTER__]! add x29,sp,#0 stp x19,x20,[sp,#2*__SIZEOF_POINTER__] stp x21,x22,[sp,#4*__SIZEOF_POINTER__] ldp x4,x5,[x3] ldp x6,x7,[x3,#16] ldp x8,x9,[x3,#32] bl __add_mod_384 ldr x30,[sp,#__SIZEOF_POINTER__] stp x10,x11,[x0] stp x12,x13,[x0,#16] stp x14,x15,[x0,#32] ldp x19,x20,[x29,#2*__SIZEOF_POINTER__] ldp x21,x22,[x29,#4*__SIZEOF_POINTER__] ldr x29,[sp],#6*__SIZEOF_POINTER__ hint #AUTI_HINT ret .size add_mod_384,.-add_mod_384 .type __add_mod_384,%function .align 5 __add_mod_384: ldp x10,x11,[x1] ldp x16,x17,[x2] ldp x12,x13,[x1,#16] ldp x19,x20,[x2,#16] ldp x14,x15,[x1,#32] ldp x21,x22,[x2,#32] __add_mod_384_ab_are_loaded: adds x10,x10,x16 adcs x11,x11,x17 adcs x12,x12,x19 adcs x13,x13,x20 adcs x14,x14,x21 adcs x15,x15,x22 adc x3,xzr,xzr subs x16,x10,x4 sbcs x17,x11,x5 sbcs x19,x12,x6 sbcs x20,x13,x7 sbcs x21,x14,x8 sbcs x22,x15,x9 sbcs xzr,x3,xzr csel x10,x10,x16,lo csel x11,x11,x17,lo csel x12,x12,x19,lo csel x13,x13,x20,lo csel x14,x14,x21,lo csel x15,x15,x22,lo ret .size __add_mod_384,.-__add_mod_384 .globl add_mod_384x .hidden add_mod_384x .type add_mod_384x,%function .align 5 add_mod_384x: hint #PACI_HINT stp x29,x30,[sp,#-6*__SIZEOF_POINTER__]! add x29,sp,#0 stp x19,x20,[sp,#2*__SIZEOF_POINTER__] stp x21,x22,[sp,#4*__SIZEOF_POINTER__] ldp x4,x5,[x3] ldp x6,x7,[x3,#16] ldp x8,x9,[x3,#32] bl __add_mod_384 stp x10,x11,[x0] add x1,x1,#48 stp x12,x13,[x0,#16] add x2,x2,#48 stp x14,x15,[x0,#32] bl __add_mod_384 ldr x30,[sp,#__SIZEOF_POINTER__] stp x10,x11,[x0,#48] stp x12,x13,[x0,#64] stp x14,x15,[x0,#80] ldp x19,x20,[x29,#2*__SIZEOF_POINTER__] ldp x21,x22,[x29,#4*__SIZEOF_POINTER__] ldr x29,[sp],#6*__SIZEOF_POINTER__ hint #AUTI_HINT ret .size add_mod_384x,.-add_mod_384x .globl rshift_mod_384 .hidden rshift_mod_384 .type rshift_mod_384,%function .align 5 rshift_mod_384: hint #PACI_HINT stp x29,x30,[sp,#-6*__SIZEOF_POINTER__]! 
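// rshift_mod_384 (this function) performs x2 modular halvings via the
// __rshift_mod_384 helper further below; div_by_2_mod_384 is a single
// iteration. A hedged C model (names are illustrative; the asm builds
// the odd-mask branchlessly with sbfx instead of testing bit 0):
//
//   #include <stdint.h>
//   typedef uint64_t vec384[6];
//
//   static void rshift_mod_384_ref(vec384 a, uint64_t count,
//                                  const vec384 p)
//   {
//       while (count--) {
//           /* p is odd, so exactly one of a, a+p is even */
//           uint64_t mask = (uint64_t)0 - (a[0] & 1);
//           unsigned __int128 acc = 0;
//           for (int i = 0; i < 6; i++) {
//               acc += (unsigned __int128)a[i] + (p[i] & mask);
//               a[i] = (uint64_t)acc;  acc >>= 64;
//           }
//           for (int i = 0; i < 5; i++)     /* extr-style funnel shift */
//               a[i] = (a[i] >> 1) | (a[i + 1] << 63);
//           a[5] = (a[5] >> 1) | ((uint64_t)acc << 63);  /* 385th bit */
//       }
//   }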
add x29,sp,#0 stp x19,x20,[sp,#2*__SIZEOF_POINTER__] stp x21,x22,[sp,#4*__SIZEOF_POINTER__] ldp x10,x11,[x1] ldp x12,x13,[x1,#16] ldp x14,x15,[x1,#32] ldp x4,x5,[x3] ldp x6,x7,[x3,#16] ldp x8,x9,[x3,#32] .Loop_rshift_mod_384: sub x2,x2,#1 bl __rshift_mod_384 cbnz x2,.Loop_rshift_mod_384 ldr x30,[sp,#__SIZEOF_POINTER__] stp x10,x11,[x0] stp x12,x13,[x0,#16] stp x14,x15,[x0,#32] ldp x19,x20,[x29,#2*__SIZEOF_POINTER__] ldp x21,x22,[x29,#4*__SIZEOF_POINTER__] ldr x29,[sp],#6*__SIZEOF_POINTER__ hint #AUTI_HINT ret .size rshift_mod_384,.-rshift_mod_384 .type __rshift_mod_384,%function .align 5 __rshift_mod_384: sbfx x22,x10,#0,#1 and x16,x22,x4 and x17,x22,x5 adds x10,x10,x16 and x19,x22,x6 adcs x11,x11,x17 and x20,x22,x7 adcs x12,x12,x19 and x21,x22,x8 adcs x13,x13,x20 and x22,x22,x9 adcs x14,x14,x21 extr x10,x11,x10,#1 // a[0:5] >>= 1 adcs x15,x15,x22 extr x11,x12,x11,#1 adc x22,xzr,xzr extr x12,x13,x12,#1 extr x13,x14,x13,#1 extr x14,x15,x14,#1 extr x15,x22,x15,#1 ret .size __rshift_mod_384,.-__rshift_mod_384 .globl div_by_2_mod_384 .hidden div_by_2_mod_384 .type div_by_2_mod_384,%function .align 5 div_by_2_mod_384: hint #PACI_HINT stp x29,x30,[sp,#-6*__SIZEOF_POINTER__]! add x29,sp,#0 stp x19,x20,[sp,#2*__SIZEOF_POINTER__] stp x21,x22,[sp,#4*__SIZEOF_POINTER__] ldp x10,x11,[x1] ldp x12,x13,[x1,#16] ldp x14,x15,[x1,#32] ldp x4,x5,[x2] ldp x6,x7,[x2,#16] ldp x8,x9,[x2,#32] bl __rshift_mod_384 ldr x30,[sp,#__SIZEOF_POINTER__] stp x10,x11,[x0] stp x12,x13,[x0,#16] stp x14,x15,[x0,#32] ldp x19,x20,[x29,#2*__SIZEOF_POINTER__] ldp x21,x22,[x29,#4*__SIZEOF_POINTER__] ldr x29,[sp],#6*__SIZEOF_POINTER__ hint #AUTI_HINT ret .size div_by_2_mod_384,.-div_by_2_mod_384 .globl lshift_mod_384 .hidden lshift_mod_384 .type lshift_mod_384,%function .align 5 lshift_mod_384: hint #PACI_HINT stp x29,x30,[sp,#-6*__SIZEOF_POINTER__]! add x29,sp,#0 stp x19,x20,[sp,#2*__SIZEOF_POINTER__] stp x21,x22,[sp,#4*__SIZEOF_POINTER__] ldp x10,x11,[x1] ldp x12,x13,[x1,#16] ldp x14,x15,[x1,#32] ldp x4,x5,[x3] ldp x6,x7,[x3,#16] ldp x8,x9,[x3,#32] .Loop_lshift_mod_384: sub x2,x2,#1 bl __lshift_mod_384 cbnz x2,.Loop_lshift_mod_384 ldr x30,[sp,#__SIZEOF_POINTER__] stp x10,x11,[x0] stp x12,x13,[x0,#16] stp x14,x15,[x0,#32] ldp x19,x20,[x29,#2*__SIZEOF_POINTER__] ldp x21,x22,[x29,#4*__SIZEOF_POINTER__] ldr x29,[sp],#6*__SIZEOF_POINTER__ hint #AUTI_HINT ret .size lshift_mod_384,.-lshift_mod_384 .type __lshift_mod_384,%function .align 5 __lshift_mod_384: adds x10,x10,x10 adcs x11,x11,x11 adcs x12,x12,x12 adcs x13,x13,x13 adcs x14,x14,x14 adcs x15,x15,x15 adc x3,xzr,xzr subs x16,x10,x4 sbcs x17,x11,x5 sbcs x19,x12,x6 sbcs x20,x13,x7 sbcs x21,x14,x8 sbcs x22,x15,x9 sbcs xzr,x3,xzr csel x10,x10,x16,lo csel x11,x11,x17,lo csel x12,x12,x19,lo csel x13,x13,x20,lo csel x14,x14,x21,lo csel x15,x15,x22,lo ret .size __lshift_mod_384,.-__lshift_mod_384 .globl mul_by_3_mod_384 .hidden mul_by_3_mod_384 .type mul_by_3_mod_384,%function .align 5 mul_by_3_mod_384: hint #PACI_HINT stp x29,x30,[sp,#-6*__SIZEOF_POINTER__]! 
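// mul_by_3_mod_384 (this function) is a modular doubling followed by a
// modular addition of the reloaded operand, i.e. 3a = 2a + a (the
// bl __lshift_mod_384 / __add_mod_384_ab_are_loaded pair below). A
// hedged model in terms of this file's own exported entry point, as a
// sketch only:
//
//   void add_mod_384(vec384 ret, const vec384 a, const vec384 b,
//                    const vec384 p);
//
//   static void mul_by_3_mod_384_ref(vec384 ret, const vec384 a,
//                                    const vec384 p)
//   {
//       add_mod_384(ret, a, a, p);     /* 2a mod p */
//       add_mod_384(ret, ret, a, p);   /* 3a mod p */
//   }
//
// mul_by_8_mod_384 further below is, likewise, three back-to-back
// doublings, and the *_384x variants repeat the same computation on the
// imaginary half at byte offset 48.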
add x29,sp,#0 stp x19,x20,[sp,#2*__SIZEOF_POINTER__] stp x21,x22,[sp,#4*__SIZEOF_POINTER__] ldp x10,x11,[x1] ldp x12,x13,[x1,#16] ldp x14,x15,[x1,#32] ldp x4,x5,[x2] ldp x6,x7,[x2,#16] ldp x8,x9,[x2,#32] bl __lshift_mod_384 ldp x16,x17,[x1] ldp x19,x20,[x1,#16] ldp x21,x22,[x1,#32] bl __add_mod_384_ab_are_loaded ldr x30,[sp,#__SIZEOF_POINTER__] stp x10,x11,[x0] stp x12,x13,[x0,#16] stp x14,x15,[x0,#32] ldp x19,x20,[x29,#2*__SIZEOF_POINTER__] ldp x21,x22,[x29,#4*__SIZEOF_POINTER__] ldr x29,[sp],#6*__SIZEOF_POINTER__ hint #AUTI_HINT ret .size mul_by_3_mod_384,.-mul_by_3_mod_384 .globl mul_by_8_mod_384 .hidden mul_by_8_mod_384 .type mul_by_8_mod_384,%function .align 5 mul_by_8_mod_384: hint #PACI_HINT stp x29,x30,[sp,#-6*__SIZEOF_POINTER__]! add x29,sp,#0 stp x19,x20,[sp,#2*__SIZEOF_POINTER__] stp x21,x22,[sp,#4*__SIZEOF_POINTER__] ldp x10,x11,[x1] ldp x12,x13,[x1,#16] ldp x14,x15,[x1,#32] ldp x4,x5,[x2] ldp x6,x7,[x2,#16] ldp x8,x9,[x2,#32] bl __lshift_mod_384 bl __lshift_mod_384 bl __lshift_mod_384 ldr x30,[sp,#__SIZEOF_POINTER__] stp x10,x11,[x0] stp x12,x13,[x0,#16] stp x14,x15,[x0,#32] ldp x19,x20,[x29,#2*__SIZEOF_POINTER__] ldp x21,x22,[x29,#4*__SIZEOF_POINTER__] ldr x29,[sp],#6*__SIZEOF_POINTER__ hint #AUTI_HINT ret .size mul_by_8_mod_384,.-mul_by_8_mod_384 .globl mul_by_3_mod_384x .hidden mul_by_3_mod_384x .type mul_by_3_mod_384x,%function .align 5 mul_by_3_mod_384x: hint #PACI_HINT stp x29,x30,[sp,#-6*__SIZEOF_POINTER__]! add x29,sp,#0 stp x19,x20,[sp,#2*__SIZEOF_POINTER__] stp x21,x22,[sp,#4*__SIZEOF_POINTER__] ldp x10,x11,[x1] ldp x12,x13,[x1,#16] ldp x14,x15,[x1,#32] ldp x4,x5,[x2] ldp x6,x7,[x2,#16] ldp x8,x9,[x2,#32] bl __lshift_mod_384 ldp x16,x17,[x1] ldp x19,x20,[x1,#16] ldp x21,x22,[x1,#32] bl __add_mod_384_ab_are_loaded stp x10,x11,[x0] ldp x10,x11,[x1,#48] stp x12,x13,[x0,#16] ldp x12,x13,[x1,#64] stp x14,x15,[x0,#32] ldp x14,x15,[x1,#80] bl __lshift_mod_384 ldp x16,x17,[x1,#48] ldp x19,x20,[x1,#64] ldp x21,x22,[x1,#80] bl __add_mod_384_ab_are_loaded ldr x30,[sp,#__SIZEOF_POINTER__] stp x10,x11,[x0,#48] stp x12,x13,[x0,#64] stp x14,x15,[x0,#80] ldp x19,x20,[x29,#2*__SIZEOF_POINTER__] ldp x21,x22,[x29,#4*__SIZEOF_POINTER__] ldr x29,[sp],#6*__SIZEOF_POINTER__ hint #AUTI_HINT ret .size mul_by_3_mod_384x,.-mul_by_3_mod_384x .globl mul_by_8_mod_384x .hidden mul_by_8_mod_384x .type mul_by_8_mod_384x,%function .align 5 mul_by_8_mod_384x: hint #PACI_HINT stp x29,x30,[sp,#-6*__SIZEOF_POINTER__]! add x29,sp,#0 stp x19,x20,[sp,#2*__SIZEOF_POINTER__] stp x21,x22,[sp,#4*__SIZEOF_POINTER__] ldp x10,x11,[x1] ldp x12,x13,[x1,#16] ldp x14,x15,[x1,#32] ldp x4,x5,[x2] ldp x6,x7,[x2,#16] ldp x8,x9,[x2,#32] bl __lshift_mod_384 bl __lshift_mod_384 bl __lshift_mod_384 stp x10,x11,[x0] ldp x10,x11,[x1,#48] stp x12,x13,[x0,#16] ldp x12,x13,[x1,#64] stp x14,x15,[x0,#32] ldp x14,x15,[x1,#80] bl __lshift_mod_384 bl __lshift_mod_384 bl __lshift_mod_384 ldr x30,[sp,#__SIZEOF_POINTER__] stp x10,x11,[x0,#48] stp x12,x13,[x0,#64] stp x14,x15,[x0,#80] ldp x19,x20,[x29,#2*__SIZEOF_POINTER__] ldp x21,x22,[x29,#4*__SIZEOF_POINTER__] ldr x29,[sp],#6*__SIZEOF_POINTER__ hint #AUTI_HINT ret .size mul_by_8_mod_384x,.-mul_by_8_mod_384x .globl cneg_mod_384 .hidden cneg_mod_384 .type cneg_mod_384,%function .align 5 cneg_mod_384: hint #PACI_HINT stp x29,x30,[sp,#-6*__SIZEOF_POINTER__]! 
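// cneg_mod_384 (this function) is the 384-bit counterpart of the
// conditional negation sketched earlier in add_mod_256-x86_64.s:
// t = p - a is computed unconditionally, csetm turns "a != 0" into an
// all-ones mask (so that -0 stays 0), and `ands x2,x2,x3` folds the
// caller's flag into that mask so the csel..eq chain keeps a when the
// product is zero. A hedged one-line model:
//
//   /* ret = (flag && a != 0) ? p - a : a,  i.e. flag ? -a mod p : a, */
//   /* selected branch-free so timing is independent of a and flag.  */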
add x29,sp,#0 stp x19,x20,[sp,#2*__SIZEOF_POINTER__] stp x21,x22,[sp,#4*__SIZEOF_POINTER__] ldp x10,x11,[x1] ldp x4,x5,[x3] ldp x12,x13,[x1,#16] ldp x6,x7,[x3,#16] subs x16,x4,x10 ldp x14,x15,[x1,#32] ldp x8,x9,[x3,#32] orr x3,x10,x11 sbcs x17,x5,x11 orr x3,x3,x12 sbcs x19,x6,x12 orr x3,x3,x13 sbcs x20,x7,x13 orr x3,x3,x14 sbcs x21,x8,x14 orr x3,x3,x15 sbc x22,x9,x15 cmp x3,#0 csetm x3,ne ands x2,x2,x3 csel x10,x10,x16,eq csel x11,x11,x17,eq csel x12,x12,x19,eq csel x13,x13,x20,eq stp x10,x11,[x0] csel x14,x14,x21,eq stp x12,x13,[x0,#16] csel x15,x15,x22,eq stp x14,x15,[x0,#32] ldp x19,x20,[x29,#2*__SIZEOF_POINTER__] ldp x21,x22,[x29,#4*__SIZEOF_POINTER__] ldr x29,[sp],#6*__SIZEOF_POINTER__ hint #AUTI_HINT ret .size cneg_mod_384,.-cneg_mod_384 .globl sub_mod_384 .hidden sub_mod_384 .type sub_mod_384,%function .align 5 sub_mod_384: hint #PACI_HINT stp x29,x30,[sp,#-6*__SIZEOF_POINTER__]! add x29,sp,#0 stp x19,x20,[sp,#2*__SIZEOF_POINTER__] stp x21,x22,[sp,#4*__SIZEOF_POINTER__] ldp x4,x5,[x3] ldp x6,x7,[x3,#16] ldp x8,x9,[x3,#32] bl __sub_mod_384 ldr x30,[sp,#__SIZEOF_POINTER__] stp x10,x11,[x0] stp x12,x13,[x0,#16] stp x14,x15,[x0,#32] ldp x19,x20,[x29,#2*__SIZEOF_POINTER__] ldp x21,x22,[x29,#4*__SIZEOF_POINTER__] ldr x29,[sp],#6*__SIZEOF_POINTER__ hint #AUTI_HINT ret .size sub_mod_384,.-sub_mod_384 .type __sub_mod_384,%function .align 5 __sub_mod_384: ldp x10,x11,[x1] ldp x16,x17,[x2] ldp x12,x13,[x1,#16] ldp x19,x20,[x2,#16] ldp x14,x15,[x1,#32] ldp x21,x22,[x2,#32] subs x10,x10,x16 sbcs x11,x11,x17 sbcs x12,x12,x19 sbcs x13,x13,x20 sbcs x14,x14,x21 sbcs x15,x15,x22 sbc x3,xzr,xzr and x16,x4,x3 and x17,x5,x3 adds x10,x10,x16 and x19,x6,x3 adcs x11,x11,x17 and x20,x7,x3 adcs x12,x12,x19 and x21,x8,x3 adcs x13,x13,x20 and x22,x9,x3 adcs x14,x14,x21 adc x15,x15,x22 ret .size __sub_mod_384,.-__sub_mod_384 .globl sub_mod_384x .hidden sub_mod_384x .type sub_mod_384x,%function .align 5 sub_mod_384x: hint #PACI_HINT stp x29,x30,[sp,#-6*__SIZEOF_POINTER__]! add x29,sp,#0 stp x19,x20,[sp,#2*__SIZEOF_POINTER__] stp x21,x22,[sp,#4*__SIZEOF_POINTER__] ldp x4,x5,[x3] ldp x6,x7,[x3,#16] ldp x8,x9,[x3,#32] bl __sub_mod_384 stp x10,x11,[x0] add x1,x1,#48 stp x12,x13,[x0,#16] add x2,x2,#48 stp x14,x15,[x0,#32] bl __sub_mod_384 ldr x30,[sp,#__SIZEOF_POINTER__] stp x10,x11,[x0,#48] stp x12,x13,[x0,#64] stp x14,x15,[x0,#80] ldp x19,x20,[x29,#2*__SIZEOF_POINTER__] ldp x21,x22,[x29,#4*__SIZEOF_POINTER__] ldr x29,[sp],#6*__SIZEOF_POINTER__ hint #AUTI_HINT ret .size sub_mod_384x,.-sub_mod_384x .globl mul_by_1_plus_i_mod_384x .hidden mul_by_1_plus_i_mod_384x .type mul_by_1_plus_i_mod_384x,%function .align 5 mul_by_1_plus_i_mod_384x: hint #PACI_HINT stp x29,x30,[sp,#-6*__SIZEOF_POINTER__]! 
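// mul_by_1_plus_i_mod_384x (this function) works in Fp2 = Fp[i]/(i^2+1),
// where (x + y*i)*(1 + i) = (x - y) + (x + y)*i; hence one subtraction
// for the new real half and one addition for the new imaginary half, as
// the two comments in the body below note. A hedged C model in terms of
// this file's exported entries (the struct name is illustrative):
//
//   #include <string.h>
//   typedef struct { vec384 re, im; } vec384x;
//
//   void add_mod_384(vec384 ret, const vec384 a, const vec384 b,
//                    const vec384 p);
//   void sub_mod_384(vec384 ret, const vec384 a, const vec384 b,
//                    const vec384 p);
//
//   static void mul_by_1_plus_i_ref(vec384x *ret, const vec384x *a,
//                                   const vec384 p)
//   {
//       vec384 t;                             /* buffer the real half */
//       sub_mod_384(t, a->re, a->im, p);      /* re' = re - im mod p  */
//       add_mod_384(ret->im, a->re, a->im, p);/* im' = re + im mod p  */
//       memcpy(ret->re, t, sizeof(t));        /* so ret may alias a   */
//   }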
add x29,sp,#0 stp x19,x20,[sp,#2*__SIZEOF_POINTER__] stp x21,x22,[sp,#4*__SIZEOF_POINTER__] ldp x4,x5,[x2] ldp x6,x7,[x2,#16] ldp x8,x9,[x2,#32] add x2,x1,#48 bl __sub_mod_384 // a->re - a->im ldp x16,x17,[x1] ldp x19,x20,[x1,#16] ldp x21,x22,[x1,#32] stp x10,x11,[x0] ldp x10,x11,[x1,#48] stp x12,x13,[x0,#16] ldp x12,x13,[x1,#64] stp x14,x15,[x0,#32] ldp x14,x15,[x1,#80] bl __add_mod_384_ab_are_loaded // a->re + a->im ldr x30,[sp,#__SIZEOF_POINTER__] stp x10,x11,[x0,#48] stp x12,x13,[x0,#64] stp x14,x15,[x0,#80] ldp x19,x20,[x29,#2*__SIZEOF_POINTER__] ldp x21,x22,[x29,#4*__SIZEOF_POINTER__] ldr x29,[sp],#6*__SIZEOF_POINTER__ hint #AUTI_HINT ret .size mul_by_1_plus_i_mod_384x,.-mul_by_1_plus_i_mod_384x .globl sgn0_pty_mod_384 .hidden sgn0_pty_mod_384 .type sgn0_pty_mod_384,%function .align 5 sgn0_pty_mod_384: hint #34 ldp x10,x11,[x0] ldp x12,x13,[x0,#16] ldp x14,x15,[x0,#32] ldp x4,x5,[x1] ldp x6,x7,[x1,#16] ldp x8,x9,[x1,#32] and x0,x10,#1 adds x10,x10,x10 adcs x11,x11,x11 adcs x12,x12,x12 adcs x13,x13,x13 adcs x14,x14,x14 adcs x15,x15,x15 adc x3,xzr,xzr subs x10,x10,x4 sbcs x11,x11,x5 sbcs x12,x12,x6 sbcs x13,x13,x7 sbcs x14,x14,x8 sbcs x15,x15,x9 sbc x3,x3,xzr mvn x3,x3 and x3,x3,#2 orr x0,x0,x3 ret .size sgn0_pty_mod_384,.-sgn0_pty_mod_384 .globl sgn0_pty_mod_384x .hidden sgn0_pty_mod_384x .type sgn0_pty_mod_384x,%function .align 5 sgn0_pty_mod_384x: hint #34 ldp x10,x11,[x0] ldp x12,x13,[x0,#16] ldp x14,x15,[x0,#32] ldp x4,x5,[x1] ldp x6,x7,[x1,#16] ldp x8,x9,[x1,#32] and x2,x10,#1 orr x3,x10,x11 adds x10,x10,x10 orr x3,x3,x12 adcs x11,x11,x11 orr x3,x3,x13 adcs x12,x12,x12 orr x3,x3,x14 adcs x13,x13,x13 orr x3,x3,x15 adcs x14,x14,x14 adcs x15,x15,x15 adc x16,xzr,xzr subs x10,x10,x4 sbcs x11,x11,x5 sbcs x12,x12,x6 sbcs x13,x13,x7 sbcs x14,x14,x8 sbcs x15,x15,x9 sbc x16,x16,xzr ldp x10,x11,[x0,#48] ldp x12,x13,[x0,#64] ldp x14,x15,[x0,#80] mvn x16,x16 and x16,x16,#2 orr x2,x2,x16 and x0,x10,#1 orr x1,x10,x11 adds x10,x10,x10 orr x1,x1,x12 adcs x11,x11,x11 orr x1,x1,x13 adcs x12,x12,x12 orr x1,x1,x14 adcs x13,x13,x13 orr x1,x1,x15 adcs x14,x14,x14 adcs x15,x15,x15 adc x16,xzr,xzr subs x10,x10,x4 sbcs x11,x11,x5 sbcs x12,x12,x6 sbcs x13,x13,x7 sbcs x14,x14,x8 sbcs x15,x15,x9 sbc x16,x16,xzr mvn x16,x16 and x16,x16,#2 orr x0,x0,x16 cmp x3,#0 csel x3,x0,x2,eq // a->re==0? prty(a->im) : prty(a->re) cmp x1,#0 csel x1,x0,x2,ne // a->im!=0? 
sgn0(a->im) : sgn0(a->re) and x3,x3,#1 and x1,x1,#2 orr x0,x1,x3 // pack sign and parity ret .size sgn0_pty_mod_384x,.-sgn0_pty_mod_384x .globl vec_select_32 .hidden vec_select_32 .type vec_select_32,%function .align 5 vec_select_32: hint #34 dup v6.2d, x3 ld1 {v0.2d, v1.2d}, [x1] cmeq v6.2d, v6.2d, #0 ld1 {v3.2d, v4.2d}, [x2] bit v0.16b, v3.16b, v6.16b bit v1.16b, v4.16b, v6.16b st1 {v0.2d, v1.2d}, [x0] ret .size vec_select_32,.-vec_select_32 .globl vec_select_48 .hidden vec_select_48 .type vec_select_48,%function .align 5 vec_select_48: hint #34 dup v6.2d, x3 ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 cmeq v6.2d, v6.2d, #0 ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 bit v0.16b, v3.16b, v6.16b bit v1.16b, v4.16b, v6.16b bit v2.16b, v5.16b, v6.16b st1 {v0.2d, v1.2d, v2.2d}, [x0] ret .size vec_select_48,.-vec_select_48 .globl vec_select_96 .hidden vec_select_96 .type vec_select_96,%function .align 5 vec_select_96: hint #34 dup v6.2d, x3 ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 cmeq v6.2d, v6.2d, #0 ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 bit v0.16b, v3.16b, v6.16b ld1 {v16.2d, v17.2d, v18.2d}, [x1],#48 bit v1.16b, v4.16b, v6.16b ld1 {v19.2d, v20.2d, v21.2d}, [x2],#48 bit v2.16b, v5.16b, v6.16b st1 {v0.2d, v1.2d, v2.2d}, [x0],#48 bit v16.16b, v19.16b, v6.16b bit v17.16b, v20.16b, v6.16b bit v18.16b, v21.16b, v6.16b st1 {v16.2d, v17.2d, v18.2d}, [x0] ret .size vec_select_96,.-vec_select_96 .globl vec_select_192 .hidden vec_select_192 .type vec_select_192,%function .align 5 vec_select_192: hint #34 dup v6.2d, x3 ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 cmeq v6.2d, v6.2d, #0 ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 bit v0.16b, v3.16b, v6.16b ld1 {v16.2d, v17.2d, v18.2d}, [x1],#48 bit v1.16b, v4.16b, v6.16b ld1 {v19.2d, v20.2d, v21.2d}, [x2],#48 bit v2.16b, v5.16b, v6.16b st1 {v0.2d, v1.2d, v2.2d}, [x0],#48 bit v16.16b, v19.16b, v6.16b ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 bit v17.16b, v20.16b, v6.16b ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 bit v18.16b, v21.16b, v6.16b st1 {v16.2d, v17.2d, v18.2d}, [x0],#48 bit v0.16b, v3.16b, v6.16b ld1 {v16.2d, v17.2d, v18.2d}, [x1],#48 bit v1.16b, v4.16b, v6.16b ld1 {v19.2d, v20.2d, v21.2d}, [x2],#48 bit v2.16b, v5.16b, v6.16b st1 {v0.2d, v1.2d, v2.2d}, [x0],#48 bit v16.16b, v19.16b, v6.16b bit v17.16b, v20.16b, v6.16b bit v18.16b, v21.16b, v6.16b st1 {v16.2d, v17.2d, v18.2d}, [x0] ret .size vec_select_192,.-vec_select_192 .globl vec_select_144 .hidden vec_select_144 .type vec_select_144,%function .align 5 vec_select_144: hint #34 dup v6.2d, x3 ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 cmeq v6.2d, v6.2d, #0 ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 bit v0.16b, v3.16b, v6.16b ld1 {v16.2d, v17.2d, v18.2d}, [x1],#48 bit v1.16b, v4.16b, v6.16b ld1 {v19.2d, v20.2d, v21.2d}, [x2],#48 bit v2.16b, v5.16b, v6.16b st1 {v0.2d, v1.2d, v2.2d}, [x0],#48 bit v16.16b, v19.16b, v6.16b ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 bit v17.16b, v20.16b, v6.16b ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 bit v18.16b, v21.16b, v6.16b st1 {v16.2d, v17.2d, v18.2d}, [x0],#48 bit v0.16b, v3.16b, v6.16b bit v1.16b, v4.16b, v6.16b bit v2.16b, v5.16b, v6.16b st1 {v0.2d, v1.2d, v2.2d}, [x0] ret .size vec_select_144,.-vec_select_144 .globl vec_select_288 .hidden vec_select_288 .type vec_select_288,%function .align 5 vec_select_288: hint #34 dup v6.2d, x3 ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 cmeq v6.2d, v6.2d, #0 ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 bit v0.16b, v3.16b, v6.16b ld1 {v16.2d, v17.2d, v18.2d}, [x1],#48 bit v1.16b, v4.16b, v6.16b ld1 {v19.2d, v20.2d, v21.2d}, [x2],#48 bit v2.16b, v5.16b, v6.16b st1 {v0.2d, v1.2d, v2.2d}, [x0],#48 bit v16.16b, 
v19.16b, v6.16b ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 bit v17.16b, v20.16b, v6.16b ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 bit v18.16b, v21.16b, v6.16b st1 {v16.2d, v17.2d, v18.2d}, [x0],#48 bit v0.16b, v3.16b, v6.16b ld1 {v16.2d, v17.2d, v18.2d}, [x1],#48 bit v1.16b, v4.16b, v6.16b ld1 {v19.2d, v20.2d, v21.2d}, [x2],#48 bit v2.16b, v5.16b, v6.16b st1 {v0.2d, v1.2d, v2.2d}, [x0],#48 bit v16.16b, v19.16b, v6.16b ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 bit v17.16b, v20.16b, v6.16b ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 bit v18.16b, v21.16b, v6.16b st1 {v16.2d, v17.2d, v18.2d}, [x0],#48 bit v0.16b, v3.16b, v6.16b ld1 {v16.2d, v17.2d, v18.2d}, [x1],#48 bit v1.16b, v4.16b, v6.16b ld1 {v19.2d, v20.2d, v21.2d}, [x2],#48 bit v2.16b, v5.16b, v6.16b st1 {v0.2d, v1.2d, v2.2d}, [x0],#48 bit v16.16b, v19.16b, v6.16b bit v17.16b, v20.16b, v6.16b bit v18.16b, v21.16b, v6.16b st1 {v16.2d, v17.2d, v18.2d}, [x0] ret .size vec_select_288,.-vec_select_288 .globl vec_prefetch .hidden vec_prefetch .type vec_prefetch,%function .align 5 vec_prefetch: hint #34 add x1, x1, x0 sub x1, x1, #1 mov x2, #64 prfm pldl1keep, [x0] add x0, x0, x2 cmp x0, x1 csel x0, x1, x0, hi csel x2, xzr, x2, hi prfm pldl1keep, [x0] add x0, x0, x2 cmp x0, x1 csel x0, x1, x0, hi csel x2, xzr, x2, hi prfm pldl1keep, [x0] add x0, x0, x2 cmp x0, x1 csel x0, x1, x0, hi csel x2, xzr, x2, hi prfm pldl1keep, [x0] add x0, x0, x2 cmp x0, x1 csel x0, x1, x0, hi csel x2, xzr, x2, hi prfm pldl1keep, [x0] add x0, x0, x2 cmp x0, x1 csel x0, x1, x0, hi csel x2, xzr, x2, hi prfm pldl1keep, [x0] add x0, x0, x2 cmp x0, x1 csel x0, x1, x0, hi prfm pldl1keep, [x0] ret .size vec_prefetch,.-vec_prefetch .globl vec_is_zero_16x .hidden vec_is_zero_16x .type vec_is_zero_16x,%function .align 5 vec_is_zero_16x: hint #34 ld1 {v0.2d}, [x0], #16 lsr x1, x1, #4 sub x1, x1, #1 cbz x1, .Loop_is_zero_done .Loop_is_zero: ld1 {v1.2d}, [x0], #16 orr v0.16b, v0.16b, v1.16b sub x1, x1, #1 cbnz x1, .Loop_is_zero .Loop_is_zero_done: dup v1.2d, v0.d[1] orr v0.16b, v0.16b, v1.16b umov x1, v0.d[0] mov x0, #1 cmp x1, #0 csel x0, x0, xzr, eq ret .size vec_is_zero_16x,.-vec_is_zero_16x .globl vec_is_equal_16x .hidden vec_is_equal_16x .type vec_is_equal_16x,%function .align 5 vec_is_equal_16x: hint #34 ld1 {v0.2d}, [x0], #16 ld1 {v1.2d}, [x1], #16 lsr x2, x2, #4 eor v0.16b, v0.16b, v1.16b .Loop_is_equal: sub x2, x2, #1 cbz x2, .Loop_is_equal_done ld1 {v1.2d}, [x0], #16 ld1 {v2.2d}, [x1], #16 eor v1.16b, v1.16b, v2.16b orr v0.16b, v0.16b, v1.16b b .Loop_is_equal nop .Loop_is_equal_done: dup v1.2d, v0.d[1] orr v0.16b, v0.16b, v1.16b umov x1, v0.d[0] mov x0, #1 cmp x1, #0 csel x0, x0, xzr, eq ret .size vec_is_equal_16x,.-vec_is_equal_16x #if defined(__ARM_FEATURE_BTI_DEFAULT) || defined(__ARM_FEATURE_PAC_DEFAULT) .section .note.GNU-stack,"",@progbits .section .note.gnu.property,"a",@note .long 4,2f-1f,5 .byte 0x47,0x4E,0x55,0 1: .long 0xc0000000,4,3 .align 3 2: #endif ================================================ FILE: build/elf/add_mod_384-x86_64.s ================================================ .text .globl add_mod_384 .hidden add_mod_384 .type add_mod_384,@function .align 32 add_mod_384: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 pushq %rbx .cfi_adjust_cfa_offset 8 .cfi_offset %rbx,-24 pushq %r12 .cfi_adjust_cfa_offset 8 .cfi_offset %r12,-32 pushq %r13 .cfi_adjust_cfa_offset 8 .cfi_offset %r13,-40 pushq %r14 .cfi_adjust_cfa_offset 8 .cfi_offset %r14,-48 pushq %r15 .cfi_adjust_cfa_offset 8 .cfi_offset %r15,-56 subq $8,%rsp 
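// __add_mod_384 below is this file's core reduction pattern: a 6-limb
// add with carry, then a trial subtraction of the modulus whose final
// borrow selects, branch-free, between the sum and the difference (the
// cmovcq chain). A hedged C model with illustrative names; the masked
// select mimics what the cmovs do in constant time:
//
//   #include <stdint.h>
//   typedef uint64_t vec384[6];
//
//   static void add_mod_384_ref(vec384 ret, const vec384 a,
//                               const vec384 b, const vec384 p)
//   {
//       vec384 sum, dif;
//       unsigned __int128 acc = 0, brw = 0;
//       for (int i = 0; i < 6; i++) {            /* sum = a + b       */
//           acc += (unsigned __int128)a[i] + b[i];
//           sum[i] = (uint64_t)acc;  acc >>= 64;
//       }
//       for (int i = 0; i < 6; i++) {            /* dif = sum - p     */
//           brw = (unsigned __int128)sum[i] - p[i] - (uint64_t)brw;
//           dif[i] = (uint64_t)brw;  brw = (brw >> 64) & 1;
//       }
//       /* borrow past the carry bit means a + b < p: keep the sum    */
//       uint64_t mask = (uint64_t)0 - (uint64_t)(brw > acc);
//       for (int i = 0; i < 6; i++)
//           ret[i] = (sum[i] & mask) | (dif[i] & ~mask);
//   }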
.cfi_adjust_cfa_offset 8 call __add_mod_384 movq 8(%rsp),%r15 .cfi_restore %r15 movq 16(%rsp),%r14 .cfi_restore %r14 movq 24(%rsp),%r13 .cfi_restore %r13 movq 32(%rsp),%r12 .cfi_restore %r12 movq 40(%rsp),%rbx .cfi_restore %rbx movq 48(%rsp),%rbp .cfi_restore %rbp leaq 56(%rsp),%rsp .cfi_adjust_cfa_offset -56 #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .size add_mod_384,.-add_mod_384 .type __add_mod_384,@function .align 32 __add_mod_384: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa #ifdef __SGX_LVI_HARDENING__ lfence #endif movq 0(%rsi),%r8 movq 8(%rsi),%r9 movq 16(%rsi),%r10 movq 24(%rsi),%r11 movq 32(%rsi),%r12 movq 40(%rsi),%r13 __add_mod_384_a_is_loaded: addq 0(%rdx),%r8 adcq 8(%rdx),%r9 adcq 16(%rdx),%r10 movq %r8,%r14 adcq 24(%rdx),%r11 movq %r9,%r15 adcq 32(%rdx),%r12 movq %r10,%rax adcq 40(%rdx),%r13 movq %r11,%rbx sbbq %rdx,%rdx subq 0(%rcx),%r8 sbbq 8(%rcx),%r9 movq %r12,%rbp sbbq 16(%rcx),%r10 sbbq 24(%rcx),%r11 sbbq 32(%rcx),%r12 movq %r13,%rsi sbbq 40(%rcx),%r13 sbbq $0,%rdx cmovcq %r14,%r8 cmovcq %r15,%r9 cmovcq %rax,%r10 movq %r8,0(%rdi) cmovcq %rbx,%r11 movq %r9,8(%rdi) cmovcq %rbp,%r12 movq %r10,16(%rdi) cmovcq %rsi,%r13 movq %r11,24(%rdi) movq %r12,32(%rdi) movq %r13,40(%rdi) #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .size __add_mod_384,.-__add_mod_384 .globl add_mod_384x .hidden add_mod_384x .type add_mod_384x,@function .align 32 add_mod_384x: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 pushq %rbx .cfi_adjust_cfa_offset 8 .cfi_offset %rbx,-24 pushq %r12 .cfi_adjust_cfa_offset 8 .cfi_offset %r12,-32 pushq %r13 .cfi_adjust_cfa_offset 8 .cfi_offset %r13,-40 pushq %r14 .cfi_adjust_cfa_offset 8 .cfi_offset %r14,-48 pushq %r15 .cfi_adjust_cfa_offset 8 .cfi_offset %r15,-56 subq $24,%rsp .cfi_adjust_cfa_offset 24 movq %rsi,0(%rsp) movq %rdx,8(%rsp) leaq 48(%rsi),%rsi leaq 48(%rdx),%rdx leaq 48(%rdi),%rdi call __add_mod_384 movq 0(%rsp),%rsi movq 8(%rsp),%rdx leaq -48(%rdi),%rdi call __add_mod_384 movq 24+0(%rsp),%r15 .cfi_restore %r15 movq 24+8(%rsp),%r14 .cfi_restore %r14 movq 24+16(%rsp),%r13 .cfi_restore %r13 movq 24+24(%rsp),%r12 .cfi_restore %r12 movq 24+32(%rsp),%rbx .cfi_restore %rbx movq 24+40(%rsp),%rbp .cfi_restore %rbp leaq 24+48(%rsp),%rsp .cfi_adjust_cfa_offset -24-8*6 #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .size add_mod_384x,.-add_mod_384x .globl rshift_mod_384 .hidden rshift_mod_384 .type rshift_mod_384,@function .align 32 rshift_mod_384: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 pushq %rbx .cfi_adjust_cfa_offset 8 .cfi_offset %rbx,-24 pushq %r12 .cfi_adjust_cfa_offset 8 .cfi_offset %r12,-32 pushq %r13 .cfi_adjust_cfa_offset 8 .cfi_offset %r13,-40 pushq %r14 .cfi_adjust_cfa_offset 8 .cfi_offset %r14,-48 pushq %r15 .cfi_adjust_cfa_offset 8 .cfi_offset %r15,-56 pushq %rdi .cfi_adjust_cfa_offset 8 #ifdef __SGX_LVI_HARDENING__ lfence #endif movq 0(%rsi),%r8 movq 8(%rsi),%r9 movq 16(%rsi),%r10 movq 24(%rsi),%r11 movq 32(%rsi),%r12 movq 40(%rsi),%r13 .Loop_rshift_mod_384: call __rshift_mod_384 decl %edx jnz .Loop_rshift_mod_384 movq %r8,0(%rdi) movq %r9,8(%rdi) movq %r10,16(%rdi) movq %r11,24(%rdi) movq %r12,32(%rdi) movq %r13,40(%rdi) movq 8(%rsp),%r15 .cfi_restore %r15 movq 16(%rsp),%r14 .cfi_restore %r14 movq 24(%rsp),%r13 .cfi_restore %r13 movq 32(%rsp),%r12 .cfi_restore %r12 movq 
40(%rsp),%rbx .cfi_restore %rbx movq 48(%rsp),%rbp .cfi_restore %rbp leaq 56(%rsp),%rsp .cfi_adjust_cfa_offset -56 #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .size rshift_mod_384,.-rshift_mod_384 .type __rshift_mod_384,@function .align 32 __rshift_mod_384: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa movq $1,%rsi movq 0(%rcx),%r14 andq %r8,%rsi movq 8(%rcx),%r15 negq %rsi movq 16(%rcx),%rax andq %rsi,%r14 movq 24(%rcx),%rbx andq %rsi,%r15 movq 32(%rcx),%rbp andq %rsi,%rax andq %rsi,%rbx andq %rsi,%rbp andq 40(%rcx),%rsi addq %r8,%r14 adcq %r9,%r15 adcq %r10,%rax adcq %r11,%rbx adcq %r12,%rbp adcq %r13,%rsi sbbq %r13,%r13 shrq $1,%r14 movq %r15,%r8 shrq $1,%r15 movq %rax,%r9 shrq $1,%rax movq %rbx,%r10 shrq $1,%rbx movq %rbp,%r11 shrq $1,%rbp movq %rsi,%r12 shrq $1,%rsi shlq $63,%r8 shlq $63,%r9 orq %r14,%r8 shlq $63,%r10 orq %r15,%r9 shlq $63,%r11 orq %rax,%r10 shlq $63,%r12 orq %rbx,%r11 shlq $63,%r13 orq %rbp,%r12 orq %rsi,%r13 #ifdef __SGX_LVI_HARDENING__ popq %r14 lfence jmpq *%r14 ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .size __rshift_mod_384,.-__rshift_mod_384 .globl div_by_2_mod_384 .hidden div_by_2_mod_384 .type div_by_2_mod_384,@function .align 32 div_by_2_mod_384: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 pushq %rbx .cfi_adjust_cfa_offset 8 .cfi_offset %rbx,-24 pushq %r12 .cfi_adjust_cfa_offset 8 .cfi_offset %r12,-32 pushq %r13 .cfi_adjust_cfa_offset 8 .cfi_offset %r13,-40 pushq %r14 .cfi_adjust_cfa_offset 8 .cfi_offset %r14,-48 pushq %r15 .cfi_adjust_cfa_offset 8 .cfi_offset %r15,-56 pushq %rdi .cfi_adjust_cfa_offset 8 #ifdef __SGX_LVI_HARDENING__ lfence #endif movq 0(%rsi),%r8 movq %rdx,%rcx movq 8(%rsi),%r9 movq 16(%rsi),%r10 movq 24(%rsi),%r11 movq 32(%rsi),%r12 movq 40(%rsi),%r13 call __rshift_mod_384 movq %r8,0(%rdi) movq %r9,8(%rdi) movq %r10,16(%rdi) movq %r11,24(%rdi) movq %r12,32(%rdi) movq %r13,40(%rdi) movq 8(%rsp),%r15 .cfi_restore %r15 movq 16(%rsp),%r14 .cfi_restore %r14 movq 24(%rsp),%r13 .cfi_restore %r13 movq 32(%rsp),%r12 .cfi_restore %r12 movq 40(%rsp),%rbx .cfi_restore %rbx movq 48(%rsp),%rbp .cfi_restore %rbp leaq 56(%rsp),%rsp .cfi_adjust_cfa_offset -56 #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .size div_by_2_mod_384,.-div_by_2_mod_384 .globl lshift_mod_384 .hidden lshift_mod_384 .type lshift_mod_384,@function .align 32 lshift_mod_384: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 pushq %rbx .cfi_adjust_cfa_offset 8 .cfi_offset %rbx,-24 pushq %r12 .cfi_adjust_cfa_offset 8 .cfi_offset %r12,-32 pushq %r13 .cfi_adjust_cfa_offset 8 .cfi_offset %r13,-40 pushq %r14 .cfi_adjust_cfa_offset 8 .cfi_offset %r14,-48 pushq %r15 .cfi_adjust_cfa_offset 8 .cfi_offset %r15,-56 pushq %rdi .cfi_adjust_cfa_offset 8 #ifdef __SGX_LVI_HARDENING__ lfence #endif movq 0(%rsi),%r8 movq 8(%rsi),%r9 movq 16(%rsi),%r10 movq 24(%rsi),%r11 movq 32(%rsi),%r12 movq 40(%rsi),%r13 .Loop_lshift_mod_384: addq %r8,%r8 adcq %r9,%r9 adcq %r10,%r10 movq %r8,%r14 adcq %r11,%r11 movq %r9,%r15 adcq %r12,%r12 movq %r10,%rax adcq %r13,%r13 movq %r11,%rbx sbbq %rdi,%rdi subq 0(%rcx),%r8 sbbq 8(%rcx),%r9 movq %r12,%rbp sbbq 16(%rcx),%r10 sbbq 24(%rcx),%r11 sbbq 32(%rcx),%r12 movq %r13,%rsi sbbq 40(%rcx),%r13 sbbq $0,%rdi movq (%rsp),%rdi cmovcq %r14,%r8 cmovcq %r15,%r9 cmovcq %rax,%r10 cmovcq %rbx,%r11 cmovcq %rbp,%r12 cmovcq %rsi,%r13 decl %edx jnz .Loop_lshift_mod_384 movq 
%r8,0(%rdi) movq %r9,8(%rdi) movq %r10,16(%rdi) movq %r11,24(%rdi) movq %r12,32(%rdi) movq %r13,40(%rdi) movq 8(%rsp),%r15 .cfi_restore %r15 movq 16(%rsp),%r14 .cfi_restore %r14 movq 24(%rsp),%r13 .cfi_restore %r13 movq 32(%rsp),%r12 .cfi_restore %r12 movq 40(%rsp),%rbx .cfi_restore %rbx movq 48(%rsp),%rbp .cfi_restore %rbp leaq 56(%rsp),%rsp .cfi_adjust_cfa_offset -56 #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .size lshift_mod_384,.-lshift_mod_384 .type __lshift_mod_384,@function .align 32 __lshift_mod_384: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa addq %r8,%r8 adcq %r9,%r9 adcq %r10,%r10 movq %r8,%r14 adcq %r11,%r11 movq %r9,%r15 adcq %r12,%r12 movq %r10,%rax adcq %r13,%r13 movq %r11,%rbx sbbq %rdx,%rdx subq 0(%rcx),%r8 sbbq 8(%rcx),%r9 movq %r12,%rbp sbbq 16(%rcx),%r10 sbbq 24(%rcx),%r11 sbbq 32(%rcx),%r12 movq %r13,%rsi sbbq 40(%rcx),%r13 sbbq $0,%rdx cmovcq %r14,%r8 cmovcq %r15,%r9 cmovcq %rax,%r10 cmovcq %rbx,%r11 cmovcq %rbp,%r12 cmovcq %rsi,%r13 #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .size __lshift_mod_384,.-__lshift_mod_384 .globl mul_by_3_mod_384 .hidden mul_by_3_mod_384 .type mul_by_3_mod_384,@function .align 32 mul_by_3_mod_384: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 pushq %rbx .cfi_adjust_cfa_offset 8 .cfi_offset %rbx,-24 pushq %r12 .cfi_adjust_cfa_offset 8 .cfi_offset %r12,-32 pushq %r13 .cfi_adjust_cfa_offset 8 .cfi_offset %r13,-40 pushq %r14 .cfi_adjust_cfa_offset 8 .cfi_offset %r14,-48 pushq %r15 .cfi_adjust_cfa_offset 8 .cfi_offset %r15,-56 pushq %rsi .cfi_adjust_cfa_offset 8 #ifdef __SGX_LVI_HARDENING__ lfence #endif movq 0(%rsi),%r8 movq 8(%rsi),%r9 movq 16(%rsi),%r10 movq 24(%rsi),%r11 movq 32(%rsi),%r12 movq 40(%rsi),%r13 movq %rdx,%rcx call __lshift_mod_384 movq (%rsp),%rdx #ifdef __SGX_LVI_HARDENING__ lfence #endif call __add_mod_384_a_is_loaded movq 8(%rsp),%r15 .cfi_restore %r15 movq 16(%rsp),%r14 .cfi_restore %r14 movq 24(%rsp),%r13 .cfi_restore %r13 movq 32(%rsp),%r12 .cfi_restore %r12 movq 40(%rsp),%rbx .cfi_restore %rbx movq 48(%rsp),%rbp .cfi_restore %rbp leaq 56(%rsp),%rsp .cfi_adjust_cfa_offset -56 #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .size mul_by_3_mod_384,.-mul_by_3_mod_384 .globl mul_by_8_mod_384 .hidden mul_by_8_mod_384 .type mul_by_8_mod_384,@function .align 32 mul_by_8_mod_384: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 pushq %rbx .cfi_adjust_cfa_offset 8 .cfi_offset %rbx,-24 pushq %r12 .cfi_adjust_cfa_offset 8 .cfi_offset %r12,-32 pushq %r13 .cfi_adjust_cfa_offset 8 .cfi_offset %r13,-40 pushq %r14 .cfi_adjust_cfa_offset 8 .cfi_offset %r14,-48 pushq %r15 .cfi_adjust_cfa_offset 8 .cfi_offset %r15,-56 subq $8,%rsp .cfi_adjust_cfa_offset 8 #ifdef __SGX_LVI_HARDENING__ lfence #endif movq 0(%rsi),%r8 movq 8(%rsi),%r9 movq 16(%rsi),%r10 movq 24(%rsi),%r11 movq 32(%rsi),%r12 movq 40(%rsi),%r13 movq %rdx,%rcx call __lshift_mod_384 call __lshift_mod_384 call __lshift_mod_384 movq %r8,0(%rdi) movq %r9,8(%rdi) movq %r10,16(%rdi) movq %r11,24(%rdi) movq %r12,32(%rdi) movq %r13,40(%rdi) movq 8(%rsp),%r15 .cfi_restore %r15 movq 16(%rsp),%r14 .cfi_restore %r14 movq 24(%rsp),%r13 .cfi_restore %r13 movq 32(%rsp),%r12 .cfi_restore %r12 movq 40(%rsp),%rbx .cfi_restore %rbx movq 48(%rsp),%rbp .cfi_restore %rbp leaq 56(%rsp),%rsp .cfi_adjust_cfa_offset -56 #ifdef 
__SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .size mul_by_8_mod_384,.-mul_by_8_mod_384 .globl mul_by_3_mod_384x .hidden mul_by_3_mod_384x .type mul_by_3_mod_384x,@function .align 32 mul_by_3_mod_384x: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 pushq %rbx .cfi_adjust_cfa_offset 8 .cfi_offset %rbx,-24 pushq %r12 .cfi_adjust_cfa_offset 8 .cfi_offset %r12,-32 pushq %r13 .cfi_adjust_cfa_offset 8 .cfi_offset %r13,-40 pushq %r14 .cfi_adjust_cfa_offset 8 .cfi_offset %r14,-48 pushq %r15 .cfi_adjust_cfa_offset 8 .cfi_offset %r15,-56 pushq %rsi .cfi_adjust_cfa_offset 8 #ifdef __SGX_LVI_HARDENING__ lfence #endif movq 0(%rsi),%r8 movq 8(%rsi),%r9 movq 16(%rsi),%r10 movq 24(%rsi),%r11 movq 32(%rsi),%r12 movq 40(%rsi),%r13 movq %rdx,%rcx call __lshift_mod_384 movq (%rsp),%rdx #ifdef __SGX_LVI_HARDENING__ lfence #endif call __add_mod_384_a_is_loaded movq (%rsp),%rsi leaq 48(%rdi),%rdi #ifdef __SGX_LVI_HARDENING__ lfence #endif movq 48(%rsi),%r8 movq 56(%rsi),%r9 movq 64(%rsi),%r10 movq 72(%rsi),%r11 movq 80(%rsi),%r12 movq 88(%rsi),%r13 call __lshift_mod_384 movq $48,%rdx addq (%rsp),%rdx #ifdef __SGX_LVI_HARDENING__ lfence #endif call __add_mod_384_a_is_loaded movq 8(%rsp),%r15 .cfi_restore %r15 movq 16(%rsp),%r14 .cfi_restore %r14 movq 24(%rsp),%r13 .cfi_restore %r13 movq 32(%rsp),%r12 .cfi_restore %r12 movq 40(%rsp),%rbx .cfi_restore %rbx movq 48(%rsp),%rbp .cfi_restore %rbp leaq 56(%rsp),%rsp .cfi_adjust_cfa_offset -56 #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .size mul_by_3_mod_384x,.-mul_by_3_mod_384x .globl mul_by_8_mod_384x .hidden mul_by_8_mod_384x .type mul_by_8_mod_384x,@function .align 32 mul_by_8_mod_384x: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 pushq %rbx .cfi_adjust_cfa_offset 8 .cfi_offset %rbx,-24 pushq %r12 .cfi_adjust_cfa_offset 8 .cfi_offset %r12,-32 pushq %r13 .cfi_adjust_cfa_offset 8 .cfi_offset %r13,-40 pushq %r14 .cfi_adjust_cfa_offset 8 .cfi_offset %r14,-48 pushq %r15 .cfi_adjust_cfa_offset 8 .cfi_offset %r15,-56 pushq %rsi .cfi_adjust_cfa_offset 8 #ifdef __SGX_LVI_HARDENING__ lfence #endif movq 0(%rsi),%r8 movq 8(%rsi),%r9 movq 16(%rsi),%r10 movq 24(%rsi),%r11 movq 32(%rsi),%r12 movq 40(%rsi),%r13 movq %rdx,%rcx call __lshift_mod_384 call __lshift_mod_384 call __lshift_mod_384 movq (%rsp),%rsi movq %r8,0(%rdi) movq %r9,8(%rdi) movq %r10,16(%rdi) movq %r11,24(%rdi) movq %r12,32(%rdi) movq %r13,40(%rdi) #ifdef __SGX_LVI_HARDENING__ lfence #endif movq 48+0(%rsi),%r8 movq 48+8(%rsi),%r9 movq 48+16(%rsi),%r10 movq 48+24(%rsi),%r11 movq 48+32(%rsi),%r12 movq 48+40(%rsi),%r13 call __lshift_mod_384 call __lshift_mod_384 call __lshift_mod_384 movq %r8,48+0(%rdi) movq %r9,48+8(%rdi) movq %r10,48+16(%rdi) movq %r11,48+24(%rdi) movq %r12,48+32(%rdi) movq %r13,48+40(%rdi) movq 8(%rsp),%r15 .cfi_restore %r15 movq 16(%rsp),%r14 .cfi_restore %r14 movq 24(%rsp),%r13 .cfi_restore %r13 movq 32(%rsp),%r12 .cfi_restore %r12 movq 40(%rsp),%rbx .cfi_restore %rbx movq 48(%rsp),%rbp .cfi_restore %rbp leaq 56(%rsp),%rsp .cfi_adjust_cfa_offset -56 #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .size mul_by_8_mod_384x,.-mul_by_8_mod_384x .globl cneg_mod_384 .hidden cneg_mod_384 .type cneg_mod_384,@function .align 32 cneg_mod_384: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 pushq 
%rbx .cfi_adjust_cfa_offset 8 .cfi_offset %rbx,-24 pushq %r12 .cfi_adjust_cfa_offset 8 .cfi_offset %r12,-32 pushq %r13 .cfi_adjust_cfa_offset 8 .cfi_offset %r13,-40 pushq %r14 .cfi_adjust_cfa_offset 8 .cfi_offset %r14,-48 pushq %r15 .cfi_adjust_cfa_offset 8 .cfi_offset %r15,-56 pushq %rdx .cfi_adjust_cfa_offset 8 #ifdef __SGX_LVI_HARDENING__ lfence #endif movq 0(%rsi),%rdx movq 8(%rsi),%r9 movq 16(%rsi),%r10 movq %rdx,%r8 movq 24(%rsi),%r11 orq %r9,%rdx movq 32(%rsi),%r12 orq %r10,%rdx movq 40(%rsi),%r13 orq %r11,%rdx movq $-1,%rsi orq %r12,%rdx orq %r13,%rdx movq 0(%rcx),%r14 cmovnzq %rsi,%rdx movq 8(%rcx),%r15 movq 16(%rcx),%rax andq %rdx,%r14 movq 24(%rcx),%rbx andq %rdx,%r15 movq 32(%rcx),%rbp andq %rdx,%rax movq 40(%rcx),%rsi andq %rdx,%rbx movq 0(%rsp),%rcx andq %rdx,%rbp andq %rdx,%rsi subq %r8,%r14 sbbq %r9,%r15 sbbq %r10,%rax sbbq %r11,%rbx sbbq %r12,%rbp sbbq %r13,%rsi orq %rcx,%rcx cmovzq %r8,%r14 cmovzq %r9,%r15 cmovzq %r10,%rax movq %r14,0(%rdi) cmovzq %r11,%rbx movq %r15,8(%rdi) cmovzq %r12,%rbp movq %rax,16(%rdi) cmovzq %r13,%rsi movq %rbx,24(%rdi) movq %rbp,32(%rdi) movq %rsi,40(%rdi) movq 8(%rsp),%r15 .cfi_restore %r15 movq 16(%rsp),%r14 .cfi_restore %r14 movq 24(%rsp),%r13 .cfi_restore %r13 movq 32(%rsp),%r12 .cfi_restore %r12 movq 40(%rsp),%rbx .cfi_restore %rbx movq 48(%rsp),%rbp .cfi_restore %rbp leaq 56(%rsp),%rsp .cfi_adjust_cfa_offset -56 #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .size cneg_mod_384,.-cneg_mod_384 .globl sub_mod_384 .hidden sub_mod_384 .type sub_mod_384,@function .align 32 sub_mod_384: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 pushq %rbx .cfi_adjust_cfa_offset 8 .cfi_offset %rbx,-24 pushq %r12 .cfi_adjust_cfa_offset 8 .cfi_offset %r12,-32 pushq %r13 .cfi_adjust_cfa_offset 8 .cfi_offset %r13,-40 pushq %r14 .cfi_adjust_cfa_offset 8 .cfi_offset %r14,-48 pushq %r15 .cfi_adjust_cfa_offset 8 .cfi_offset %r15,-56 subq $8,%rsp .cfi_adjust_cfa_offset 8 call __sub_mod_384 movq 8(%rsp),%r15 .cfi_restore %r15 movq 16(%rsp),%r14 .cfi_restore %r14 movq 24(%rsp),%r13 .cfi_restore %r13 movq 32(%rsp),%r12 .cfi_restore %r12 movq 40(%rsp),%rbx .cfi_restore %rbx movq 48(%rsp),%rbp .cfi_restore %rbp leaq 56(%rsp),%rsp .cfi_adjust_cfa_offset -56 #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .size sub_mod_384,.-sub_mod_384 .type __sub_mod_384,@function .align 32 __sub_mod_384: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa #ifdef __SGX_LVI_HARDENING__ lfence #endif movq 0(%rsi),%r8 movq 8(%rsi),%r9 movq 16(%rsi),%r10 movq 24(%rsi),%r11 movq 32(%rsi),%r12 movq 40(%rsi),%r13 subq 0(%rdx),%r8 movq 0(%rcx),%r14 sbbq 8(%rdx),%r9 movq 8(%rcx),%r15 sbbq 16(%rdx),%r10 movq 16(%rcx),%rax sbbq 24(%rdx),%r11 movq 24(%rcx),%rbx sbbq 32(%rdx),%r12 movq 32(%rcx),%rbp sbbq 40(%rdx),%r13 movq 40(%rcx),%rsi sbbq %rdx,%rdx andq %rdx,%r14 andq %rdx,%r15 andq %rdx,%rax andq %rdx,%rbx andq %rdx,%rbp andq %rdx,%rsi addq %r14,%r8 adcq %r15,%r9 movq %r8,0(%rdi) adcq %rax,%r10 movq %r9,8(%rdi) adcq %rbx,%r11 movq %r10,16(%rdi) adcq %rbp,%r12 movq %r11,24(%rdi) adcq %rsi,%r13 movq %r12,32(%rdi) movq %r13,40(%rdi) #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .size __sub_mod_384,.-__sub_mod_384 .globl sub_mod_384x .hidden sub_mod_384x .type sub_mod_384x,@function .align 32 sub_mod_384x: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset 
%rbp,-16 pushq %rbx .cfi_adjust_cfa_offset 8 .cfi_offset %rbx,-24 pushq %r12 .cfi_adjust_cfa_offset 8 .cfi_offset %r12,-32 pushq %r13 .cfi_adjust_cfa_offset 8 .cfi_offset %r13,-40 pushq %r14 .cfi_adjust_cfa_offset 8 .cfi_offset %r14,-48 pushq %r15 .cfi_adjust_cfa_offset 8 .cfi_offset %r15,-56 subq $24,%rsp .cfi_adjust_cfa_offset 24 movq %rsi,0(%rsp) movq %rdx,8(%rsp) leaq 48(%rsi),%rsi leaq 48(%rdx),%rdx leaq 48(%rdi),%rdi call __sub_mod_384 movq 0(%rsp),%rsi movq 8(%rsp),%rdx leaq -48(%rdi),%rdi call __sub_mod_384 movq 24+0(%rsp),%r15 .cfi_restore %r15 movq 24+8(%rsp),%r14 .cfi_restore %r14 movq 24+16(%rsp),%r13 .cfi_restore %r13 movq 24+24(%rsp),%r12 .cfi_restore %r12 movq 24+32(%rsp),%rbx .cfi_restore %rbx movq 24+40(%rsp),%rbp .cfi_restore %rbp leaq 24+48(%rsp),%rsp .cfi_adjust_cfa_offset -24-8*6 #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .size sub_mod_384x,.-sub_mod_384x .globl mul_by_1_plus_i_mod_384x .hidden mul_by_1_plus_i_mod_384x .type mul_by_1_plus_i_mod_384x,@function .align 32 mul_by_1_plus_i_mod_384x: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 pushq %rbx .cfi_adjust_cfa_offset 8 .cfi_offset %rbx,-24 pushq %r12 .cfi_adjust_cfa_offset 8 .cfi_offset %r12,-32 pushq %r13 .cfi_adjust_cfa_offset 8 .cfi_offset %r13,-40 pushq %r14 .cfi_adjust_cfa_offset 8 .cfi_offset %r14,-48 pushq %r15 .cfi_adjust_cfa_offset 8 .cfi_offset %r15,-56 subq $56,%rsp .cfi_adjust_cfa_offset 56 #ifdef __SGX_LVI_HARDENING__ lfence #endif movq 0(%rsi),%r8 movq 8(%rsi),%r9 movq 16(%rsi),%r10 movq 24(%rsi),%r11 movq 32(%rsi),%r12 movq 40(%rsi),%r13 movq %r8,%r14 addq 48(%rsi),%r8 movq %r9,%r15 adcq 56(%rsi),%r9 movq %r10,%rax adcq 64(%rsi),%r10 movq %r11,%rbx adcq 72(%rsi),%r11 movq %r12,%rcx adcq 80(%rsi),%r12 movq %r13,%rbp adcq 88(%rsi),%r13 movq %rdi,48(%rsp) sbbq %rdi,%rdi subq 48(%rsi),%r14 sbbq 56(%rsi),%r15 sbbq 64(%rsi),%rax sbbq 72(%rsi),%rbx sbbq 80(%rsi),%rcx sbbq 88(%rsi),%rbp sbbq %rsi,%rsi movq %r8,0(%rsp) movq 0(%rdx),%r8 movq %r9,8(%rsp) movq 8(%rdx),%r9 movq %r10,16(%rsp) movq 16(%rdx),%r10 movq %r11,24(%rsp) movq 24(%rdx),%r11 movq %r12,32(%rsp) andq %rsi,%r8 movq 32(%rdx),%r12 movq %r13,40(%rsp) andq %rsi,%r9 movq 40(%rdx),%r13 andq %rsi,%r10 andq %rsi,%r11 andq %rsi,%r12 andq %rsi,%r13 movq 48(%rsp),%rsi addq %r8,%r14 movq 0(%rsp),%r8 adcq %r9,%r15 movq 8(%rsp),%r9 adcq %r10,%rax movq 16(%rsp),%r10 adcq %r11,%rbx movq 24(%rsp),%r11 adcq %r12,%rcx movq 32(%rsp),%r12 adcq %r13,%rbp movq 40(%rsp),%r13 movq %r14,0(%rsi) movq %r8,%r14 movq %r15,8(%rsi) movq %rax,16(%rsi) movq %r9,%r15 movq %rbx,24(%rsi) movq %rcx,32(%rsi) movq %r10,%rax movq %rbp,40(%rsi) subq 0(%rdx),%r8 movq %r11,%rbx sbbq 8(%rdx),%r9 sbbq 16(%rdx),%r10 movq %r12,%rcx sbbq 24(%rdx),%r11 sbbq 32(%rdx),%r12 movq %r13,%rbp sbbq 40(%rdx),%r13 sbbq $0,%rdi cmovcq %r14,%r8 cmovcq %r15,%r9 cmovcq %rax,%r10 movq %r8,48(%rsi) cmovcq %rbx,%r11 movq %r9,56(%rsi) cmovcq %rcx,%r12 movq %r10,64(%rsi) cmovcq %rbp,%r13 movq %r11,72(%rsi) movq %r12,80(%rsi) movq %r13,88(%rsi) movq 56+0(%rsp),%r15 .cfi_restore %r15 movq 56+8(%rsp),%r14 .cfi_restore %r14 movq 56+16(%rsp),%r13 .cfi_restore %r13 movq 56+24(%rsp),%r12 .cfi_restore %r12 movq 56+32(%rsp),%rbx .cfi_restore %rbx movq 56+40(%rsp),%rbp .cfi_restore %rbp leaq 56+48(%rsp),%rsp .cfi_adjust_cfa_offset -56-8*6 #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .size mul_by_1_plus_i_mod_384x,.-mul_by_1_plus_i_mod_384x .globl 
sgn0_pty_mod_384 .hidden sgn0_pty_mod_384 .type sgn0_pty_mod_384,@function .align 32 sgn0_pty_mod_384: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa #ifdef __SGX_LVI_HARDENING__ lfence #endif movq 0(%rdi),%r8 movq 8(%rdi),%r9 movq 16(%rdi),%r10 movq 24(%rdi),%r11 movq 32(%rdi),%rcx movq 40(%rdi),%rdx xorq %rax,%rax movq %r8,%rdi addq %r8,%r8 adcq %r9,%r9 adcq %r10,%r10 adcq %r11,%r11 adcq %rcx,%rcx adcq %rdx,%rdx adcq $0,%rax subq 0(%rsi),%r8 sbbq 8(%rsi),%r9 sbbq 16(%rsi),%r10 sbbq 24(%rsi),%r11 sbbq 32(%rsi),%rcx sbbq 40(%rsi),%rdx sbbq $0,%rax notq %rax andq $1,%rdi andq $2,%rax orq %rdi,%rax #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .size sgn0_pty_mod_384,.-sgn0_pty_mod_384 .globl sgn0_pty_mod_384x .hidden sgn0_pty_mod_384x .type sgn0_pty_mod_384x,@function .align 32 sgn0_pty_mod_384x: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 pushq %rbx .cfi_adjust_cfa_offset 8 .cfi_offset %rbx,-24 subq $8,%rsp .cfi_adjust_cfa_offset 8 #ifdef __SGX_LVI_HARDENING__ lfence #endif movq 48(%rdi),%r8 movq 56(%rdi),%r9 movq 64(%rdi),%r10 movq 72(%rdi),%r11 movq 80(%rdi),%rcx movq 88(%rdi),%rdx movq %r8,%rbx orq %r9,%r8 orq %r10,%r8 orq %r11,%r8 orq %rcx,%r8 orq %rdx,%r8 leaq 0(%rdi),%rax xorq %rdi,%rdi movq %rbx,%rbp addq %rbx,%rbx adcq %r9,%r9 adcq %r10,%r10 adcq %r11,%r11 adcq %rcx,%rcx adcq %rdx,%rdx adcq $0,%rdi subq 0(%rsi),%rbx sbbq 8(%rsi),%r9 sbbq 16(%rsi),%r10 sbbq 24(%rsi),%r11 sbbq 32(%rsi),%rcx sbbq 40(%rsi),%rdx sbbq $0,%rdi movq %r8,0(%rsp) notq %rdi andq $1,%rbp andq $2,%rdi orq %rbp,%rdi movq 0(%rax),%r8 movq 8(%rax),%r9 movq 16(%rax),%r10 movq 24(%rax),%r11 movq 32(%rax),%rcx movq 40(%rax),%rdx movq %r8,%rbx orq %r9,%r8 orq %r10,%r8 orq %r11,%r8 orq %rcx,%r8 orq %rdx,%r8 xorq %rax,%rax movq %rbx,%rbp addq %rbx,%rbx adcq %r9,%r9 adcq %r10,%r10 adcq %r11,%r11 adcq %rcx,%rcx adcq %rdx,%rdx adcq $0,%rax subq 0(%rsi),%rbx sbbq 8(%rsi),%r9 sbbq 16(%rsi),%r10 sbbq 24(%rsi),%r11 sbbq 32(%rsi),%rcx sbbq 40(%rsi),%rdx sbbq $0,%rax movq 0(%rsp),%rbx notq %rax testq %r8,%r8 cmovzq %rdi,%rbp testq %rbx,%rbx cmovnzq %rdi,%rax andq $1,%rbp andq $2,%rax orq %rbp,%rax movq 8(%rsp),%rbx .cfi_restore %rbx movq 16(%rsp),%rbp .cfi_restore %rbp leaq 24(%rsp),%rsp .cfi_adjust_cfa_offset -24 #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .size sgn0_pty_mod_384x,.-sgn0_pty_mod_384x .globl vec_select_32 .hidden vec_select_32 .type vec_select_32,@function .align 32 vec_select_32: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa movd %ecx,%xmm5 pxor %xmm4,%xmm4 pshufd $0,%xmm5,%xmm5 #ifdef __SGX_LVI_HARDENING__ lfence #endif movdqu (%rsi),%xmm0 leaq 16(%rsi),%rsi pcmpeqd %xmm4,%xmm5 movdqu (%rdx),%xmm1 leaq 16(%rdx),%rdx pcmpeqd %xmm5,%xmm4 leaq 16(%rdi),%rdi pand %xmm4,%xmm0 movdqu 0+16-16(%rsi),%xmm2 pand %xmm5,%xmm1 movdqu 0+16-16(%rdx),%xmm3 por %xmm1,%xmm0 movdqu %xmm0,0-16(%rdi) pand %xmm4,%xmm2 pand %xmm5,%xmm3 por %xmm3,%xmm2 movdqu %xmm2,16-16(%rdi) #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .size vec_select_32,.-vec_select_32 .globl vec_select_48 .hidden vec_select_48 .type vec_select_48,@function .align 32 vec_select_48: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa movd %ecx,%xmm5 pxor %xmm4,%xmm4 pshufd $0,%xmm5,%xmm5 #ifdef __SGX_LVI_HARDENING__ lfence #endif movdqu (%rsi),%xmm0 leaq 24(%rsi),%rsi pcmpeqd %xmm4,%xmm5 movdqu (%rdx),%xmm1 leaq 24(%rdx),%rdx pcmpeqd %xmm5,%xmm4 leaq 24(%rdi),%rdi 
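// The vec_select_* family (this function and its neighbors) is a
// constant-time select: ret = sel ? a : b, with pcmpeqd building two
// complementary all-ones/all-zeros masks and pand/por blending the
// inputs. A hedged C model with illustrative names; the exported
// routines fix the length in the symbol name, and the C `?:` on sel is
// only a functional stand-in for the branchless mask construction:
//
//   #include <stddef.h>
//   #include <stdint.h>
//
//   static void vec_select_ref(void *ret, const void *a, const void *b,
//                              size_t len, uint64_t sel)
//   {
//       uint8_t ma = sel ? 0xff : 0x00;        /* mask for a          */
//       uint8_t mb = (uint8_t)~ma;             /* mask for b          */
//       const uint8_t *pa = a, *pb = b;
//       uint8_t *pr = ret;
//       for (size_t i = 0; i < len; i++)
//           pr[i] = (uint8_t)((pa[i] & ma) | (pb[i] & mb));
//   }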
pand %xmm4,%xmm0 movdqu 0+16-24(%rsi),%xmm2 pand %xmm5,%xmm1 movdqu 0+16-24(%rdx),%xmm3 por %xmm1,%xmm0 movdqu %xmm0,0-24(%rdi) pand %xmm4,%xmm2 movdqu 16+16-24(%rsi),%xmm0 pand %xmm5,%xmm3 movdqu 16+16-24(%rdx),%xmm1 por %xmm3,%xmm2 movdqu %xmm2,16-24(%rdi) pand %xmm4,%xmm0 pand %xmm5,%xmm1 por %xmm1,%xmm0 movdqu %xmm0,32-24(%rdi) #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .size vec_select_48,.-vec_select_48 .globl vec_select_96 .hidden vec_select_96 .type vec_select_96,@function .align 32 vec_select_96: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa movd %ecx,%xmm5 pxor %xmm4,%xmm4 pshufd $0,%xmm5,%xmm5 #ifdef __SGX_LVI_HARDENING__ lfence #endif movdqu (%rsi),%xmm0 leaq 48(%rsi),%rsi pcmpeqd %xmm4,%xmm5 movdqu (%rdx),%xmm1 leaq 48(%rdx),%rdx pcmpeqd %xmm5,%xmm4 leaq 48(%rdi),%rdi pand %xmm4,%xmm0 movdqu 0+16-48(%rsi),%xmm2 pand %xmm5,%xmm1 movdqu 0+16-48(%rdx),%xmm3 por %xmm1,%xmm0 movdqu %xmm0,0-48(%rdi) pand %xmm4,%xmm2 movdqu 16+16-48(%rsi),%xmm0 pand %xmm5,%xmm3 movdqu 16+16-48(%rdx),%xmm1 por %xmm3,%xmm2 movdqu %xmm2,16-48(%rdi) pand %xmm4,%xmm0 movdqu 32+16-48(%rsi),%xmm2 pand %xmm5,%xmm1 movdqu 32+16-48(%rdx),%xmm3 por %xmm1,%xmm0 movdqu %xmm0,32-48(%rdi) pand %xmm4,%xmm2 movdqu 48+16-48(%rsi),%xmm0 pand %xmm5,%xmm3 movdqu 48+16-48(%rdx),%xmm1 por %xmm3,%xmm2 movdqu %xmm2,48-48(%rdi) pand %xmm4,%xmm0 movdqu 64+16-48(%rsi),%xmm2 pand %xmm5,%xmm1 movdqu 64+16-48(%rdx),%xmm3 por %xmm1,%xmm0 movdqu %xmm0,64-48(%rdi) pand %xmm4,%xmm2 pand %xmm5,%xmm3 por %xmm3,%xmm2 movdqu %xmm2,80-48(%rdi) #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .size vec_select_96,.-vec_select_96 .globl vec_select_192 .hidden vec_select_192 .type vec_select_192,@function .align 32 vec_select_192: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa movd %ecx,%xmm5 pxor %xmm4,%xmm4 pshufd $0,%xmm5,%xmm5 #ifdef __SGX_LVI_HARDENING__ lfence #endif movdqu (%rsi),%xmm0 leaq 96(%rsi),%rsi pcmpeqd %xmm4,%xmm5 movdqu (%rdx),%xmm1 leaq 96(%rdx),%rdx pcmpeqd %xmm5,%xmm4 leaq 96(%rdi),%rdi pand %xmm4,%xmm0 movdqu 0+16-96(%rsi),%xmm2 pand %xmm5,%xmm1 movdqu 0+16-96(%rdx),%xmm3 por %xmm1,%xmm0 movdqu %xmm0,0-96(%rdi) pand %xmm4,%xmm2 movdqu 16+16-96(%rsi),%xmm0 pand %xmm5,%xmm3 movdqu 16+16-96(%rdx),%xmm1 por %xmm3,%xmm2 movdqu %xmm2,16-96(%rdi) pand %xmm4,%xmm0 movdqu 32+16-96(%rsi),%xmm2 pand %xmm5,%xmm1 movdqu 32+16-96(%rdx),%xmm3 por %xmm1,%xmm0 movdqu %xmm0,32-96(%rdi) pand %xmm4,%xmm2 movdqu 48+16-96(%rsi),%xmm0 pand %xmm5,%xmm3 movdqu 48+16-96(%rdx),%xmm1 por %xmm3,%xmm2 movdqu %xmm2,48-96(%rdi) pand %xmm4,%xmm0 movdqu 64+16-96(%rsi),%xmm2 pand %xmm5,%xmm1 movdqu 64+16-96(%rdx),%xmm3 por %xmm1,%xmm0 movdqu %xmm0,64-96(%rdi) pand %xmm4,%xmm2 movdqu 80+16-96(%rsi),%xmm0 pand %xmm5,%xmm3 movdqu 80+16-96(%rdx),%xmm1 por %xmm3,%xmm2 movdqu %xmm2,80-96(%rdi) pand %xmm4,%xmm0 movdqu 96+16-96(%rsi),%xmm2 pand %xmm5,%xmm1 movdqu 96+16-96(%rdx),%xmm3 por %xmm1,%xmm0 movdqu %xmm0,96-96(%rdi) pand %xmm4,%xmm2 movdqu 112+16-96(%rsi),%xmm0 pand %xmm5,%xmm3 movdqu 112+16-96(%rdx),%xmm1 por %xmm3,%xmm2 movdqu %xmm2,112-96(%rdi) pand %xmm4,%xmm0 movdqu 128+16-96(%rsi),%xmm2 pand %xmm5,%xmm1 movdqu 128+16-96(%rdx),%xmm3 por %xmm1,%xmm0 movdqu %xmm0,128-96(%rdi) pand %xmm4,%xmm2 movdqu 144+16-96(%rsi),%xmm0 pand %xmm5,%xmm3 movdqu 144+16-96(%rdx),%xmm1 por %xmm3,%xmm2 movdqu %xmm2,144-96(%rdi) pand %xmm4,%xmm0 movdqu 160+16-96(%rsi),%xmm2 pand %xmm5,%xmm1 movdqu 160+16-96(%rdx),%xmm3 por %xmm1,%xmm0 movdqu %xmm0,160-96(%rdi) pand 
%xmm4,%xmm2 pand %xmm5,%xmm3 por %xmm3,%xmm2 movdqu %xmm2,176-96(%rdi) #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .size vec_select_192,.-vec_select_192 .globl vec_select_144 .hidden vec_select_144 .type vec_select_144,@function .align 32 vec_select_144: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa movd %ecx,%xmm5 pxor %xmm4,%xmm4 pshufd $0,%xmm5,%xmm5 #ifdef __SGX_LVI_HARDENING__ lfence #endif movdqu (%rsi),%xmm0 leaq 72(%rsi),%rsi pcmpeqd %xmm4,%xmm5 movdqu (%rdx),%xmm1 leaq 72(%rdx),%rdx pcmpeqd %xmm5,%xmm4 leaq 72(%rdi),%rdi pand %xmm4,%xmm0 movdqu 0+16-72(%rsi),%xmm2 pand %xmm5,%xmm1 movdqu 0+16-72(%rdx),%xmm3 por %xmm1,%xmm0 movdqu %xmm0,0-72(%rdi) pand %xmm4,%xmm2 movdqu 16+16-72(%rsi),%xmm0 pand %xmm5,%xmm3 movdqu 16+16-72(%rdx),%xmm1 por %xmm3,%xmm2 movdqu %xmm2,16-72(%rdi) pand %xmm4,%xmm0 movdqu 32+16-72(%rsi),%xmm2 pand %xmm5,%xmm1 movdqu 32+16-72(%rdx),%xmm3 por %xmm1,%xmm0 movdqu %xmm0,32-72(%rdi) pand %xmm4,%xmm2 movdqu 48+16-72(%rsi),%xmm0 pand %xmm5,%xmm3 movdqu 48+16-72(%rdx),%xmm1 por %xmm3,%xmm2 movdqu %xmm2,48-72(%rdi) pand %xmm4,%xmm0 movdqu 64+16-72(%rsi),%xmm2 pand %xmm5,%xmm1 movdqu 64+16-72(%rdx),%xmm3 por %xmm1,%xmm0 movdqu %xmm0,64-72(%rdi) pand %xmm4,%xmm2 movdqu 80+16-72(%rsi),%xmm0 pand %xmm5,%xmm3 movdqu 80+16-72(%rdx),%xmm1 por %xmm3,%xmm2 movdqu %xmm2,80-72(%rdi) pand %xmm4,%xmm0 movdqu 96+16-72(%rsi),%xmm2 pand %xmm5,%xmm1 movdqu 96+16-72(%rdx),%xmm3 por %xmm1,%xmm0 movdqu %xmm0,96-72(%rdi) pand %xmm4,%xmm2 movdqu 112+16-72(%rsi),%xmm0 pand %xmm5,%xmm3 movdqu 112+16-72(%rdx),%xmm1 por %xmm3,%xmm2 movdqu %xmm2,112-72(%rdi) pand %xmm4,%xmm0 pand %xmm5,%xmm1 por %xmm1,%xmm0 movdqu %xmm0,128-72(%rdi) #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .size vec_select_144,.-vec_select_144 .globl vec_select_288 .hidden vec_select_288 .type vec_select_288,@function .align 32 vec_select_288: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa movd %ecx,%xmm5 pxor %xmm4,%xmm4 pshufd $0,%xmm5,%xmm5 #ifdef __SGX_LVI_HARDENING__ lfence #endif movdqu (%rsi),%xmm0 leaq 144(%rsi),%rsi pcmpeqd %xmm4,%xmm5 movdqu (%rdx),%xmm1 leaq 144(%rdx),%rdx pcmpeqd %xmm5,%xmm4 leaq 144(%rdi),%rdi pand %xmm4,%xmm0 movdqu 0+16-144(%rsi),%xmm2 pand %xmm5,%xmm1 movdqu 0+16-144(%rdx),%xmm3 por %xmm1,%xmm0 movdqu %xmm0,0-144(%rdi) pand %xmm4,%xmm2 movdqu 16+16-144(%rsi),%xmm0 pand %xmm5,%xmm3 movdqu 16+16-144(%rdx),%xmm1 por %xmm3,%xmm2 movdqu %xmm2,16-144(%rdi) pand %xmm4,%xmm0 movdqu 32+16-144(%rsi),%xmm2 pand %xmm5,%xmm1 movdqu 32+16-144(%rdx),%xmm3 por %xmm1,%xmm0 movdqu %xmm0,32-144(%rdi) pand %xmm4,%xmm2 movdqu 48+16-144(%rsi),%xmm0 pand %xmm5,%xmm3 movdqu 48+16-144(%rdx),%xmm1 por %xmm3,%xmm2 movdqu %xmm2,48-144(%rdi) pand %xmm4,%xmm0 movdqu 64+16-144(%rsi),%xmm2 pand %xmm5,%xmm1 movdqu 64+16-144(%rdx),%xmm3 por %xmm1,%xmm0 movdqu %xmm0,64-144(%rdi) pand %xmm4,%xmm2 movdqu 80+16-144(%rsi),%xmm0 pand %xmm5,%xmm3 movdqu 80+16-144(%rdx),%xmm1 por %xmm3,%xmm2 movdqu %xmm2,80-144(%rdi) pand %xmm4,%xmm0 movdqu 96+16-144(%rsi),%xmm2 pand %xmm5,%xmm1 movdqu 96+16-144(%rdx),%xmm3 por %xmm1,%xmm0 movdqu %xmm0,96-144(%rdi) pand %xmm4,%xmm2 movdqu 112+16-144(%rsi),%xmm0 pand %xmm5,%xmm3 movdqu 112+16-144(%rdx),%xmm1 por %xmm3,%xmm2 movdqu %xmm2,112-144(%rdi) pand %xmm4,%xmm0 movdqu 128+16-144(%rsi),%xmm2 pand %xmm5,%xmm1 movdqu 128+16-144(%rdx),%xmm3 por %xmm1,%xmm0 movdqu %xmm0,128-144(%rdi) pand %xmm4,%xmm2 movdqu 144+16-144(%rsi),%xmm0 pand %xmm5,%xmm3 movdqu 144+16-144(%rdx),%xmm1 por %xmm3,%xmm2 
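// A few routines below, vec_is_zero_16x and vec_is_equal_16x apply the
// complementary idiom: OR-accumulate 16-byte blocks (the XOR of the two
// inputs, for the equality test) and inspect the accumulator once at
// the end, so timing depends only on the length, never on the data.
// Hedged C models (illustrative; len is a multiple of 16, as the
// shrl $4 in both routines implies):
//
//   #include <stddef.h>
//   #include <stdint.h>
//
//   static int vec_is_zero_16x_ref(const void *inp, size_t len)
//   {
//       const uint8_t *v = inp;
//       uint8_t acc = 0;
//       for (size_t i = 0; i < len; i++)
//           acc |= v[i];                 /* por accumulation          */
//       return acc == 0;                 /* single data-blind compare */
//   }
//
//   static int vec_is_equal_16x_ref(const void *a, const void *b,
//                                   size_t len)
//   {
//       const uint8_t *pa = a, *pb = b;
//       uint8_t acc = 0;
//       for (size_t i = 0; i < len; i++)
//           acc |= (uint8_t)(pa[i] ^ pb[i]);  /* pxor, then por       */
//       return acc == 0;
//   }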
movdqu %xmm2,144-144(%rdi) pand %xmm4,%xmm0 movdqu 160+16-144(%rsi),%xmm2 pand %xmm5,%xmm1 movdqu 160+16-144(%rdx),%xmm3 por %xmm1,%xmm0 movdqu %xmm0,160-144(%rdi) pand %xmm4,%xmm2 movdqu 176+16-144(%rsi),%xmm0 pand %xmm5,%xmm3 movdqu 176+16-144(%rdx),%xmm1 por %xmm3,%xmm2 movdqu %xmm2,176-144(%rdi) pand %xmm4,%xmm0 movdqu 192+16-144(%rsi),%xmm2 pand %xmm5,%xmm1 movdqu 192+16-144(%rdx),%xmm3 por %xmm1,%xmm0 movdqu %xmm0,192-144(%rdi) pand %xmm4,%xmm2 movdqu 208+16-144(%rsi),%xmm0 pand %xmm5,%xmm3 movdqu 208+16-144(%rdx),%xmm1 por %xmm3,%xmm2 movdqu %xmm2,208-144(%rdi) pand %xmm4,%xmm0 movdqu 224+16-144(%rsi),%xmm2 pand %xmm5,%xmm1 movdqu 224+16-144(%rdx),%xmm3 por %xmm1,%xmm0 movdqu %xmm0,224-144(%rdi) pand %xmm4,%xmm2 movdqu 240+16-144(%rsi),%xmm0 pand %xmm5,%xmm3 movdqu 240+16-144(%rdx),%xmm1 por %xmm3,%xmm2 movdqu %xmm2,240-144(%rdi) pand %xmm4,%xmm0 movdqu 256+16-144(%rsi),%xmm2 pand %xmm5,%xmm1 movdqu 256+16-144(%rdx),%xmm3 por %xmm1,%xmm0 movdqu %xmm0,256-144(%rdi) pand %xmm4,%xmm2 pand %xmm5,%xmm3 por %xmm3,%xmm2 movdqu %xmm2,272-144(%rdi) #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .size vec_select_288,.-vec_select_288 .globl vec_prefetch .hidden vec_prefetch .type vec_prefetch,@function .align 32 vec_prefetch: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa leaq -1(%rdi,%rsi,1),%rsi movq $64,%rax xorq %r8,%r8 #ifdef __SGX_LVI_HARDENING__ lfence #endif prefetchnta (%rdi) leaq (%rdi,%rax,1),%rdi cmpq %rsi,%rdi cmovaq %rsi,%rdi cmovaq %r8,%rax prefetchnta (%rdi) leaq (%rdi,%rax,1),%rdi cmpq %rsi,%rdi cmovaq %rsi,%rdi cmovaq %r8,%rax prefetchnta (%rdi) leaq (%rdi,%rax,1),%rdi cmpq %rsi,%rdi cmovaq %rsi,%rdi cmovaq %r8,%rax prefetchnta (%rdi) leaq (%rdi,%rax,1),%rdi cmpq %rsi,%rdi cmovaq %rsi,%rdi cmovaq %r8,%rax prefetchnta (%rdi) leaq (%rdi,%rax,1),%rdi cmpq %rsi,%rdi cmovaq %rsi,%rdi cmovaq %r8,%rax prefetchnta (%rdi) leaq (%rdi,%rax,1),%rdi cmpq %rsi,%rdi cmovaq %rsi,%rdi prefetchnta (%rdi) #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .size vec_prefetch,.-vec_prefetch .globl vec_is_zero_16x .hidden vec_is_zero_16x .type vec_is_zero_16x,@function .align 32 vec_is_zero_16x: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa shrl $4,%esi #ifdef __SGX_LVI_HARDENING__ lfence #endif movdqu (%rdi),%xmm0 leaq 16(%rdi),%rdi .Loop_is_zero: decl %esi jz .Loop_is_zero_done movdqu (%rdi),%xmm1 leaq 16(%rdi),%rdi por %xmm1,%xmm0 jmp .Loop_is_zero .Loop_is_zero_done: pshufd $0x4e,%xmm0,%xmm1 por %xmm1,%xmm0 .byte 102,72,15,126,192 incl %esi testq %rax,%rax cmovnzl %esi,%eax xorl $1,%eax #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .size vec_is_zero_16x,.-vec_is_zero_16x .globl vec_is_equal_16x .hidden vec_is_equal_16x .type vec_is_equal_16x,@function .align 32 vec_is_equal_16x: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa shrl $4,%edx #ifdef __SGX_LVI_HARDENING__ lfence #endif movdqu (%rdi),%xmm0 movdqu (%rsi),%xmm1 subq %rdi,%rsi leaq 16(%rdi),%rdi pxor %xmm1,%xmm0 .Loop_is_equal: decl %edx jz .Loop_is_equal_done movdqu (%rdi),%xmm1 movdqu (%rdi,%rsi,1),%xmm2 leaq 16(%rdi),%rdi pxor %xmm2,%xmm1 por %xmm1,%xmm0 jmp .Loop_is_equal .Loop_is_equal_done: pshufd $0x4e,%xmm0,%xmm1 por %xmm1,%xmm0 .byte 102,72,15,126,192 incl %edx testq %rax,%rax cmovnzl %edx,%eax xorl $1,%eax #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .size vec_is_equal_16x,.-vec_is_equal_16x .section 
.note.GNU-stack,"",@progbits #ifndef __SGX_LVI_HARDENING__ .section .note.gnu.property,"a",@note .long 4,2f-1f,5 .byte 0x47,0x4E,0x55,0 1: .long 0xc0000002,4,3 .align 8 2: #endif ================================================ FILE: build/elf/add_mod_384x384-x86_64.s ================================================ .text .globl add_mod_384x384 .hidden add_mod_384x384 .type add_mod_384x384,@function .align 32 add_mod_384x384: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 pushq %rbx .cfi_adjust_cfa_offset 8 .cfi_offset %rbx,-24 pushq %r12 .cfi_adjust_cfa_offset 8 .cfi_offset %r12,-32 pushq %r13 .cfi_adjust_cfa_offset 8 .cfi_offset %r13,-40 pushq %r14 .cfi_adjust_cfa_offset 8 .cfi_offset %r14,-48 pushq %r15 .cfi_adjust_cfa_offset 8 .cfi_offset %r15,-56 subq $8,%rsp .cfi_adjust_cfa_offset 8 #ifdef __SGX_LVI_HARDENING__ lfence #endif movq 0(%rsi),%r8 movq 8(%rsi),%r9 movq 16(%rsi),%r10 movq 24(%rsi),%r11 movq 32(%rsi),%r12 movq 40(%rsi),%r13 movq 48(%rsi),%r14 addq 0(%rdx),%r8 movq 56(%rsi),%r15 adcq 8(%rdx),%r9 movq 64(%rsi),%rax adcq 16(%rdx),%r10 movq 72(%rsi),%rbx adcq 24(%rdx),%r11 movq 80(%rsi),%rbp adcq 32(%rdx),%r12 movq 88(%rsi),%rsi adcq 40(%rdx),%r13 movq %r8,0(%rdi) adcq 48(%rdx),%r14 movq %r9,8(%rdi) adcq 56(%rdx),%r15 movq %r10,16(%rdi) adcq 64(%rdx),%rax movq %r12,32(%rdi) movq %r14,%r8 adcq 72(%rdx),%rbx movq %r11,24(%rdi) movq %r15,%r9 adcq 80(%rdx),%rbp movq %r13,40(%rdi) movq %rax,%r10 adcq 88(%rdx),%rsi movq %rbx,%r11 sbbq %rdx,%rdx subq 0(%rcx),%r14 sbbq 8(%rcx),%r15 movq %rbp,%r12 sbbq 16(%rcx),%rax sbbq 24(%rcx),%rbx sbbq 32(%rcx),%rbp movq %rsi,%r13 sbbq 40(%rcx),%rsi sbbq $0,%rdx cmovcq %r8,%r14 cmovcq %r9,%r15 cmovcq %r10,%rax movq %r14,48(%rdi) cmovcq %r11,%rbx movq %r15,56(%rdi) cmovcq %r12,%rbp movq %rax,64(%rdi) cmovcq %r13,%rsi movq %rbx,72(%rdi) movq %rbp,80(%rdi) movq %rsi,88(%rdi) movq 8(%rsp),%r15 .cfi_restore %r15 movq 16(%rsp),%r14 .cfi_restore %r14 movq 24(%rsp),%r13 .cfi_restore %r13 movq 32(%rsp),%r12 .cfi_restore %r12 movq 40(%rsp),%rbx .cfi_restore %rbx movq 48(%rsp),%rbp .cfi_restore %rbp leaq 56(%rsp),%rsp .cfi_adjust_cfa_offset -56 #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .size add_mod_384x384,.-add_mod_384x384 .globl sub_mod_384x384 .hidden sub_mod_384x384 .type sub_mod_384x384,@function .align 32 sub_mod_384x384: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 pushq %rbx .cfi_adjust_cfa_offset 8 .cfi_offset %rbx,-24 pushq %r12 .cfi_adjust_cfa_offset 8 .cfi_offset %r12,-32 pushq %r13 .cfi_adjust_cfa_offset 8 .cfi_offset %r13,-40 pushq %r14 .cfi_adjust_cfa_offset 8 .cfi_offset %r14,-48 pushq %r15 .cfi_adjust_cfa_offset 8 .cfi_offset %r15,-56 subq $8,%rsp .cfi_adjust_cfa_offset 8 #ifdef __SGX_LVI_HARDENING__ lfence #endif movq 0(%rsi),%r8 movq 8(%rsi),%r9 movq 16(%rsi),%r10 movq 24(%rsi),%r11 movq 32(%rsi),%r12 movq 40(%rsi),%r13 movq 48(%rsi),%r14 subq 0(%rdx),%r8 movq 56(%rsi),%r15 sbbq 8(%rdx),%r9 movq 64(%rsi),%rax sbbq 16(%rdx),%r10 movq 72(%rsi),%rbx sbbq 24(%rdx),%r11 movq 80(%rsi),%rbp sbbq 32(%rdx),%r12 movq 88(%rsi),%rsi sbbq 40(%rdx),%r13 movq %r8,0(%rdi) sbbq 48(%rdx),%r14 movq 0(%rcx),%r8 movq %r9,8(%rdi) sbbq 56(%rdx),%r15 movq 8(%rcx),%r9 movq %r10,16(%rdi) sbbq 64(%rdx),%rax movq 16(%rcx),%r10 movq %r11,24(%rdi) sbbq 72(%rdx),%rbx movq 24(%rcx),%r11 movq %r12,32(%rdi) sbbq 80(%rdx),%rbp movq 32(%rcx),%r12 movq %r13,40(%rdi) sbbq 88(%rdx),%rsi movq 40(%rcx),%r13 sbbq 
%rdx,%rdx
    andq    %rdx,%r8
    andq    %rdx,%r9
    andq    %rdx,%r10
    andq    %rdx,%r11
    andq    %rdx,%r12
    andq    %rdx,%r13
    addq    %r8,%r14
    adcq    %r9,%r15
    movq    %r14,48(%rdi)
    adcq    %r10,%rax
    movq    %r15,56(%rdi)
    adcq    %r11,%rbx
    movq    %rax,64(%rdi)
    adcq    %r12,%rbp
    movq    %rbx,72(%rdi)
    adcq    %r13,%rsi
    movq    %rbp,80(%rdi)
    movq    %rsi,88(%rdi)
    movq    8(%rsp),%r15
.cfi_restore %r15
    movq    16(%rsp),%r14
.cfi_restore %r14
    movq    24(%rsp),%r13
.cfi_restore %r13
    movq    32(%rsp),%r12
.cfi_restore %r12
    movq    40(%rsp),%rbx
.cfi_restore %rbx
    movq    48(%rsp),%rbp
.cfi_restore %rbp
    leaq    56(%rsp),%rsp
.cfi_adjust_cfa_offset -56
#ifdef __SGX_LVI_HARDENING__
    popq    %rdx
    lfence
    jmpq    *%rdx
    ud2
#else
    .byte   0xf3,0xc3
#endif
.cfi_endproc
.size sub_mod_384x384,.-sub_mod_384x384
.section .note.GNU-stack,"",@progbits
#ifndef __SGX_LVI_HARDENING__
.section .note.gnu.property,"a",@note
    .long   4,2f-1f,5
    .byte   0x47,0x4E,0x55,0
1:  .long   0xc0000002,4,3
.align 8
2:
#endif

================================================
FILE: build/elf/ct_inverse_mod_256-armv8.S
================================================

#if defined(__ARM_FEATURE_PAC_DEFAULT) && __ARM_FEATURE_PAC_DEFAULT==2
# define PACI_HINT 27
# define AUTI_HINT 31
#else
# define PACI_HINT 25
# define AUTI_HINT 29
#endif

.text

.globl ct_inverse_mod_256
.hidden ct_inverse_mod_256
.type ct_inverse_mod_256, %function
.align 5
ct_inverse_mod_256:
    hint    #PACI_HINT
    stp     x29, x30, [sp,#-10*__SIZEOF_POINTER__]!
    add     x29, sp, #0
    stp     x19, x20, [sp,#2*__SIZEOF_POINTER__]
    stp     x21, x22, [sp,#4*__SIZEOF_POINTER__]
    stp     x23, x24, [sp,#6*__SIZEOF_POINTER__]
    stp     x25, x26, [sp,#8*__SIZEOF_POINTER__]
    sub     sp, sp, #1040
    ldp     x4, x5, [x1,#8*0]
    ldp     x6, x7, [x1,#8*2]
#ifdef __CHERI_PURE_CAPABILITY__
    add     x1,sp,#16+511
    alignd  c1,c1,#9
    scbnds  c1,c1,#512
#else
    add     x1, sp, #16+511 // find closest 512-byte-aligned spot
    and     x1, x1, #-512   // in the frame...
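// [Editorial note, not part of the original source] The 512-byte-aligned
// spot located above holds two |a|b|u|v| working sets laid out 256 bytes
// apart, so the code below swaps source and destination with a single
// `eor x1, x1, #256` instead of copying.  A rough C picture of one half
// (layout inferred from the 8*N offsets used below; illustrative only):
//
//	typedef struct { uint64_t a[4], b[4], u[6], v[6]; } abuv256;
//	/* dst_half == src_half ^ 256 */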
#endif str x0, [sp] // offload out_ptr ldp x8, x9, [x2,#8*0] ldp x10, x11, [x2,#8*2] stp x4, x5, [x1,#8*0] // copy input to |a| stp x6, x7, [x1,#8*2] stp x8, x9, [x1,#8*4] // copy modulus to |b| stp x10, x11, [x1,#8*6] ////////////////////////////////////////// first iteration bl .Lab_approximation_31_256_loaded eor x0, x1, #256 // pointer to dst |a|b|u|v| #ifdef __CHERI_PURE_CAPABILITY__ scvalue c0,c1,x0 #endif bl __smul_256_n_shift_by_31 str x12,[x0,#8*8] // initialize |u| with |f0| mov x12, x14 // |f1| mov x13, x15 // |g1| add x0,x0,#8*4 bl __smul_256_n_shift_by_31 str x12, [x0,#8*10] // initialize |v| with |f1| ////////////////////////////////////////// second iteration eor x1, x1, #256 // flip-flop src |a|b|u|v| #ifdef __CHERI_PURE_CAPABILITY__ scvalue c1,c0,x1 #endif bl __ab_approximation_31_256 eor x0, x1, #256 // pointer to dst |a|b|u|v| #ifdef __CHERI_PURE_CAPABILITY__ scvalue c0,c1,x0 #endif bl __smul_256_n_shift_by_31 mov x16, x12 // corrected |f0| mov x17, x13 // corrected |g0| mov x12, x14 // |f1| mov x13, x15 // |g1| add x0,x0,#8*4 bl __smul_256_n_shift_by_31 ldr x8, [x1,#8*8] // |u| ldr x9, [x1,#8*14] // |v| madd x4, x16, x8, xzr // |u|*|f0| madd x4, x17, x9, x4 // |v|*|g0| asr x5, x4, #63 // sign extension stp x4, x5, [x0,#8*4] stp x5, x5, [x0,#8*6] madd x4, x12, x8, xzr // |u|*|f1| madd x4, x13, x9, x4 // |v|*|g1| asr x5, x4, #63 // sign extension stp x4, x5, [x0,#8*10] stp x5, x5, [x0,#8*12] eor x1, x1, #256 // flip-flop src |a|b|u|v| #ifdef __CHERI_PURE_CAPABILITY__ scvalue c1,c0,x1 #endif bl __ab_approximation_31_256 eor x0, x1, #256 // pointer to dst |a|b|u|v| #ifdef __CHERI_PURE_CAPABILITY__ scvalue c0,c1,x0 #endif bl __smul_256_n_shift_by_31 mov x16, x12 // corrected |f0| mov x17, x13 // corrected |g0| mov x12, x14 // |f1| mov x13, x15 // |g1| add x0,x0,#8*4 bl __smul_256_n_shift_by_31 add x0,x0,#8*4 bl __smul_256x63 mov x16, x12 // corrected |f1| mov x17, x13 // corrected |g1| add x0,x0,#8*6 bl __smul_256x63 eor x1, x1, #256 // flip-flop src |a|b|u|v| #ifdef __CHERI_PURE_CAPABILITY__ scvalue c1,c0,x1 #endif bl __ab_approximation_31_256 eor x0, x1, #256 // pointer to dst |a|b|u|v| #ifdef __CHERI_PURE_CAPABILITY__ scvalue c0,c1,x0 #endif bl __smul_256_n_shift_by_31 mov x16, x12 // corrected |f0| mov x17, x13 // corrected |g0| mov x12, x14 // |f1| mov x13, x15 // |g1| add x0,x0,#8*4 bl __smul_256_n_shift_by_31 add x0,x0,#8*4 bl __smul_256x63 mov x16, x12 // corrected |f1| mov x17, x13 // corrected |g1| add x0,x0,#8*6 bl __smul_256x63 eor x1, x1, #256 // flip-flop src |a|b|u|v| #ifdef __CHERI_PURE_CAPABILITY__ scvalue c1,c0,x1 #endif bl __ab_approximation_31_256 eor x0, x1, #256 // pointer to dst |a|b|u|v| #ifdef __CHERI_PURE_CAPABILITY__ scvalue c0,c1,x0 #endif bl __smul_256_n_shift_by_31 mov x16, x12 // corrected |f0| mov x17, x13 // corrected |g0| mov x12, x14 // |f1| mov x13, x15 // |g1| add x0,x0,#8*4 bl __smul_256_n_shift_by_31 add x0,x0,#8*4 bl __smul_256x63 mov x16, x12 // corrected |f1| mov x17, x13 // corrected |g1| add x0,x0,#8*6 bl __smul_256x63 eor x1, x1, #256 // flip-flop src |a|b|u|v| #ifdef __CHERI_PURE_CAPABILITY__ scvalue c1,c0,x1 #endif bl __ab_approximation_31_256 eor x0, x1, #256 // pointer to dst |a|b|u|v| #ifdef __CHERI_PURE_CAPABILITY__ scvalue c0,c1,x0 #endif bl __smul_256_n_shift_by_31 mov x16, x12 // corrected |f0| mov x17, x13 // corrected |g0| mov x12, x14 // |f1| mov x13, x15 // |g1| add x0,x0,#8*4 bl __smul_256_n_shift_by_31 add x0,x0,#8*4 bl __smul_256x63 mov x16, x12 // corrected |f1| mov x17, x13 // corrected |g1| add x0,x0,#8*6 bl 
__smul_256x63 eor x1, x1, #256 // flip-flop src |a|b|u|v| #ifdef __CHERI_PURE_CAPABILITY__ scvalue c1,c0,x1 #endif bl __ab_approximation_31_256 eor x0, x1, #256 // pointer to dst |a|b|u|v| #ifdef __CHERI_PURE_CAPABILITY__ scvalue c0,c1,x0 #endif bl __smul_256_n_shift_by_31 mov x16, x12 // corrected |f0| mov x17, x13 // corrected |g0| mov x12, x14 // |f1| mov x13, x15 // |g1| add x0,x0,#8*4 bl __smul_256_n_shift_by_31 add x0,x0,#8*4 bl __smul_256x63 mov x16, x12 // corrected |f1| mov x17, x13 // corrected |g1| add x0,x0,#8*6 bl __smul_256x63 eor x1, x1, #256 // flip-flop src |a|b|u|v| #ifdef __CHERI_PURE_CAPABILITY__ scvalue c1,c0,x1 #endif bl __ab_approximation_31_256 eor x0, x1, #256 // pointer to dst |a|b|u|v| #ifdef __CHERI_PURE_CAPABILITY__ scvalue c0,c1,x0 #endif bl __smul_256_n_shift_by_31 mov x16, x12 // corrected |f0| mov x17, x13 // corrected |g0| mov x12, x14 // |f1| mov x13, x15 // |g1| add x0,x0,#8*4 bl __smul_256_n_shift_by_31 add x0,x0,#8*4 bl __smul_256x63 asr x24, x24, #63 str x24, [x0,#8*4] mov x16, x12 // corrected |f1| mov x17, x13 // corrected |g1| add x0,x0,#8*6 bl __smul_256x63 asr x24, x24, #63 // sign extension stp x24, x24, [x0,#8*4] stp x24, x24, [x0,#8*6] eor x1, x1, #256 // flip-flop src |a|b|u|v| #ifdef __CHERI_PURE_CAPABILITY__ scvalue c1,c0,x1 #endif bl __ab_approximation_31_256 eor x0, x1, #256 // pointer to dst |a|b|u|v| #ifdef __CHERI_PURE_CAPABILITY__ scvalue c0,c1,x0 #endif bl __smul_256_n_shift_by_31 mov x16, x12 // corrected |f0| mov x17, x13 // corrected |g0| mov x12, x14 // |f1| mov x13, x15 // |g1| add x0,x0,#8*4 bl __smul_256_n_shift_by_31 add x0,x0,#8*4 bl __smul_256x63 adc x22, x22, x23 str x22, [x0,#8*4] mov x16, x12 // corrected |f1| mov x17, x13 // corrected |g1| add x0,x0,#8*6 bl __smul_256x63 bl __smul_512x63_tail eor x1, x1, #256 // flip-flop src |a|b|u|v| #ifdef __CHERI_PURE_CAPABILITY__ scvalue c1,c0,x1 #endif bl __ab_approximation_31_256 eor x0, x1, #256 // pointer to dst |a|b|u|v| #ifdef __CHERI_PURE_CAPABILITY__ scvalue c0,c1,x0 #endif bl __smul_256_n_shift_by_31 mov x16, x12 // corrected |f0| mov x17, x13 // corrected |g0| mov x12, x14 // |f1| mov x13, x15 // |g1| add x0,x0,#8*4 bl __smul_256_n_shift_by_31 add x0,x0,#8*4 bl __smul_256x63 adc x22, x22, x23 str x22, [x0,#8*4] mov x16, x12 // corrected |f1| mov x17, x13 // corrected |g1| add x0,x0,#8*6 bl __smul_256x63 bl __smul_512x63_tail eor x1, x1, #256 // flip-flop src |a|b|u|v| #ifdef __CHERI_PURE_CAPABILITY__ scvalue c1,c0,x1 #endif bl __ab_approximation_31_256 eor x0, x1, #256 // pointer to dst |a|b|u|v| #ifdef __CHERI_PURE_CAPABILITY__ scvalue c0,c1,x0 #endif bl __smul_256_n_shift_by_31 mov x16, x12 // corrected |f0| mov x17, x13 // corrected |g0| mov x12, x14 // |f1| mov x13, x15 // |g1| add x0,x0,#8*4 bl __smul_256_n_shift_by_31 add x0,x0,#8*4 bl __smul_256x63 adc x22, x22, x23 str x22, [x0,#8*4] mov x16, x12 // corrected |f1| mov x17, x13 // corrected |g1| add x0,x0,#8*6 bl __smul_256x63 bl __smul_512x63_tail eor x1, x1, #256 // flip-flop src |a|b|u|v| #ifdef __CHERI_PURE_CAPABILITY__ scvalue c1,c0,x1 #endif bl __ab_approximation_31_256 eor x0, x1, #256 // pointer to dst |a|b|u|v| #ifdef __CHERI_PURE_CAPABILITY__ scvalue c0,c1,x0 #endif bl __smul_256_n_shift_by_31 mov x16, x12 // corrected |f0| mov x17, x13 // corrected |g0| mov x12, x14 // |f1| mov x13, x15 // |g1| add x0,x0,#8*4 bl __smul_256_n_shift_by_31 add x0,x0,#8*4 bl __smul_256x63 adc x22, x22, x23 str x22, [x0,#8*4] mov x16, x12 // corrected |f1| mov x17, x13 // corrected |g1| add x0,x0,#8*6 bl __smul_256x63 bl 
__smul_512x63_tail eor x1, x1, #256 // flip-flop src |a|b|u|v| #ifdef __CHERI_PURE_CAPABILITY__ scvalue c1,c0,x1 #endif bl __ab_approximation_31_256 eor x0, x1, #256 // pointer to dst |a|b|u|v| #ifdef __CHERI_PURE_CAPABILITY__ scvalue c0,c1,x0 #endif bl __smul_256_n_shift_by_31 mov x16, x12 // corrected |f0| mov x17, x13 // corrected |g0| mov x12, x14 // |f1| mov x13, x15 // |g1| add x0,x0,#8*4 bl __smul_256_n_shift_by_31 add x0,x0,#8*4 bl __smul_256x63 adc x22, x22, x23 str x22, [x0,#8*4] mov x16, x12 // corrected |f1| mov x17, x13 // corrected |g1| add x0,x0,#8*6 bl __smul_256x63 bl __smul_512x63_tail eor x1, x1, #256 // flip-flop src |a|b|u|v| #ifdef __CHERI_PURE_CAPABILITY__ scvalue c1,c0,x1 #endif bl __ab_approximation_31_256 eor x0, x1, #256 // pointer to dst |a|b|u|v| #ifdef __CHERI_PURE_CAPABILITY__ scvalue c0,c1,x0 #endif bl __smul_256_n_shift_by_31 mov x16, x12 // corrected |f0| mov x17, x13 // corrected |g0| mov x12, x14 // |f1| mov x13, x15 // |g1| add x0,x0,#8*4 bl __smul_256_n_shift_by_31 add x0,x0,#8*4 bl __smul_256x63 adc x22, x22, x23 str x22, [x0,#8*4] mov x16, x12 // corrected |f1| mov x17, x13 // corrected |g1| add x0,x0,#8*6 bl __smul_256x63 bl __smul_512x63_tail eor x1, x1, #256 // flip-flop src |a|b|u|v| #ifdef __CHERI_PURE_CAPABILITY__ scvalue c1,c0,x1 #endif bl __ab_approximation_31_256 eor x0, x1, #256 // pointer to dst |a|b|u|v| #ifdef __CHERI_PURE_CAPABILITY__ scvalue c0,c1,x0 #endif bl __smul_256_n_shift_by_31 mov x16, x12 // corrected |f0| mov x17, x13 // corrected |g0| mov x12, x14 // |f1| mov x13, x15 // |g1| add x0,x0,#8*4 bl __smul_256_n_shift_by_31 add x0,x0,#8*4 bl __smul_256x63 adc x22, x22, x23 str x22, [x0,#8*4] mov x16, x12 // corrected |f1| mov x17, x13 // corrected |g1| add x0,x0,#8*6 bl __smul_256x63 bl __smul_512x63_tail ////////////////////////////////////////// two[!] 
last iterations eor x1, x1, #256 // flip-flop src |a|b|u|v| #ifdef __CHERI_PURE_CAPABILITY__ scvalue c1,c0,x1 #endif mov x2, #47 // 31 + 512 % 31 //bl __ab_approximation_62_256 // |a| and |b| are exact, ldr x7, [x1,#8*0] // just load ldr x11, [x1,#8*4] bl __inner_loop_62_256 mov x16, x14 mov x17, x15 ldr x0, [sp] // original out_ptr bl __smul_256x63 bl __smul_512x63_tail ldr x30, [x29,#__SIZEOF_POINTER__] smulh x20, x7, x17 // figure out top-most limb ldp x8, x9, [x3,#8*0] adc x23, x23, x25 ldp x10, x11, [x3,#8*2] add x20, x20, x23 // x20 is 1, 0 or -1 asr x19, x20, #63 // sign as mask and x23, x8, x19 // add mod<<256 conditionally and x24, x9, x19 adds x4, x4, x23 and x25, x10, x19 adcs x5, x5, x24 and x26, x11, x19 adcs x6, x6, x25 adcs x7, x22, x26 adc x20, x20, xzr // x20 is 1, 0 or -1 neg x19, x20 orr x20, x20, x19 // excess bit or sign as mask asr x19, x19, #63 // excess bit as mask and x8, x8, x20 // mask |mod| and x9, x9, x20 and x10, x10, x20 and x11, x11, x20 eor x8, x8, x19 // conditionally negate |mod| eor x9, x9, x19 adds x8, x8, x19, lsr#63 eor x10, x10, x19 adcs x9, x9, xzr eor x11, x11, x19 adcs x10, x10, xzr adc x11, x11, xzr adds x4, x4, x8 // final adjustment for |mod|<<256 adcs x5, x5, x9 adcs x6, x6, x10 stp x4, x5, [x0,#8*4] adc x7, x7, x11 stp x6, x7, [x0,#8*6] add sp, sp, #1040 ldp x19, x20, [x29,#2*__SIZEOF_POINTER__] ldp x21, x22, [x29,#4*__SIZEOF_POINTER__] ldp x23, x24, [x29,#6*__SIZEOF_POINTER__] ldp x25, x26, [x29,#8*__SIZEOF_POINTER__] ldr x29, [sp],#10*__SIZEOF_POINTER__ hint #AUTI_HINT ret .size ct_inverse_mod_256,.-ct_inverse_mod_256 //////////////////////////////////////////////////////////////////////// .type __smul_256x63, %function .align 5 __smul_256x63: ldp x4, x5, [x1,#8*0+64] // load |u| (or |v|) asr x14, x16, #63 // |f_|'s sign as mask (or |g_|'s) ldp x6, x7, [x1,#8*2+64] eor x16, x16, x14 // conditionally negate |f_| (or |g_|) ldr x22, [x1,#8*4+64] eor x4, x4, x14 // conditionally negate |u| (or |v|) sub x16, x16, x14 eor x5, x5, x14 adds x4, x4, x14, lsr#63 eor x6, x6, x14 adcs x5, x5, xzr eor x7, x7, x14 adcs x6, x6, xzr eor x22, x22, x14 umulh x19, x4, x16 adcs x7, x7, xzr umulh x20, x5, x16 adcs x22, x22, xzr umulh x21, x6, x16 mul x4, x4, x16 cmp x16, #0 mul x5, x5, x16 csel x22, x22, xzr, ne mul x6, x6, x16 adds x5, x5, x19 mul x24, x7, x16 adcs x6, x6, x20 adcs x24, x24, x21 adc x26, xzr, xzr ldp x8, x9, [x1,#8*0+112] // load |u| (or |v|) asr x14, x17, #63 // |f_|'s sign as mask (or |g_|'s) ldp x10, x11, [x1,#8*2+112] eor x17, x17, x14 // conditionally negate |f_| (or |g_|) ldr x23, [x1,#8*4+112] eor x8, x8, x14 // conditionally negate |u| (or |v|) sub x17, x17, x14 eor x9, x9, x14 adds x8, x8, x14, lsr#63 eor x10, x10, x14 adcs x9, x9, xzr eor x11, x11, x14 adcs x10, x10, xzr eor x23, x23, x14 umulh x19, x8, x17 adcs x11, x11, xzr umulh x20, x9, x17 adcs x23, x23, xzr umulh x21, x10, x17 adc x15, xzr, xzr // used in __smul_512x63_tail mul x8, x8, x17 cmp x17, #0 mul x9, x9, x17 csel x23, x23, xzr, ne mul x10, x10, x17 adds x9, x9, x19 mul x25, x11, x17 adcs x10, x10, x20 adcs x25, x25, x21 adc x26, x26, xzr adds x4, x4, x8 adcs x5, x5, x9 adcs x6, x6, x10 stp x4, x5, [x0,#8*0] adcs x24, x24, x25 stp x6, x24, [x0,#8*2] ret .size __smul_256x63,.-__smul_256x63 .type __smul_512x63_tail, %function .align 5 __smul_512x63_tail: umulh x24, x7, x16 ldr x5, [x1,#8*19] // load rest of |v| adc x26, x26, xzr ldp x6, x7, [x1,#8*20] and x22, x22, x16 umulh x11, x11, x17 // resume |v|*|g1| chain sub x24, x24, x22 // tie up |u|*|f1| chain asr x25, x24, 
#63 eor x5, x5, x14 // conditionally negate rest of |v| eor x6, x6, x14 adds x5, x5, x15 eor x7, x7, x14 adcs x6, x6, xzr umulh x19, x23, x17 adc x7, x7, xzr umulh x20, x5, x17 add x11, x11, x26 umulh x21, x6, x17 mul x4, x23, x17 mul x5, x5, x17 adds x4, x4, x11 mul x6, x6, x17 adcs x5, x5, x19 mul x22, x7, x17 adcs x6, x6, x20 adcs x22, x22, x21 adc x23, xzr, xzr // used in the final step adds x4, x4, x24 adcs x5, x5, x25 adcs x6, x6, x25 stp x4, x5, [x0,#8*4] adcs x22, x22, x25 // carry is used in the final step stp x6, x22, [x0,#8*6] ret .size __smul_512x63_tail,.-__smul_512x63_tail .type __smul_256_n_shift_by_31, %function .align 5 __smul_256_n_shift_by_31: ldp x4, x5, [x1,#8*0+0] // load |a| (or |b|) asr x24, x12, #63 // |f0|'s sign as mask (or |g0|'s) ldp x6, x7, [x1,#8*2+0] eor x25, x12, x24 // conditionally negate |f0| (or |g0|) eor x4, x4, x24 // conditionally negate |a| (or |b|) sub x25, x25, x24 eor x5, x5, x24 adds x4, x4, x24, lsr#63 eor x6, x6, x24 adcs x5, x5, xzr eor x7, x7, x24 umulh x19, x4, x25 adcs x6, x6, xzr umulh x20, x5, x25 adc x7, x7, xzr umulh x21, x6, x25 and x24, x24, x25 umulh x22, x7, x25 neg x24, x24 mul x4, x4, x25 mul x5, x5, x25 mul x6, x6, x25 adds x5, x5, x19 mul x7, x7, x25 adcs x6, x6, x20 adcs x7, x7, x21 adc x22, x22, x24 ldp x8, x9, [x1,#8*0+32] // load |a| (or |b|) asr x24, x13, #63 // |f0|'s sign as mask (or |g0|'s) ldp x10, x11, [x1,#8*2+32] eor x25, x13, x24 // conditionally negate |f0| (or |g0|) eor x8, x8, x24 // conditionally negate |a| (or |b|) sub x25, x25, x24 eor x9, x9, x24 adds x8, x8, x24, lsr#63 eor x10, x10, x24 adcs x9, x9, xzr eor x11, x11, x24 umulh x19, x8, x25 adcs x10, x10, xzr umulh x20, x9, x25 adc x11, x11, xzr umulh x21, x10, x25 and x24, x24, x25 umulh x23, x11, x25 neg x24, x24 mul x8, x8, x25 mul x9, x9, x25 mul x10, x10, x25 adds x9, x9, x19 mul x11, x11, x25 adcs x10, x10, x20 adcs x11, x11, x21 adc x23, x23, x24 adds x4, x4, x8 adcs x5, x5, x9 adcs x6, x6, x10 adcs x7, x7, x11 adc x8, x22, x23 extr x4, x5, x4, #31 extr x5, x6, x5, #31 extr x6, x7, x6, #31 asr x23, x8, #63 // result's sign as mask extr x7, x8, x7, #31 eor x4, x4, x23 // ensure the result is positive eor x5, x5, x23 adds x4, x4, x23, lsr#63 eor x6, x6, x23 adcs x5, x5, xzr eor x7, x7, x23 adcs x6, x6, xzr stp x4, x5, [x0,#8*0] adc x7, x7, xzr stp x6, x7, [x0,#8*2] eor x12, x12, x23 // adjust |f/g| accordingly eor x13, x13, x23 sub x12, x12, x23 sub x13, x13, x23 ret .size __smul_256_n_shift_by_31,.-__smul_256_n_shift_by_31 .type __ab_approximation_31_256, %function .align 4 __ab_approximation_31_256: ldp x6, x7, [x1,#8*2] ldp x10, x11, [x1,#8*6] ldp x4, x5, [x1,#8*0] ldp x8, x9, [x1,#8*4] .Lab_approximation_31_256_loaded: orr x19, x7, x11 // check top-most limbs, ... cmp x19, #0 csel x7, x7, x6, ne csel x11, x11, x10, ne csel x6, x6, x5, ne orr x19, x7, x11 // and ones before top-most, ... csel x10, x10, x9, ne cmp x19, #0 csel x7, x7, x6, ne csel x11, x11, x10, ne csel x6, x6, x4, ne orr x19, x7, x11 // and one more, ... 
    csel    x10, x10, x8, ne
    clz     x19, x19
    cmp     x19, #64
    csel    x19, x19, xzr, ne
    csel    x7, x7, x6, ne
    csel    x11, x11, x10, ne
    neg     x20, x19
    lslv    x7, x7, x19     // align high limbs to the left
    lslv    x11, x11, x19
    lsrv    x6, x6, x20
    lsrv    x10, x10, x20
    and     x6, x6, x20, asr#6
    and     x10, x10, x20, asr#6
    orr     x7, x7, x6
    orr     x11, x11, x10
    bfxil   x7, x4, #0, #31
    bfxil   x11, x8, #0, #31
    b       __inner_loop_31_256
    ret
.size __ab_approximation_31_256,.-__ab_approximation_31_256

.type __inner_loop_31_256, %function
.align 4
__inner_loop_31_256:
    mov     x2, #31
    mov     x13, #0x7FFFFFFF80000000    // |f0|=1, |g0|=0
    mov     x15, #0x800000007FFFFFFF    // |f1|=0, |g1|=1
    mov     x23,#0x7FFFFFFF7FFFFFFF

.Loop_31_256:
    sbfx    x22, x7, #0, #1 // if |a_| is odd, then we'll be subtracting
    sub     x2, x2, #1
    and     x19, x11, x22
    sub     x20, x11, x7    // |b_|-|a_|
    subs    x21, x7, x19    // |a_|-|b_| (or |a_|-0 if |a_| was even)
    mov     x19, x15
    csel    x11, x11, x7, hs    // |b_| = |a_|
    csel    x7, x21, x20, hs    // borrow means |a_|<|b_|, replace with |b_|-|a_|
    csel    x15, x15, x13, hs   // exchange |fg0| and |fg1|
    csel    x13, x13, x19, hs
    lsr     x7, x7, #1
    and     x19, x15, x22
    and     x20, x23, x22
    sub     x13, x13, x19   // |f0|-=|f1| (or |f0-=0| if |a_| was even)
    add     x15, x15, x15   // |f1|<<=1
    add     x13, x13, x20
    sub     x15, x15, x23
    cbnz    x2, .Loop_31_256

    mov     x23, #0x7FFFFFFF
    ubfx    x12, x13, #0, #32
    ubfx    x13, x13, #32, #32
    ubfx    x14, x15, #0, #32
    ubfx    x15, x15, #32, #32
    sub     x12, x12, x23   // remove bias
    sub     x13, x13, x23
    sub     x14, x14, x23
    sub     x15, x15, x23
    ret
.size __inner_loop_31_256,.-__inner_loop_31_256

.type __inner_loop_62_256, %function
.align 4
__inner_loop_62_256:
    mov     x12, #1 // |f0|=1
    mov     x13, #0 // |g0|=0
    mov     x14, #0 // |f1|=0
    mov     x15, #1 // |g1|=1

.Loop_62_256:
    sbfx    x22, x7, #0, #1 // if |a_| is odd, then we'll be subtracting
    sub     x2, x2, #1
    and     x19, x11, x22
    sub     x20, x11, x7    // |b_|-|a_|
    subs    x21, x7, x19    // |a_|-|b_| (or |a_|-0 if |a_| was even)
    mov     x19, x12
    csel    x11, x11, x7, hs    // |b_| = |a_|
    csel    x7, x21, x20, hs    // borrow means |a_|<|b_|, replace with |b_|-|a_|
    mov     x20, x13
    csel    x12, x12, x14, hs   // exchange |f0| and |f1|
    csel    x14, x14, x19, hs
    csel    x13, x13, x15, hs   // exchange |g0| and |g1|
    csel    x15, x15, x20, hs
    lsr     x7, x7, #1
    and     x19, x14, x22
    and     x20, x15, x22
    add     x14, x14, x14   // |f1|<<=1
    add     x15, x15, x15   // |g1|<<=1
    sub     x12, x12, x19   // |f0|-=|f1| (or |f0-=0| if |a_| was even)
    sub     x13, x13, x20   // |g0|-=|g1| (or |g0-=0| ...)
cbnz x2, .Loop_62_256 ret .size __inner_loop_62_256,.-__inner_loop_62_256 #if defined(__ARM_FEATURE_BTI_DEFAULT) || defined(__ARM_FEATURE_PAC_DEFAULT) .section .note.GNU-stack,"",@progbits .section .note.gnu.property,"a",@note .long 4,2f-1f,5 .byte 0x47,0x4E,0x55,0 1: .long 0xc0000000,4,3 .align 3 2: #endif ================================================ FILE: build/elf/ct_inverse_mod_256-x86_64.s ================================================ .text .globl ct_inverse_mod_256 .hidden ct_inverse_mod_256 .type ct_inverse_mod_256,@function .align 32 ct_inverse_mod_256: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 pushq %rbx .cfi_adjust_cfa_offset 8 .cfi_offset %rbx,-24 pushq %r12 .cfi_adjust_cfa_offset 8 .cfi_offset %r12,-32 pushq %r13 .cfi_adjust_cfa_offset 8 .cfi_offset %r13,-40 pushq %r14 .cfi_adjust_cfa_offset 8 .cfi_offset %r14,-48 pushq %r15 .cfi_adjust_cfa_offset 8 .cfi_offset %r15,-56 subq $1072,%rsp .cfi_adjust_cfa_offset 1072 leaq 48+511(%rsp),%rax andq $-512,%rax movq %rdi,32(%rsp) movq %rcx,40(%rsp) #ifdef __SGX_LVI_HARDENING__ lfence #endif movq 0(%rsi),%r8 movq 8(%rsi),%r9 movq 16(%rsi),%r10 movq 24(%rsi),%r11 movq 0(%rdx),%r12 movq 8(%rdx),%r13 movq 16(%rdx),%r14 movq 24(%rdx),%r15 movq %r8,0(%rax) movq %r9,8(%rax) movq %r10,16(%rax) movq %r11,24(%rax) movq %r12,32(%rax) movq %r13,40(%rax) movq %r14,48(%rax) movq %r15,56(%rax) movq %rax,%rsi movl $31,%edx call __ab_approximation_31_256 movq %r12,16(%rsp) movq %r13,24(%rsp) movq $256,%rdi xorq %rsi,%rdi call __smulq_256_n_shift_by_31 movq %rdx,64(%rdi) movq 16(%rsp),%rdx movq 24(%rsp),%rcx leaq 32(%rdi),%rdi call __smulq_256_n_shift_by_31 movq %rdx,72(%rdi) xorq $256,%rsi movl $31,%edx call __ab_approximation_31_256 movq %r12,16(%rsp) movq %r13,24(%rsp) movq $256,%rdi xorq %rsi,%rdi call __smulq_256_n_shift_by_31 movq %rdx,0(%rsp) movq %rcx,8(%rsp) movq 16(%rsp),%rdx movq 24(%rsp),%rcx leaq 32(%rdi),%rdi call __smulq_256_n_shift_by_31 movq 64(%rsi),%r8 movq 104(%rsi),%r12 movq %r8,%r9 imulq 0(%rsp),%r8 movq %r12,%r13 imulq 8(%rsp),%r12 addq %r12,%r8 movq %r8,32(%rdi) sarq $63,%r8 movq %r8,40(%rdi) movq %r8,48(%rdi) movq %r8,56(%rdi) movq %r8,64(%rdi) leaq 64(%rsi),%rsi imulq %rdx,%r9 imulq %rcx,%r13 addq %r13,%r9 movq %r9,72(%rdi) sarq $63,%r9 movq %r9,80(%rdi) movq %r9,88(%rdi) movq %r9,96(%rdi) movq %r9,104(%rdi) xorq $256+64,%rsi movl $31,%edx call __ab_approximation_31_256 movq %r12,16(%rsp) movq %r13,24(%rsp) movq $256,%rdi xorq %rsi,%rdi call __smulq_256_n_shift_by_31 movq %rdx,0(%rsp) movq %rcx,8(%rsp) movq 16(%rsp),%rdx movq 24(%rsp),%rcx leaq 32(%rdi),%rdi call __smulq_256_n_shift_by_31 movq %rdx,16(%rsp) movq %rcx,24(%rsp) movq 0(%rsp),%rdx movq 8(%rsp),%rcx leaq 64(%rsi),%rsi leaq 32(%rdi),%rdi call __smulq_256x63 movq 16(%rsp),%rdx movq 24(%rsp),%rcx leaq 40(%rdi),%rdi call __smulq_256x63 xorq $256+64,%rsi movl $31,%edx call __ab_approximation_31_256 movq %r12,16(%rsp) movq %r13,24(%rsp) movq $256,%rdi xorq %rsi,%rdi call __smulq_256_n_shift_by_31 movq %rdx,0(%rsp) movq %rcx,8(%rsp) movq 16(%rsp),%rdx movq 24(%rsp),%rcx leaq 32(%rdi),%rdi call __smulq_256_n_shift_by_31 movq %rdx,16(%rsp) movq %rcx,24(%rsp) movq 0(%rsp),%rdx movq 8(%rsp),%rcx leaq 64(%rsi),%rsi leaq 32(%rdi),%rdi call __smulq_256x63 movq 16(%rsp),%rdx movq 24(%rsp),%rcx leaq 40(%rdi),%rdi call __smulq_256x63 xorq $256+64,%rsi movl $31,%edx call __ab_approximation_31_256 movq %r12,16(%rsp) movq %r13,24(%rsp) movq $256,%rdi xorq %rsi,%rdi call __smulq_256_n_shift_by_31 movq %rdx,0(%rsp) movq 
%rcx,8(%rsp) movq 16(%rsp),%rdx movq 24(%rsp),%rcx leaq 32(%rdi),%rdi call __smulq_256_n_shift_by_31 movq %rdx,16(%rsp) movq %rcx,24(%rsp) movq 0(%rsp),%rdx movq 8(%rsp),%rcx leaq 64(%rsi),%rsi leaq 32(%rdi),%rdi call __smulq_256x63 movq 16(%rsp),%rdx movq 24(%rsp),%rcx leaq 40(%rdi),%rdi call __smulq_256x63 xorq $256+64,%rsi movl $31,%edx call __ab_approximation_31_256 movq %r12,16(%rsp) movq %r13,24(%rsp) movq $256,%rdi xorq %rsi,%rdi call __smulq_256_n_shift_by_31 movq %rdx,0(%rsp) movq %rcx,8(%rsp) movq 16(%rsp),%rdx movq 24(%rsp),%rcx leaq 32(%rdi),%rdi call __smulq_256_n_shift_by_31 movq %rdx,16(%rsp) movq %rcx,24(%rsp) movq 0(%rsp),%rdx movq 8(%rsp),%rcx leaq 64(%rsi),%rsi leaq 32(%rdi),%rdi call __smulq_256x63 movq 16(%rsp),%rdx movq 24(%rsp),%rcx leaq 40(%rdi),%rdi call __smulq_256x63 xorq $256+64,%rsi movl $31,%edx call __ab_approximation_31_256 movq %r12,16(%rsp) movq %r13,24(%rsp) movq $256,%rdi xorq %rsi,%rdi call __smulq_256_n_shift_by_31 movq %rdx,0(%rsp) movq %rcx,8(%rsp) movq 16(%rsp),%rdx movq 24(%rsp),%rcx leaq 32(%rdi),%rdi call __smulq_256_n_shift_by_31 movq %rdx,16(%rsp) movq %rcx,24(%rsp) movq 0(%rsp),%rdx movq 8(%rsp),%rcx leaq 64(%rsi),%rsi leaq 32(%rdi),%rdi call __smulq_256x63 movq 16(%rsp),%rdx movq 24(%rsp),%rcx leaq 40(%rdi),%rdi call __smulq_256x63 xorq $256+64,%rsi movl $31,%edx call __ab_approximation_31_256 movq %r12,16(%rsp) movq %r13,24(%rsp) movq $256,%rdi xorq %rsi,%rdi call __smulq_256_n_shift_by_31 movq %rdx,0(%rsp) movq %rcx,8(%rsp) movq 16(%rsp),%rdx movq 24(%rsp),%rcx leaq 32(%rdi),%rdi call __smulq_256_n_shift_by_31 movq %rdx,16(%rsp) movq %rcx,24(%rsp) movq 0(%rsp),%rdx movq 8(%rsp),%rcx leaq 64(%rsi),%rsi leaq 32(%rdi),%rdi call __smulq_256x63 movq 16(%rsp),%rdx movq 24(%rsp),%rcx leaq 40(%rdi),%rdi call __smulq_256x63 xorq $256+64,%rsi movl $31,%edx call __ab_approximation_31_256 movq %r12,16(%rsp) movq %r13,24(%rsp) movq $256,%rdi xorq %rsi,%rdi call __smulq_256_n_shift_by_31 movq %rdx,0(%rsp) movq %rcx,8(%rsp) movq 16(%rsp),%rdx movq 24(%rsp),%rcx leaq 32(%rdi),%rdi call __smulq_256_n_shift_by_31 movq %rdx,16(%rsp) movq %rcx,24(%rsp) movq 0(%rsp),%rdx movq 8(%rsp),%rcx leaq 64(%rsi),%rsi leaq 32(%rdi),%rdi call __smulq_256x63 movq 16(%rsp),%rdx movq 24(%rsp),%rcx leaq 40(%rdi),%rdi call __smulq_256x63 sarq $63,%rbp movq %rbp,40(%rdi) movq %rbp,48(%rdi) movq %rbp,56(%rdi) xorq $256+64,%rsi movl $31,%edx call __ab_approximation_31_256 movq %r12,16(%rsp) movq %r13,24(%rsp) movq $256,%rdi xorq %rsi,%rdi call __smulq_256_n_shift_by_31 movq %rdx,0(%rsp) movq %rcx,8(%rsp) movq 16(%rsp),%rdx movq 24(%rsp),%rcx leaq 32(%rdi),%rdi call __smulq_256_n_shift_by_31 movq %rdx,16(%rsp) movq %rcx,24(%rsp) movq 0(%rsp),%rdx movq 8(%rsp),%rcx leaq 64(%rsi),%rsi leaq 32(%rdi),%rdi call __smulq_256x63 movq 16(%rsp),%rdx movq 24(%rsp),%rcx leaq 40(%rdi),%rdi call __smulq_512x63 xorq $256+64,%rsi movl $31,%edx call __ab_approximation_31_256 movq %r12,16(%rsp) movq %r13,24(%rsp) movq $256,%rdi xorq %rsi,%rdi call __smulq_256_n_shift_by_31 movq %rdx,0(%rsp) movq %rcx,8(%rsp) movq 16(%rsp),%rdx movq 24(%rsp),%rcx leaq 32(%rdi),%rdi call __smulq_256_n_shift_by_31 movq %rdx,16(%rsp) movq %rcx,24(%rsp) movq 0(%rsp),%rdx movq 8(%rsp),%rcx leaq 64(%rsi),%rsi leaq 32(%rdi),%rdi call __smulq_256x63 movq 16(%rsp),%rdx movq 24(%rsp),%rcx leaq 40(%rdi),%rdi call __smulq_512x63 xorq $256+64,%rsi movl $31,%edx call __ab_approximation_31_256 movq %r12,16(%rsp) movq %r13,24(%rsp) movq $256,%rdi xorq %rsi,%rdi call __smulq_256_n_shift_by_31 movq %rdx,0(%rsp) movq %rcx,8(%rsp) movq 
16(%rsp),%rdx movq 24(%rsp),%rcx leaq 32(%rdi),%rdi call __smulq_256_n_shift_by_31 movq %rdx,16(%rsp) movq %rcx,24(%rsp) movq 0(%rsp),%rdx movq 8(%rsp),%rcx leaq 64(%rsi),%rsi leaq 32(%rdi),%rdi call __smulq_256x63 movq 16(%rsp),%rdx movq 24(%rsp),%rcx leaq 40(%rdi),%rdi call __smulq_512x63 xorq $256+64,%rsi movl $31,%edx call __ab_approximation_31_256 movq %r12,16(%rsp) movq %r13,24(%rsp) movq $256,%rdi xorq %rsi,%rdi call __smulq_256_n_shift_by_31 movq %rdx,0(%rsp) movq %rcx,8(%rsp) movq 16(%rsp),%rdx movq 24(%rsp),%rcx leaq 32(%rdi),%rdi call __smulq_256_n_shift_by_31 movq %rdx,16(%rsp) movq %rcx,24(%rsp) movq 0(%rsp),%rdx movq 8(%rsp),%rcx leaq 64(%rsi),%rsi leaq 32(%rdi),%rdi call __smulq_256x63 movq 16(%rsp),%rdx movq 24(%rsp),%rcx leaq 40(%rdi),%rdi call __smulq_512x63 xorq $256+64,%rsi movl $31,%edx call __ab_approximation_31_256 movq %r12,16(%rsp) movq %r13,24(%rsp) movq $256,%rdi xorq %rsi,%rdi call __smulq_256_n_shift_by_31 movq %rdx,0(%rsp) movq %rcx,8(%rsp) movq 16(%rsp),%rdx movq 24(%rsp),%rcx leaq 32(%rdi),%rdi call __smulq_256_n_shift_by_31 movq %rdx,16(%rsp) movq %rcx,24(%rsp) movq 0(%rsp),%rdx movq 8(%rsp),%rcx leaq 64(%rsi),%rsi leaq 32(%rdi),%rdi call __smulq_256x63 movq 16(%rsp),%rdx movq 24(%rsp),%rcx leaq 40(%rdi),%rdi call __smulq_512x63 xorq $256+64,%rsi movl $31,%edx call __ab_approximation_31_256 movq %r12,16(%rsp) movq %r13,24(%rsp) movq $256,%rdi xorq %rsi,%rdi call __smulq_256_n_shift_by_31 movq %rdx,0(%rsp) movq %rcx,8(%rsp) movq 16(%rsp),%rdx movq 24(%rsp),%rcx leaq 32(%rdi),%rdi call __smulq_256_n_shift_by_31 movq %rdx,16(%rsp) movq %rcx,24(%rsp) movq 0(%rsp),%rdx movq 8(%rsp),%rcx leaq 64(%rsi),%rsi leaq 32(%rdi),%rdi call __smulq_256x63 movq 16(%rsp),%rdx movq 24(%rsp),%rcx leaq 40(%rdi),%rdi call __smulq_512x63 xorq $256+64,%rsi movl $47,%edx movq 0(%rsi),%r8 movq 32(%rsi),%r10 call __inner_loop_62_256 leaq 64(%rsi),%rsi movq %r12,%rdx movq %r13,%rcx movq 32(%rsp),%rdi call __smulq_512x63 adcq %rbp,%rdx movq 40(%rsp),%rsi movq %rdx,%rax sarq $63,%rdx movq %rdx,%r8 movq %rdx,%r9 #ifdef __SGX_LVI_HARDENING__ lfence #endif andq 0(%rsi),%r8 movq %rdx,%r10 andq 8(%rsi),%r9 andq 16(%rsi),%r10 andq 24(%rsi),%rdx addq %r8,%r12 adcq %r9,%r13 adcq %r10,%r14 adcq %rdx,%r15 adcq $0,%rax movq %rax,%rdx negq %rax orq %rax,%rdx sarq $63,%rax movq %rdx,%r8 movq %rdx,%r9 andq 0(%rsi),%r8 movq %rdx,%r10 andq 8(%rsi),%r9 andq 16(%rsi),%r10 andq 24(%rsi),%rdx xorq %rax,%r8 xorq %rcx,%rcx xorq %rax,%r9 subq %rax,%rcx xorq %rax,%r10 xorq %rax,%rdx addq %rcx,%r8 adcq $0,%r9 adcq $0,%r10 adcq $0,%rdx addq %r8,%r12 adcq %r9,%r13 adcq %r10,%r14 adcq %rdx,%r15 movq %r12,32(%rdi) movq %r13,40(%rdi) movq %r14,48(%rdi) movq %r15,56(%rdi) leaq 1072(%rsp),%r8 movq 0(%r8),%r15 .cfi_restore %r15 movq 8(%r8),%r14 .cfi_restore %r14 movq 16(%r8),%r13 .cfi_restore %r13 movq 24(%r8),%r12 .cfi_restore %r12 movq 32(%r8),%rbx .cfi_restore %rbx movq 40(%r8),%rbp .cfi_restore %rbp leaq 48(%r8),%rsp .cfi_adjust_cfa_offset -1072-8*6 #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .size ct_inverse_mod_256,.-ct_inverse_mod_256 .type __smulq_512x63,@function .align 32 __smulq_512x63: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa movq 0(%rsi),%r8 movq 8(%rsi),%r9 movq 16(%rsi),%r10 movq 24(%rsi),%r11 movq 32(%rsi),%rbp movq %rdx,%rbx sarq $63,%rdx xorq %rax,%rax subq %rdx,%rax xorq %rdx,%rbx addq %rax,%rbx xorq %rdx,%r8 xorq %rdx,%r9 xorq %rdx,%r10 xorq %rdx,%r11 xorq %rdx,%rbp addq %r8,%rax adcq $0,%r9 adcq $0,%r10 adcq $0,%r11 adcq $0,%rbp mulq %rbx 
movq %rax,0(%rdi) movq %r9,%rax movq %rdx,%r9 mulq %rbx addq %rax,%r9 movq %r10,%rax adcq $0,%rdx movq %r9,8(%rdi) movq %rdx,%r10 mulq %rbx addq %rax,%r10 movq %r11,%rax adcq $0,%rdx movq %r10,16(%rdi) movq %rdx,%r11 andq %rbx,%rbp negq %rbp mulq %rbx addq %rax,%r11 adcq %rdx,%rbp movq %r11,24(%rdi) movq 40(%rsi),%r8 movq 48(%rsi),%r9 movq 56(%rsi),%r10 movq 64(%rsi),%r11 movq 72(%rsi),%r12 movq 80(%rsi),%r13 movq 88(%rsi),%r14 movq 96(%rsi),%r15 movq %rcx,%rdx sarq $63,%rdx xorq %rax,%rax subq %rdx,%rax xorq %rdx,%rcx addq %rax,%rcx xorq %rdx,%r8 xorq %rdx,%r9 xorq %rdx,%r10 xorq %rdx,%r11 xorq %rdx,%r12 xorq %rdx,%r13 xorq %rdx,%r14 xorq %rdx,%r15 addq %r8,%rax adcq $0,%r9 adcq $0,%r10 adcq $0,%r11 adcq $0,%r12 adcq $0,%r13 adcq $0,%r14 adcq $0,%r15 mulq %rcx movq %rax,%r8 movq %r9,%rax movq %rdx,%r9 mulq %rcx addq %rax,%r9 movq %r10,%rax adcq $0,%rdx movq %rdx,%r10 mulq %rcx addq %rax,%r10 movq %r11,%rax adcq $0,%rdx movq %rdx,%r11 mulq %rcx addq %rax,%r11 movq %r12,%rax adcq $0,%rdx movq %rdx,%r12 mulq %rcx addq %rax,%r12 movq %r13,%rax adcq $0,%rdx movq %rdx,%r13 mulq %rcx addq %rax,%r13 movq %r14,%rax adcq $0,%rdx movq %rdx,%r14 mulq %rcx addq %rax,%r14 movq %r15,%rax adcq $0,%rdx movq %rdx,%r15 imulq %rcx addq %rax,%r15 adcq $0,%rdx movq %rbp,%rbx sarq $63,%rbp addq 0(%rdi),%r8 adcq 8(%rdi),%r9 adcq 16(%rdi),%r10 adcq 24(%rdi),%r11 adcq %rbx,%r12 adcq %rbp,%r13 adcq %rbp,%r14 adcq %rbp,%r15 movq %r8,0(%rdi) movq %r9,8(%rdi) movq %r10,16(%rdi) movq %r11,24(%rdi) movq %r12,32(%rdi) movq %r13,40(%rdi) movq %r14,48(%rdi) movq %r15,56(%rdi) #ifdef __SGX_LVI_HARDENING__ popq %r8 lfence jmpq *%r8 ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .size __smulq_512x63,.-__smulq_512x63 .type __smulq_256x63,@function .align 32 __smulq_256x63: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa movq 0+0(%rsi),%r8 movq 0+8(%rsi),%r9 movq 0+16(%rsi),%r10 movq 0+24(%rsi),%r11 movq 0+32(%rsi),%rbp movq %rdx,%rbx sarq $63,%rdx xorq %rax,%rax subq %rdx,%rax xorq %rdx,%rbx addq %rax,%rbx xorq %rdx,%r8 xorq %rdx,%r9 xorq %rdx,%r10 xorq %rdx,%r11 xorq %rdx,%rbp addq %r8,%rax adcq $0,%r9 adcq $0,%r10 adcq $0,%r11 adcq $0,%rbp mulq %rbx movq %rax,%r8 movq %r9,%rax movq %rdx,%r9 mulq %rbx addq %rax,%r9 movq %r10,%rax adcq $0,%rdx movq %rdx,%r10 mulq %rbx addq %rax,%r10 movq %r11,%rax adcq $0,%rdx movq %rdx,%r11 andq %rbx,%rbp negq %rbp mulq %rbx addq %rax,%r11 adcq %rdx,%rbp movq %rcx,%rdx movq 40+0(%rsi),%r12 movq 40+8(%rsi),%r13 movq 40+16(%rsi),%r14 movq 40+24(%rsi),%r15 movq 40+32(%rsi),%rcx movq %rdx,%rbx sarq $63,%rdx xorq %rax,%rax subq %rdx,%rax xorq %rdx,%rbx addq %rax,%rbx xorq %rdx,%r12 xorq %rdx,%r13 xorq %rdx,%r14 xorq %rdx,%r15 xorq %rdx,%rcx addq %r12,%rax adcq $0,%r13 adcq $0,%r14 adcq $0,%r15 adcq $0,%rcx mulq %rbx movq %rax,%r12 movq %r13,%rax movq %rdx,%r13 mulq %rbx addq %rax,%r13 movq %r14,%rax adcq $0,%rdx movq %rdx,%r14 mulq %rbx addq %rax,%r14 movq %r15,%rax adcq $0,%rdx movq %rdx,%r15 andq %rbx,%rcx negq %rcx mulq %rbx addq %rax,%r15 adcq %rdx,%rcx addq %r12,%r8 adcq %r13,%r9 adcq %r14,%r10 adcq %r15,%r11 adcq %rcx,%rbp movq %r8,0(%rdi) movq %r9,8(%rdi) movq %r10,16(%rdi) movq %r11,24(%rdi) movq %rbp,32(%rdi) #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .size __smulq_256x63,.-__smulq_256x63 .type __smulq_256_n_shift_by_31,@function .align 32 __smulq_256_n_shift_by_31: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa movq %rdx,0(%rdi) movq %rcx,8(%rdi) movq %rdx,%rbp movq 0+0(%rsi),%r8 movq 0+8(%rsi),%r9 movq 0+16(%rsi),%r10 movq 0+24(%rsi),%r11 
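/*
 * [Editorial note, not part of the original source] The sarq/xorq/addq
 * cluster that follows is the branch-free conditional negation used by
 * all of these __smulq_* helpers: sarq $63 smears the factor's sign bit
 * into a mask m (0 or all-ones), and (x ^ m) - m replaces the factor and
 * the just-loaded limbs by their absolute values exactly when the factor
 * was negative (the multi-limb form carries the "+1" with adcq).  A
 * single-word C sketch (illustrative):
 *
 *	uint64_t m = (uint64_t)((int64_t)f >> 63);  // 0 or all-ones
 *	uint64_t abs_f = (f ^ m) - m;               // f < 0 ? -f : f
 */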
movq %rbp,%rbx sarq $63,%rbp xorq %rax,%rax subq %rbp,%rax xorq %rbp,%rbx addq %rax,%rbx xorq %rbp,%r8 xorq %rbp,%r9 xorq %rbp,%r10 xorq %rbp,%r11 addq %r8,%rax adcq $0,%r9 adcq $0,%r10 adcq $0,%r11 mulq %rbx movq %rax,%r8 movq %r9,%rax andq %rbx,%rbp negq %rbp movq %rdx,%r9 mulq %rbx addq %rax,%r9 movq %r10,%rax adcq $0,%rdx movq %rdx,%r10 mulq %rbx addq %rax,%r10 movq %r11,%rax adcq $0,%rdx movq %rdx,%r11 mulq %rbx addq %rax,%r11 adcq %rdx,%rbp movq 32+0(%rsi),%r12 movq 32+8(%rsi),%r13 movq 32+16(%rsi),%r14 movq 32+24(%rsi),%r15 movq %rcx,%rbx sarq $63,%rcx xorq %rax,%rax subq %rcx,%rax xorq %rcx,%rbx addq %rax,%rbx xorq %rcx,%r12 xorq %rcx,%r13 xorq %rcx,%r14 xorq %rcx,%r15 addq %r12,%rax adcq $0,%r13 adcq $0,%r14 adcq $0,%r15 mulq %rbx movq %rax,%r12 movq %r13,%rax andq %rbx,%rcx negq %rcx movq %rdx,%r13 mulq %rbx addq %rax,%r13 movq %r14,%rax adcq $0,%rdx movq %rdx,%r14 mulq %rbx addq %rax,%r14 movq %r15,%rax adcq $0,%rdx movq %rdx,%r15 mulq %rbx addq %rax,%r15 adcq %rdx,%rcx addq %r12,%r8 adcq %r13,%r9 adcq %r14,%r10 adcq %r15,%r11 adcq %rcx,%rbp movq 0(%rdi),%rdx movq 8(%rdi),%rcx shrdq $31,%r9,%r8 shrdq $31,%r10,%r9 shrdq $31,%r11,%r10 shrdq $31,%rbp,%r11 sarq $63,%rbp xorq %rax,%rax subq %rbp,%rax xorq %rbp,%r8 xorq %rbp,%r9 xorq %rbp,%r10 xorq %rbp,%r11 addq %rax,%r8 adcq $0,%r9 adcq $0,%r10 adcq $0,%r11 movq %r8,0(%rdi) movq %r9,8(%rdi) movq %r10,16(%rdi) movq %r11,24(%rdi) xorq %rbp,%rdx xorq %rbp,%rcx addq %rax,%rdx addq %rax,%rcx #ifdef __SGX_LVI_HARDENING__ popq %r8 lfence jmpq *%r8 ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .size __smulq_256_n_shift_by_31,.-__smulq_256_n_shift_by_31 .type __ab_approximation_31_256,@function .align 32 __ab_approximation_31_256: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa movq 24(%rsi),%r9 movq 56(%rsi),%r11 movq 16(%rsi),%rbx movq 48(%rsi),%rbp movq 8(%rsi),%r8 movq 40(%rsi),%r10 movq %r9,%rax orq %r11,%rax cmovzq %rbx,%r9 cmovzq %rbp,%r11 cmovzq %r8,%rbx movq 0(%rsi),%r8 cmovzq %r10,%rbp movq 32(%rsi),%r10 movq %r9,%rax orq %r11,%rax cmovzq %rbx,%r9 cmovzq %rbp,%r11 cmovzq %r8,%rbx cmovzq %r10,%rbp movq %r9,%rax orq %r11,%rax bsrq %rax,%rcx leaq 1(%rcx),%rcx cmovzq %r8,%r9 cmovzq %r10,%r11 cmovzq %rax,%rcx negq %rcx shldq %cl,%rbx,%r9 shldq %cl,%rbp,%r11 movl $0x7FFFFFFF,%eax andq %rax,%r8 andq %rax,%r10 notq %rax andq %rax,%r9 andq %rax,%r11 orq %r9,%r8 orq %r11,%r10 jmp __inner_loop_31_256 #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .size __ab_approximation_31_256,.-__ab_approximation_31_256 .type __inner_loop_31_256,@function .align 32 __inner_loop_31_256: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa movq $0x7FFFFFFF80000000,%rcx movq $0x800000007FFFFFFF,%r13 movq $0x7FFFFFFF7FFFFFFF,%r15 .Loop_31_256: cmpq %r10,%r8 movq %r8,%rax movq %r10,%rbx movq %rcx,%rbp movq %r13,%r14 cmovbq %r10,%r8 cmovbq %rax,%r10 cmovbq %r13,%rcx cmovbq %rbp,%r13 subq %r10,%r8 subq %r13,%rcx addq %r15,%rcx testq $1,%rax cmovzq %rax,%r8 cmovzq %rbx,%r10 cmovzq %rbp,%rcx cmovzq %r14,%r13 shrq $1,%r8 addq %r13,%r13 subq %r15,%r13 subl $1,%edx jnz .Loop_31_256 shrq $32,%r15 movl %ecx,%edx movl %r13d,%r12d shrq $32,%rcx shrq $32,%r13 subq %r15,%rdx subq %r15,%rcx subq %r15,%r12 subq %r15,%r13 #ifdef __SGX_LVI_HARDENING__ popq %r8 lfence jmpq *%r8 ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .size __inner_loop_31_256,.-__inner_loop_31_256 .type __inner_loop_62_256,@function .align 32 __inner_loop_62_256: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa movl %edx,%r15d movq $1,%rdx xorq %rcx,%rcx xorq %r12,%r12 movq 
%rdx,%r13 movq %rdx,%r14 .Loop_62_256: xorq %rax,%rax testq %r14,%r8 movq %r10,%rbx cmovnzq %r10,%rax subq %r8,%rbx movq %r8,%rbp subq %rax,%r8 cmovcq %rbx,%r8 cmovcq %rbp,%r10 movq %rdx,%rax cmovcq %r12,%rdx cmovcq %rax,%r12 movq %rcx,%rbx cmovcq %r13,%rcx cmovcq %rbx,%r13 xorq %rax,%rax xorq %rbx,%rbx shrq $1,%r8 testq %r14,%rbp cmovnzq %r12,%rax cmovnzq %r13,%rbx addq %r12,%r12 addq %r13,%r13 subq %rax,%rdx subq %rbx,%rcx subl $1,%r15d jnz .Loop_62_256 #ifdef __SGX_LVI_HARDENING__ popq %r8 lfence jmpq *%r8 ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .size __inner_loop_62_256,.-__inner_loop_62_256 .section .note.GNU-stack,"",@progbits #ifndef __SGX_LVI_HARDENING__ .section .note.gnu.property,"a",@note .long 4,2f-1f,5 .byte 0x47,0x4E,0x55,0 1: .long 0xc0000002,4,3 .align 8 2: #endif ================================================ FILE: build/elf/ct_inverse_mod_384-armv8.S ================================================ #if defined(__ARM_FEATURE_PAC_DEFAULT) && __ARM_FEATURE_PAC_DEFAULT==2 # define PACI_HINT 27 # define AUTI_HINT 31 #else # define PACI_HINT 25 # define AUTI_HINT 29 #endif .text .globl ct_inverse_mod_384 .hidden ct_inverse_mod_384 .type ct_inverse_mod_384, %function .align 5 ct_inverse_mod_384: hint #PACI_HINT stp x29, x30, [sp,#-16*__SIZEOF_POINTER__]! add x29, sp, #0 stp x19, x20, [sp,#2*__SIZEOF_POINTER__] stp x21, x22, [sp,#4*__SIZEOF_POINTER__] stp x23, x24, [sp,#6*__SIZEOF_POINTER__] stp x25, x26, [sp,#8*__SIZEOF_POINTER__] stp x27, x28, [sp,#10*__SIZEOF_POINTER__] sub sp, sp, #1056 ldp x22, x4, [x1,#8*0] ldp x5, x6, [x1,#8*2] ldp x7, x8, [x1,#8*4] #ifdef __CHERI_PURE_CAPABILITY__ add x1,sp,#32+511 alignd c1,c1,#9 scbnds c1,c1,#512 #else add x1, sp, #32+511 // find closest 512-byte-aligned spot and x1, x1, #-512 // in the frame... 
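// [Editorial note, not part of the original source] As in the 256-bit
// routine earlier in this extract, ct_inverse_mod_384 is a fixed-budget,
// constant-time extended GCD in the Bernstein-Yang style: each outer
// round takes 62 divsteps on a top-limb approximation of |a| and |b|
// (__ab_approximation_62), applies the resulting 64-bit factors to the
// full-width values via __smul_384_n_shift_by_62 and
// __smul_384x63/__smul_768x63_tail, and the last round runs the exact
// remaining 768 % 62 = 24 steps.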
#endif stp x0, x3, [sp] // offload out_ptr, nx_ptr ldp x9, x10, [x2,#8*0] ldp x11, x12, [x2,#8*2] ldp x13, x14, [x2,#8*4] stp x22, x4, [x1,#8*0] // copy input to |a| stp x5, x6, [x1,#8*2] stp x7, x8, [x1,#8*4] stp x9, x10, [x1,#8*6] // copy modulus to |b| stp x11, x12, [x1,#8*8] stp x13, x14, [x1,#8*10] ////////////////////////////////////////// first iteration mov x2, #62 bl .Lab_approximation_62_loaded eor x0, x1, #256 // pointer to dst |a|b|u|v| #ifdef __CHERI_PURE_CAPABILITY__ scvalue c0,c1,x0 #endif bl __smul_384_n_shift_by_62 str x15,[x0,#8*12] // initialize |u| with |f0| mov x15, x17 // |f1| mov x16, x19 // |g1| add x0,x0,#8*6 bl __smul_384_n_shift_by_62 str x15, [x0,#8*14] // initialize |v| with |f1| ////////////////////////////////////////// second iteration eor x1, x1, #256 // flip-flop src |a|b|u|v| #ifdef __CHERI_PURE_CAPABILITY__ scvalue c1,c0,x1 #endif mov x2, #62 bl __ab_approximation_62 eor x0, x1, #256 // pointer to dst |a|b|u|v| #ifdef __CHERI_PURE_CAPABILITY__ scvalue c0,c1,x0 #endif bl __smul_384_n_shift_by_62 mov x20, x15 // corrected |f0| mov x21, x16 // corrected |g0| mov x15, x17 // |f1| mov x16, x19 // |g1| add x0,x0,#8*6 bl __smul_384_n_shift_by_62 ldr x7, [x1,#8*12] // |u| ldr x8, [x1,#8*20] // |v| mul x3, x20, x7 // |u|*|f0| smulh x4, x20, x7 mul x5, x21, x8 // |v|*|g0| smulh x6, x21, x8 adds x3, x3, x5 adc x4, x4, x6 stp x3, x4, [x0,#8*6] asr x5, x4, #63 // sign extension stp x5, x5, [x0,#8*8] stp x5, x5, [x0,#8*10] mul x3, x15, x7 // |u|*|f1| smulh x4, x15, x7 mul x5, x16, x8 // |v|*|g1| smulh x6, x16, x8 adds x3, x3, x5 adc x4, x4, x6 stp x3, x4, [x0,#8*14] asr x5, x4, #63 // sign extension stp x5, x5, [x0,#8*16] stp x5, x5, [x0,#8*18] eor x1, x1, #256 // flip-flop src |a|b|u|v| #ifdef __CHERI_PURE_CAPABILITY__ scvalue c1,c0,x1 #endif mov x2, #62 bl __ab_approximation_62 eor x0, x1, #256 // pointer to dst |a|b|u|v| #ifdef __CHERI_PURE_CAPABILITY__ scvalue c0,c1,x0 #endif bl __smul_384_n_shift_by_62 mov x20, x15 // corrected |f0| mov x21, x16 // corrected |g0| mov x15, x17 // |f1| mov x16, x19 // |g1| add x0,x0,#8*6 bl __smul_384_n_shift_by_62 add x0,x0,#8*6 bl __smul_384x63 mov x20, x15 // corrected |f1| mov x21, x16 // corrected |g1| add x0,x0,#8*8 bl __smul_384x63 eor x1, x1, #256 // flip-flop src |a|b|u|v| #ifdef __CHERI_PURE_CAPABILITY__ scvalue c1,c0,x1 #endif mov x2, #62 bl __ab_approximation_62 eor x0, x1, #256 // pointer to dst |a|b|u|v| #ifdef __CHERI_PURE_CAPABILITY__ scvalue c0,c1,x0 #endif bl __smul_384_n_shift_by_62 mov x20, x15 // corrected |f0| mov x21, x16 // corrected |g0| mov x15, x17 // |f1| mov x16, x19 // |g1| add x0,x0,#8*6 bl __smul_384_n_shift_by_62 add x0,x0,#8*6 bl __smul_384x63 mov x20, x15 // corrected |f1| mov x21, x16 // corrected |g1| add x0,x0,#8*8 bl __smul_384x63 eor x1, x1, #256 // flip-flop src |a|b|u|v| #ifdef __CHERI_PURE_CAPABILITY__ scvalue c1,c0,x1 #endif mov x2, #62 bl __ab_approximation_62 eor x0, x1, #256 // pointer to dst |a|b|u|v| #ifdef __CHERI_PURE_CAPABILITY__ scvalue c0,c1,x0 #endif bl __smul_384_n_shift_by_62 mov x20, x15 // corrected |f0| mov x21, x16 // corrected |g0| mov x15, x17 // |f1| mov x16, x19 // |g1| add x0,x0,#8*6 bl __smul_384_n_shift_by_62 add x0,x0,#8*6 bl __smul_384x63 mov x20, x15 // corrected |f1| mov x21, x16 // corrected |g1| add x0,x0,#8*8 bl __smul_384x63 eor x1, x1, #256 // flip-flop src |a|b|u|v| #ifdef __CHERI_PURE_CAPABILITY__ scvalue c1,c0,x1 #endif mov x2, #62 bl __ab_approximation_62 eor x0, x1, #256 // pointer to dst |a|b|u|v| #ifdef __CHERI_PURE_CAPABILITY__ scvalue c0,c1,x0 
#endif bl __smul_384_n_shift_by_62 mov x20, x15 // corrected |f0| mov x21, x16 // corrected |g0| mov x15, x17 // |f1| mov x16, x19 // |g1| add x0,x0,#8*6 bl __smul_384_n_shift_by_62 add x0,x0,#8*6 bl __smul_384x63 asr x27, x27, #63 str x27, [x0,#8*6] mov x20, x15 // corrected |f1| mov x21, x16 // corrected |g1| add x0,x0,#8*8 bl __smul_384x63 asr x27, x27, #63 // sign extension stp x27, x27, [x0,#8*6] stp x27, x27, [x0,#8*8] stp x27, x27, [x0,#8*10] eor x1, x1, #256 // flip-flop src |a|b|u|v| #ifdef __CHERI_PURE_CAPABILITY__ scvalue c1,c0,x1 #endif mov x2, #62 bl __ab_approximation_62 eor x0, x1, #256 // pointer to dst |a|b|u|v| #ifdef __CHERI_PURE_CAPABILITY__ scvalue c0,c1,x0 #endif bl __smul_384_n_shift_by_62 mov x20, x15 // corrected |f0| mov x21, x16 // corrected |g0| mov x15, x17 // |f1| mov x16, x19 // |g1| add x0,x0,#8*6 bl __smul_384_n_shift_by_62 add x0,x0,#8*6 bl __smul_384x63 adc x25, x25, x26 str x25, [x0,#8*6] mov x20, x15 // corrected |f1| mov x21, x16 // corrected |g1| add x0,x0,#8*8 bl __smul_384x63 bl __smul_768x63_tail eor x1, x1, #256 // flip-flop src |a|b|u|v| #ifdef __CHERI_PURE_CAPABILITY__ scvalue c1,c0,x1 #endif mov x2, #62 bl __ab_approximation_62 eor x0, x1, #256 // pointer to dst |a|b|u|v| #ifdef __CHERI_PURE_CAPABILITY__ scvalue c0,c1,x0 #endif bl __smul_384_n_shift_by_62 mov x20, x15 // corrected |f0| mov x21, x16 // corrected |g0| mov x15, x17 // |f1| mov x16, x19 // |g1| add x0,x0,#8*6 bl __smul_384_n_shift_by_62 add x0,x0,#8*6 bl __smul_384x63 adc x25, x25, x26 str x25, [x0,#8*6] mov x20, x15 // corrected |f1| mov x21, x16 // corrected |g1| add x0,x0,#8*8 bl __smul_384x63 bl __smul_768x63_tail eor x1, x1, #256 // flip-flop src |a|b|u|v| #ifdef __CHERI_PURE_CAPABILITY__ scvalue c1,c0,x1 #endif mov x2, #62 bl __ab_approximation_62 eor x0, x1, #256 // pointer to dst |a|b|u|v| #ifdef __CHERI_PURE_CAPABILITY__ scvalue c0,c1,x0 #endif bl __smul_384_n_shift_by_62 mov x20, x15 // corrected |f0| mov x21, x16 // corrected |g0| mov x15, x17 // |f1| mov x16, x19 // |g1| add x0,x0,#8*6 bl __smul_384_n_shift_by_62 add x0,x0,#8*6 bl __smul_384x63 adc x25, x25, x26 str x25, [x0,#8*6] mov x20, x15 // corrected |f1| mov x21, x16 // corrected |g1| add x0,x0,#8*8 bl __smul_384x63 bl __smul_768x63_tail eor x1, x1, #256 // flip-flop src |a|b|u|v| #ifdef __CHERI_PURE_CAPABILITY__ scvalue c1,c0,x1 #endif mov x2, #62 bl __ab_approximation_62 eor x0, x1, #256 // pointer to dst |a|b|u|v| #ifdef __CHERI_PURE_CAPABILITY__ scvalue c0,c1,x0 #endif bl __smul_384_n_shift_by_62 mov x20, x15 // corrected |f0| mov x21, x16 // corrected |g0| mov x15, x17 // |f1| mov x16, x19 // |g1| add x0,x0,#8*6 bl __smul_384_n_shift_by_62 add x0,x0,#8*6 bl __smul_384x63 adc x25, x25, x26 str x25, [x0,#8*6] mov x20, x15 // corrected |f1| mov x21, x16 // corrected |g1| add x0,x0,#8*8 bl __smul_384x63 bl __smul_768x63_tail eor x1, x1, #256 // flip-flop src |a|b|u|v| #ifdef __CHERI_PURE_CAPABILITY__ scvalue c1,c0,x1 #endif mov x2, #62 bl __ab_approximation_62 eor x0, x1, #256 // pointer to dst |a|b|u|v| #ifdef __CHERI_PURE_CAPABILITY__ scvalue c0,c1,x0 #endif bl __smul_384_n_shift_by_62 mov x20, x15 // corrected |f0| mov x21, x16 // corrected |g0| mov x15, x17 // |f1| mov x16, x19 // |g1| add x0,x0,#8*6 bl __smul_384_n_shift_by_62 add x0,x0,#8*6 bl __smul_384x63 adc x25, x25, x26 str x25, [x0,#8*6] mov x20, x15 // corrected |f1| mov x21, x16 // corrected |g1| add x0,x0,#8*8 bl __smul_384x63 bl __smul_768x63_tail ////////////////////////////////////////// iteration before last eor x1, x1, #256 // flip-flop src 
|a|b|u|v| #ifdef __CHERI_PURE_CAPABILITY__ scvalue c1,c0,x1 #endif mov x2, #62 //bl __ab_approximation_62 // |a| and |b| are exact, ldp x3, x8, [x1,#8*0] // just load ldp x9, x14, [x1,#8*6] bl __inner_loop_62 eor x0, x1, #256 // pointer to dst |a|b|u|v| #ifdef __CHERI_PURE_CAPABILITY__ scvalue c0,c1,x0 #endif str x3, [x0,#8*0] str x9, [x0,#8*6] mov x20, x15 // exact |f0| mov x21, x16 // exact |g0| mov x15, x17 mov x16, x19 add x0,x0,#8*12 bl __smul_384x63 adc x25, x25, x26 str x25, [x0,#8*6] mov x20, x15 // exact |f1| mov x21, x16 // exact |g1| add x0,x0,#8*8 bl __smul_384x63 bl __smul_768x63_tail ////////////////////////////////////////// last iteration eor x1, x1, #256 // flip-flop src |a|b|u|v| #ifdef __CHERI_PURE_CAPABILITY__ scvalue c1,c0,x1 #endif mov x2, #24 // 768 % 62 //bl __ab_approximation_62 // |a| and |b| are exact, ldr x3, [x1,#8*0] // just load eor x8, x8, x8 ldr x9, [x1,#8*6] eor x14, x14, x14 bl __inner_loop_62 mov x20, x17 mov x21, x19 ldp x0, x15, [sp] // original out_ptr and n_ptr bl __smul_384x63 bl __smul_768x63_tail ldr x30, [x29,#__SIZEOF_POINTER__] smulh x23, x8, x21 // figure out top-most limb adc x26, x26, x28 ldp x9, x10, [x15,#8*0] // load |mod| add x23, x23, x26 // x23 is 1, 0 or -1 ldp x11, x12, [x15,#8*2] asr x22, x23, #63 // sign as mask ldp x13, x14, [x15,#8*4] and x26, x9, x22 // add mod<<384 conditionally and x27, x10, x22 adds x3, x3, x26 and x28, x11, x22 adcs x4, x4, x27 and x2, x12, x22 adcs x5, x5, x28 and x26, x13, x22 adcs x6, x6, x2 and x27, x14, x22 adcs x7, x7, x26 adcs x8, x25, x27 adc x23, x23, xzr // x23 is 1, 0 or -1 neg x22, x23 orr x23, x23, x22 // excess bit or sign as mask asr x22, x22, #63 // excess bit as mask and x9, x9, x23 // mask |mod| and x10, x10, x23 and x11, x11, x23 and x12, x12, x23 and x13, x13, x23 and x14, x14, x23 eor x9, x9, x22 // conditionally negate |mod| eor x10, x10, x22 adds x9, x9, x22, lsr#63 eor x11, x11, x22 adcs x10, x10, xzr eor x12, x12, x22 adcs x11, x11, xzr eor x13, x13, x22 adcs x12, x12, xzr eor x14, x14, x22 adcs x13, x13, xzr adc x14, x14, xzr adds x3, x3, x9 // final adjustment for |mod|<<384 adcs x4, x4, x10 adcs x5, x5, x11 adcs x6, x6, x12 stp x3, x4, [x0,#8*6] adcs x7, x7, x13 stp x5, x6, [x0,#8*8] adc x8, x8, x14 stp x7, x8, [x0,#8*10] add sp, sp, #1056 ldp x19, x20, [x29,#2*__SIZEOF_POINTER__] ldp x21, x22, [x29,#4*__SIZEOF_POINTER__] ldp x23, x24, [x29,#6*__SIZEOF_POINTER__] ldp x25, x26, [x29,#8*__SIZEOF_POINTER__] ldp x27, x28, [x29,#10*__SIZEOF_POINTER__] ldr x29, [sp],#16*__SIZEOF_POINTER__ hint #AUTI_HINT ret .size ct_inverse_mod_384,.-ct_inverse_mod_384 //////////////////////////////////////////////////////////////////////// // see corresponding commentary in ctx_inverse_mod_384-x86_64... 
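// [Editorial note, not part of the original source] __smul_384x63 below
// computes the |u|*|f_| + |v|*|g_| update over six 64-bit limbs, first
// reducing the signed ~63-bit factors and their operands to magnitudes
// with the sign-mask trick so the instruction stream never depends on
// the signs.  A simplified C sketch of the accumulation, assuming f and
// g have already been made non-negative (< 2^63) and ignoring the extra
// partial limb handled by the real code (names illustrative; uses the
// GCC/Clang unsigned __int128 extension):
//
//	static void smul_model(uint64_t out[6], const uint64_t u[6],
//	                       uint64_t f, const uint64_t v[6], uint64_t g)
//	{
//	    unsigned __int128 acc = 0;
//	    for (size_t i = 0; i < 6; i++) {
//	        acc += (unsigned __int128)u[i]*f + (unsigned __int128)v[i]*g;
//	        out[i] = (uint64_t)acc;     // low limb of the running sum
//	        acc >>= 64;                 // carry into the next limb
//	    }
//	}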
.type __smul_384x63, %function .align 5 __smul_384x63: ldp x3, x4, [x1,#8*0+96] // load |u| (or |v|) asr x17, x20, #63 // |f_|'s sign as mask (or |g_|'s) ldp x5, x6, [x1,#8*2+96] eor x20, x20, x17 // conditionally negate |f_| (or |g_|) ldp x7, x8, [x1,#8*4+96] eor x3, x3, x17 // conditionally negate |u| (or |v|) ldr x25, [x1,#8*6+96] sub x20, x20, x17 eor x4, x4, x17 adds x3, x3, x17, lsr#63 eor x5, x5, x17 adcs x4, x4, xzr eor x6, x6, x17 adcs x5, x5, xzr eor x7, x7, x17 adcs x6, x6, xzr umulh x22, x3, x20 eor x8, x8, x17 umulh x23, x4, x20 adcs x7, x7, xzr umulh x24, x5, x20 eor x25, x25, x17 mul x3, x3, x20 adcs x8, x8, xzr mul x4, x4, x20 adcs x25, x25, xzr cmp x20, #0 mul x5, x5, x20 csel x25, x25, xzr, ne adds x4, x4, x22 umulh x22, x6, x20 adcs x5, x5, x23 umulh x23, x7, x20 mul x6, x6, x20 mul x7, x7, x20 adcs x6, x6, x24 mul x27,x8, x20 adcs x7, x7, x22 adcs x27,x27,x23 adc x2, xzr, xzr ldp x9, x10, [x1,#8*0+160] // load |u| (or |v|) asr x17, x21, #63 // |f_|'s sign as mask (or |g_|'s) ldp x11, x12, [x1,#8*2+160] eor x21, x21, x17 // conditionally negate |f_| (or |g_|) ldp x13, x14, [x1,#8*4+160] eor x9, x9, x17 // conditionally negate |u| (or |v|) ldr x26, [x1,#8*6+160] sub x21, x21, x17 eor x10, x10, x17 adds x9, x9, x17, lsr#63 eor x11, x11, x17 adcs x10, x10, xzr eor x12, x12, x17 adcs x11, x11, xzr eor x13, x13, x17 adcs x12, x12, xzr umulh x22, x9, x21 eor x14, x14, x17 umulh x23, x10, x21 adcs x13, x13, xzr umulh x24, x11, x21 eor x26, x26, x17 mul x9, x9, x21 adcs x14, x14, xzr mul x10, x10, x21 adcs x26, x26, xzr adc x19, xzr, xzr // used in __smul_768x63_tail cmp x21, #0 mul x11, x11, x21 csel x26, x26, xzr, ne adds x10, x10, x22 umulh x22, x12, x21 adcs x11, x11, x23 umulh x23, x13, x21 mul x12, x12, x21 mul x13, x13, x21 adcs x12, x12, x24 mul x28,x14, x21 adcs x13, x13, x22 adcs x28,x28,x23 adc x2, x2, xzr adds x3, x3, x9 adcs x4, x4, x10 adcs x5, x5, x11 adcs x6, x6, x12 stp x3, x4, [x0,#8*0] adcs x7, x7, x13 stp x5, x6, [x0,#8*2] adcs x27, x27, x28 stp x7, x27, [x0,#8*4] ret .size __smul_384x63,.-__smul_384x63 .type __smul_768x63_tail, %function .align 5 __smul_768x63_tail: umulh x27, x8, x20 ldr x4, [x1,#8*27]// load rest of |v| adc x2, x2, xzr ldp x5, x6, [x1,#8*28] and x25, x25, x20 ldp x7, x8, [x1,#8*30] sub x27, x27, x25 // tie up |u|*|f1| chain umulh x14, x14, x21 // resume |v|*|g1| chain eor x4, x4, x17 // conditionally negate rest of |v| eor x5, x5, x17 eor x6, x6, x17 adds x4, x4, x19 eor x7, x7, x17 adcs x5, x5, xzr eor x8, x8, x17 adcs x6, x6, xzr umulh x22, x26, x21 adcs x7, x7, xzr umulh x23, x4, x21 adc x8, x8, xzr umulh x24, x5, x21 add x14, x14, x2 umulh x25, x6, x21 asr x28, x27, #63 umulh x2, x7, x21 mul x3, x26, x21 mul x4, x4, x21 mul x5, x5, x21 adds x3, x3, x14 mul x6, x6, x21 adcs x4, x4, x22 mul x7, x7, x21 adcs x5, x5, x23 mul x22, x8, x21 adcs x6, x6, x24 adcs x7, x7, x25 adcs x25, x22, x2 adc x26, xzr, xzr // used in the final step adds x3, x3, x27 adcs x4, x4, x28 adcs x5, x5, x28 adcs x6, x6, x28 stp x3, x4, [x0,#8*6] adcs x7, x7, x28 stp x5, x6, [x0,#8*8] adcs x25, x25, x28 // carry is used in the final step stp x7, x25, [x0,#8*10] ret .size __smul_768x63_tail,.-__smul_768x63_tail .type __smul_384_n_shift_by_62, %function .align 5 __smul_384_n_shift_by_62: ldp x3, x4, [x1,#8*0+0] // load |a| (or |b|) asr x28, x15, #63 // |f0|'s sign as mask (or |g0|'s) ldp x5, x6, [x1,#8*2+0] eor x2, x15, x28 // conditionally negate |f0| (or |g0|) ldp x7, x8, [x1,#8*4+0] eor x3, x3, x28 // conditionally negate |a| (or |b|) sub x2, x2, x28 eor x4, x4, 
x28 adds x3, x3, x28, lsr#63 eor x5, x5, x28 adcs x4, x4, xzr eor x6, x6, x28 adcs x5, x5, xzr eor x7, x7, x28 umulh x22, x3, x2 adcs x6, x6, xzr umulh x23, x4, x2 eor x8, x8, x28 mul x3, x3, x2 adcs x7, x7, xzr mul x4, x4, x2 adc x8, x8, xzr umulh x24, x5, x2 and x28, x28, x2 umulh x25, x6, x2 adds x4, x4, x22 mul x5, x5, x2 umulh x22, x7, x2 neg x28, x28 mul x6, x6, x2 adcs x5, x5, x23 umulh x23, x8, x2 mul x7, x7, x2 adcs x6, x6, x24 mul x8, x8, x2 adcs x7, x7, x25 adcs x8, x8, x22 adc x27, x23, x28 ldp x9, x10, [x1,#8*0+48] // load |a| (or |b|) asr x28, x16, #63 // |f0|'s sign as mask (or |g0|'s) ldp x11, x12, [x1,#8*2+48] eor x2, x16, x28 // conditionally negate |f0| (or |g0|) ldp x13, x14, [x1,#8*4+48] eor x9, x9, x28 // conditionally negate |a| (or |b|) sub x2, x2, x28 eor x10, x10, x28 adds x9, x9, x28, lsr#63 eor x11, x11, x28 adcs x10, x10, xzr eor x12, x12, x28 adcs x11, x11, xzr eor x13, x13, x28 umulh x22, x9, x2 adcs x12, x12, xzr umulh x23, x10, x2 eor x14, x14, x28 mul x9, x9, x2 adcs x13, x13, xzr mul x10, x10, x2 adc x14, x14, xzr umulh x24, x11, x2 and x28, x28, x2 umulh x25, x12, x2 adds x10, x10, x22 mul x11, x11, x2 umulh x22, x13, x2 neg x28, x28 mul x12, x12, x2 adcs x11, x11, x23 umulh x23, x14, x2 mul x13, x13, x2 adcs x12, x12, x24 mul x14, x14, x2 adcs x13, x13, x25 adcs x14, x14, x22 adc x28, x23, x28 adds x3, x3, x9 adcs x4, x4, x10 adcs x5, x5, x11 adcs x6, x6, x12 adcs x7, x7, x13 adcs x8, x8, x14 adc x9, x27, x28 extr x3, x4, x3, #62 extr x4, x5, x4, #62 extr x5, x6, x5, #62 asr x28, x9, #63 extr x6, x7, x6, #62 extr x7, x8, x7, #62 extr x8, x9, x8, #62 eor x3, x3, x28 eor x4, x4, x28 adds x3, x3, x28, lsr#63 eor x5, x5, x28 adcs x4, x4, xzr eor x6, x6, x28 adcs x5, x5, xzr eor x7, x7, x28 adcs x6, x6, xzr eor x8, x8, x28 stp x3, x4, [x0,#8*0] adcs x7, x7, xzr stp x5, x6, [x0,#8*2] adc x8, x8, xzr stp x7, x8, [x0,#8*4] eor x15, x15, x28 eor x16, x16, x28 sub x15, x15, x28 sub x16, x16, x28 ret .size __smul_384_n_shift_by_62,.-__smul_384_n_shift_by_62 .type __ab_approximation_62, %function .align 4 __ab_approximation_62: ldp x7, x8, [x1,#8*4] ldp x13, x14, [x1,#8*10] ldp x5, x6, [x1,#8*2] ldp x11, x12, [x1,#8*8] .Lab_approximation_62_loaded: orr x22, x8, x14 // check top-most limbs, ... cmp x22, #0 csel x8, x8, x7, ne csel x14, x14, x13, ne csel x7, x7, x6, ne orr x22, x8, x14 // ... ones before top-most, ... csel x13, x13, x12, ne ldp x3, x4, [x1,#8*0] ldp x9, x10, [x1,#8*6] cmp x22, #0 csel x8, x8, x7, ne csel x14, x14, x13, ne csel x7, x7, x5, ne orr x22, x8, x14 // ... and ones before that ... 
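    // (The csel cascade around this point selects, in constant time, the
    // highest limb pair at which |a| or |b| is still non-zero; that pair
    // is then left-aligned below so that the inner loop can work on a
    // 64-bit approximation of the top bits, while the exact bottom limbs
    // are carried alongside.)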
    csel    x13, x13, x11, ne
    cmp     x22, #0
    csel    x8, x8, x7, ne
    csel    x14, x14, x13, ne
    csel    x7, x7, x4, ne
    orr     x22, x8, x14
    csel    x13, x13, x10, ne
    clz     x22, x22
    cmp     x22, #64
    csel    x22, x22, xzr, ne
    csel    x8, x8, x7, ne
    csel    x14, x14, x13, ne
    neg     x23, x22
    lslv    x8, x8, x22                     // align high limbs to the left
    lslv    x14, x14, x22
    lsrv    x7, x7, x23
    lsrv    x13, x13, x23
    and     x7, x7, x23, asr#6
    and     x13, x13, x23, asr#6
    orr     x8, x8, x7
    orr     x14, x14, x13
    b       __inner_loop_62
    ret
.size __ab_approximation_62,.-__ab_approximation_62
.type __inner_loop_62, %function
.align 4
__inner_loop_62:
    mov     x15, #1                         // |f0|=1
    mov     x16, #0                         // |g0|=0
    mov     x17, #0                         // |f1|=0
    mov     x19, #1                         // |g1|=1
.Loop_62:
    sbfx    x28, x3, #0, #1                 // if |a_| is odd, then we'll be subtracting
    sub     x2, x2, #1
    subs    x24, x9, x3                     // |b_|-|a_|
    and     x22, x9, x28
    sbc     x25, x14, x8
    and     x23, x14, x28
    subs    x26, x3, x22                    // |a_|-|b_| (or |a_|-0 if |a_| was even)
    mov     x22, x15
    sbcs    x27, x8, x23
    mov     x23, x16
    csel    x9, x9, x3, hs                  // |b_| = |a_|
    csel    x14, x14, x8, hs
    csel    x3, x26, x24, hs                // borrow means |a_|<|b_|, replace with |b_|-|a_|
    csel    x8, x27, x25, hs
    csel    x15, x15, x17, hs               // exchange |f0| and |f1|
    csel    x17, x17, x22, hs
    csel    x16, x16, x19, hs               // exchange |g0| and |g1|
    csel    x19, x19, x23, hs
    extr    x3, x8, x3, #1
    lsr     x8, x8, #1
    and     x22, x17, x28
    and     x23, x19, x28
    add     x17, x17, x17                   // |f1|<<=1
    add     x19, x19, x19                   // |g1|<<=1
    sub     x15, x15, x22                   // |f0|-=|f1| (or |f0-=0| if |a_| was even)
    sub     x16, x16, x23                   // |g0|-=|g1| (or |g0-=0| ...)
    cbnz    x2, .Loop_62
    ret
.size __inner_loop_62,.-__inner_loop_62
#if defined(__ARM_FEATURE_BTI_DEFAULT) || defined(__ARM_FEATURE_PAC_DEFAULT)
.section .note.GNU-stack,"",@progbits
.section .note.gnu.property,"a",@note
.long 4,2f-1f,5
.byte 0x47,0x4E,0x55,0
1: .long 0xc0000000,4,3
.align 3
2:
#endif

================================================
FILE: build/elf/ct_is_square_mod_384-armv8.S
================================================
#if defined(__ARM_FEATURE_PAC_DEFAULT) && __ARM_FEATURE_PAC_DEFAULT==2
# define PACI_HINT 27
# define AUTI_HINT 31
#else
# define PACI_HINT 25
# define AUTI_HINT 29
#endif
.text

.globl ct_is_square_mod_384
.hidden ct_is_square_mod_384
.type ct_is_square_mod_384, %function
.align 5
ct_is_square_mod_384:
    hint    #PACI_HINT
    stp     x29, x30, [sp,#-16*__SIZEOF_POINTER__]!
    add     x29, sp, #0
    stp     x19, x20, [sp,#2*__SIZEOF_POINTER__]
    stp     x21, x22, [sp,#4*__SIZEOF_POINTER__]
    stp     x23, x24, [sp,#6*__SIZEOF_POINTER__]
    stp     x25, x26, [sp,#8*__SIZEOF_POINTER__]
    stp     x27, x28, [sp,#10*__SIZEOF_POINTER__]
    sub     sp, sp, #512
    ldp     x3, x4, [x0,#8*0]               // load input
    ldp     x5, x6, [x0,#8*2]
    ldp     x7, x8, [x0,#8*4]
    add     x0, sp, #255                    // find closest 256-byte-aligned spot
    and     x0, x0, #-256                   // in the frame...
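////////////////////////////////////////////////////////////////////////
// What follows computes the Legendre symbol (input|mod) with a binary-
// GCD-style loop, tracking the symbol's sign in the parity of |L| (x2).
// A rough single-limb C model of the bookkeeping (illustrative
// assumption only; the real code runs on 384-bit values in fixed time):
//
//	int legendre_is_square(uint64_t a, uint64_t b)   /* b odd */
//	{
//	    uint64_t L = 0;
//	    while (a) {
//	        if (a & 1) {
//	            if (a < b) {                 /* swap flips the symbol */
//	                uint64_t t = a;          /* iff a = b = 3 (mod 4) */
//	                a = b, b = t;
//	                L += (a & b) >> 1;
//	            }
//	            a -= b;
//	        }
//	        a >>= 1;
//	        L += (b + 2) >> 2;   /* (2|b) = -1 iff b%8 is 3 or 5 */
//	    }
//	    return (~L) & 1;         /* 1 iff a quadratic residue */
//	}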
#ifdef __CHERI_PURE_CAPABILITY__ scvalue c0,csp,x0 #endif ldp x9, x10, [x1,#8*0] // load modulus ldp x11, x12, [x1,#8*2] ldp x13, x14, [x1,#8*4] stp x3, x4, [x0,#8*6] // copy input to |a| stp x5, x6, [x0,#8*8] stp x7, x8, [x0,#8*10] stp x9, x10, [x0,#8*0] // copy modulus to |b| stp x11, x12, [x0,#8*2] stp x13, x14, [x0,#8*4] eor x2, x2, x2 // init the .Legendre symbol mov x15, #24 // 24 is 768/30-1 b .Loop_is_square .align 4 .Loop_is_square: bl __ab_approximation_30 sub x15, x15, #1 eor x1, x0, #128 // pointer to dst |b| #ifdef __CHERI_PURE_CAPABILITY__ scvalue c1,csp,x1 #endif bl __smul_384_n_shift_by_30 mov x19, x16 // |f0| mov x20, x17 // |g0| add x1,x1,#8*6 bl __smul_384_n_shift_by_30 ldp x9, x10, [x1,#-8*6] eor x0, x0, #128 // flip-flop src |a|b| #ifdef __CHERI_PURE_CAPABILITY__ scvalue c0,csp,x0 #endif and x27, x27, x9 // if |a| was negative, add x2, x2, x27, lsr#1 // adjust |L| cbnz x15, .Loop_is_square ////////////////////////////////////////// last iteration //bl __ab_approximation_30 // |a| and |b| are exact, //ldr x8, [x0,#8*6] // and loaded //ldr x14, [x0,#8*0] mov x15, #48 // 48 is 768%30 + 30 bl __inner_loop_48 ldr x30, [x29,#__SIZEOF_POINTER__] and x0, x2, #1 eor x0, x0, #1 add sp, sp, #512 ldp x19, x20, [x29,#2*__SIZEOF_POINTER__] ldp x21, x22, [x29,#4*__SIZEOF_POINTER__] ldp x23, x24, [x29,#6*__SIZEOF_POINTER__] ldp x25, x26, [x29,#8*__SIZEOF_POINTER__] ldp x27, x28, [x29,#10*__SIZEOF_POINTER__] ldr x29, [sp],#16*__SIZEOF_POINTER__ hint #AUTI_HINT ret .size ct_is_square_mod_384,.-ct_is_square_mod_384 .type __smul_384_n_shift_by_30, %function .align 5 __smul_384_n_shift_by_30: ldp x3, x4, [x0,#8*0+0] // load |b| (or |a|) asr x27, x20, #63 // |g1|'s sign as mask (or |f1|'s) ldp x5, x6, [x0,#8*2+0] eor x20, x20, x27 // conditionally negate |g1| (or |f1|) ldp x7, x8, [x0,#8*4+0] eor x3, x3, x27 // conditionally negate |b| (or |a|) sub x20, x20, x27 eor x4, x4, x27 adds x3, x3, x27, lsr#63 eor x5, x5, x27 adcs x4, x4, xzr eor x6, x6, x27 adcs x5, x5, xzr eor x7, x7, x27 umulh x21, x3, x20 adcs x6, x6, xzr umulh x22, x4, x20 eor x8, x8, x27 umulh x23, x5, x20 adcs x7, x7, xzr umulh x24, x6, x20 adc x8, x8, xzr umulh x25, x7, x20 and x28, x20, x27 umulh x26, x8, x20 neg x28, x28 mul x3, x3, x20 mul x4, x4, x20 mul x5, x5, x20 adds x4, x4, x21 mul x6, x6, x20 adcs x5, x5, x22 mul x7, x7, x20 adcs x6, x6, x23 mul x8, x8, x20 adcs x7, x7, x24 adcs x8, x8 ,x25 adc x26, x26, x28 ldp x9, x10, [x0,#8*0+48] // load |b| (or |a|) asr x27, x19, #63 // |g1|'s sign as mask (or |f1|'s) ldp x11, x12, [x0,#8*2+48] eor x19, x19, x27 // conditionally negate |g1| (or |f1|) ldp x13, x14, [x0,#8*4+48] eor x9, x9, x27 // conditionally negate |b| (or |a|) sub x19, x19, x27 eor x10, x10, x27 adds x9, x9, x27, lsr#63 eor x11, x11, x27 adcs x10, x10, xzr eor x12, x12, x27 adcs x11, x11, xzr eor x13, x13, x27 umulh x21, x9, x19 adcs x12, x12, xzr umulh x22, x10, x19 eor x14, x14, x27 umulh x23, x11, x19 adcs x13, x13, xzr umulh x24, x12, x19 adc x14, x14, xzr umulh x25, x13, x19 and x28, x19, x27 umulh x27, x14, x19 neg x28, x28 mul x9, x9, x19 mul x10, x10, x19 mul x11, x11, x19 adds x10, x10, x21 mul x12, x12, x19 adcs x11, x11, x22 mul x13, x13, x19 adcs x12, x12, x23 mul x14, x14, x19 adcs x13, x13, x24 adcs x14, x14 ,x25 adc x27, x27, x28 adds x3, x3, x9 adcs x4, x4, x10 adcs x5, x5, x11 adcs x6, x6, x12 adcs x7, x7, x13 adcs x8, x8, x14 adc x9, x26, x27 extr x3, x4, x3, #30 extr x4, x5, x4, #30 extr x5, x6, x5, #30 asr x27, x9, #63 extr x6, x7, x6, #30 extr x7, x8, x7, #30 extr x8, x9, x8, #30 eor 
x3, x3, x27 eor x4, x4, x27 adds x3, x3, x27, lsr#63 eor x5, x5, x27 adcs x4, x4, xzr eor x6, x6, x27 adcs x5, x5, xzr eor x7, x7, x27 adcs x6, x6, xzr eor x8, x8, x27 stp x3, x4, [x1,#8*0] adcs x7, x7, xzr stp x5, x6, [x1,#8*2] adc x8, x8, xzr stp x7, x8, [x1,#8*4] ret .size __smul_384_n_shift_by_30,.-__smul_384_n_shift_by_30 .type __ab_approximation_30, %function .align 4 __ab_approximation_30: ldp x13, x14, [x0,#8*4] // |a| is still in registers ldp x11, x12, [x0,#8*2] orr x21, x8, x14 // check top-most limbs, ... cmp x21, #0 csel x8, x8, x7, ne csel x14, x14, x13, ne csel x7, x7, x6, ne orr x21, x8, x14 // ... ones before top-most, ... csel x13, x13, x12, ne cmp x21, #0 csel x8, x8, x7, ne csel x14, x14, x13, ne csel x7, x7, x5, ne orr x21, x8, x14 // ... and ones before that ... csel x13, x13, x11, ne cmp x21, #0 csel x8, x8, x7, ne csel x14, x14, x13, ne csel x7, x7, x4, ne orr x21, x8, x14 // and one more, ... csel x13, x13, x10, ne cmp x21, #0 csel x8, x8, x7, ne csel x14, x14, x13, ne csel x7, x7, x3, ne orr x21, x8, x14 csel x13, x13, x9, ne clz x21, x21 cmp x21, #64 csel x21, x21, xzr, ne csel x8, x8, x7, ne csel x14, x14, x13, ne neg x22, x21 lslv x8, x8, x21 // align high limbs to the left lslv x14, x14, x21 lsrv x7, x7, x22 lsrv x13, x13, x22 and x7, x7, x22, asr#6 and x13, x13, x22, asr#6 orr x8, x8, x7 orr x14, x14, x13 bfxil x8, x3, #0, #32 bfxil x14, x9, #0, #32 b __inner_loop_30 ret .size __ab_approximation_30,.-__ab_approximation_30 .type __inner_loop_30, %function .align 4 __inner_loop_30: mov x28, #30 mov x17, #0x7FFFFFFF80000000 // |f0|=1, |g0|=0 mov x20, #0x800000007FFFFFFF // |f1|=0, |g1|=1 mov x27,#0x7FFFFFFF7FFFFFFF .Loop_30: sbfx x24, x8, #0, #1 // if |a_| is odd, then we'll be subtracting and x25, x8, x14 sub x28, x28, #1 and x21, x14, x24 sub x22, x14, x8 // |b_|-|a_| subs x23, x8, x21 // |a_|-|b_| (or |a_|-0 if |a_| was even) add x25, x2, x25, lsr#1 // L + (a_ & b_) >> 1 mov x21, x20 csel x14, x14, x8, hs // |b_| = |a_| csel x8, x23, x22, hs // borrow means |a_|<|b_|, replace with |b_|-|a_| csel x20, x20, x17, hs // exchange |fg0| and |fg1| csel x17, x17, x21, hs csel x2, x2, x25, hs lsr x8, x8, #1 and x21, x20, x24 and x22, x27, x24 add x23, x14, #2 sub x17, x17, x21 // |f0|-=|f1| (or |f0-=0| if |a_| was even) add x20, x20, x20 // |f1|<<=1 add x2, x2, x23, lsr#2 // "negate" |L| if |b|%8 is 3 or 5 add x17, x17, x22 sub x20, x20, x27 cbnz x28, .Loop_30 mov x27, #0x7FFFFFFF ubfx x16, x17, #0, #32 ubfx x17, x17, #32, #32 ubfx x19, x20, #0, #32 ubfx x20, x20, #32, #32 sub x16, x16, x27 // remove the bias sub x17, x17, x27 sub x19, x19, x27 sub x20, x20, x27 ret .size __inner_loop_30,.-__inner_loop_30 .type __inner_loop_48, %function .align 4 __inner_loop_48: .Loop_48: sbfx x24, x3, #0, #1 // if |a_| is odd, then we'll be subtracting and x25, x3, x9 sub x15, x15, #1 and x21, x9, x24 sub x22, x9, x3 // |b_|-|a_| subs x23, x3, x21 // |a_|-|b_| (or |a_|-0 if |a_| was even) add x25, x2, x25, lsr#1 csel x9, x9, x3, hs // |b_| = |a_| csel x3, x23, x22, hs // borrow means |a_|<|b_|, replace with |b_|-|a_| csel x2, x2, x25, hs add x23, x9, #2 lsr x3, x3, #1 add x2, x2, x23, lsr#2 // "negate" |L| if |b|%8 is 3 or 5 cbnz x15, .Loop_48 ret .size __inner_loop_48,.-__inner_loop_48 #if defined(__ARM_FEATURE_BTI_DEFAULT) || defined(__ARM_FEATURE_PAC_DEFAULT) .section .note.GNU-stack,"",@progbits .section .note.gnu.property,"a",@note .long 4,2f-1f,5 .byte 0x47,0x4E,0x55,0 1: .long 0xc0000000,4,3 .align 3 2: #endif ================================================ FILE: 
build/elf/ct_is_square_mod_384-x86_64.s ================================================ .text .globl ct_is_square_mod_384 .hidden ct_is_square_mod_384 .type ct_is_square_mod_384,@function .align 32 ct_is_square_mod_384: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 pushq %rbx .cfi_adjust_cfa_offset 8 .cfi_offset %rbx,-24 pushq %r12 .cfi_adjust_cfa_offset 8 .cfi_offset %r12,-32 pushq %r13 .cfi_adjust_cfa_offset 8 .cfi_offset %r13,-40 pushq %r14 .cfi_adjust_cfa_offset 8 .cfi_offset %r14,-48 pushq %r15 .cfi_adjust_cfa_offset 8 .cfi_offset %r15,-56 subq $536,%rsp .cfi_adjust_cfa_offset 536 leaq 24+255(%rsp),%rax andq $-256,%rax #ifdef __SGX_LVI_HARDENING__ lfence #endif movq 0(%rdi),%r8 movq 8(%rdi),%r9 movq 16(%rdi),%r10 movq 24(%rdi),%r11 movq 32(%rdi),%r12 movq 40(%rdi),%r13 movq 0(%rsi),%r14 movq 8(%rsi),%r15 movq 16(%rsi),%rbx movq 24(%rsi),%rcx movq 32(%rsi),%rdx movq 40(%rsi),%rdi movq %rax,%rsi movq %r8,0(%rax) movq %r9,8(%rax) movq %r10,16(%rax) movq %r11,24(%rax) movq %r12,32(%rax) movq %r13,40(%rax) movq %r14,48(%rax) movq %r15,56(%rax) movq %rbx,64(%rax) movq %rcx,72(%rax) movq %rdx,80(%rax) movq %rdi,88(%rax) xorq %rbp,%rbp movl $24,%ecx jmp .Loop_is_square .align 32 .Loop_is_square: movl %ecx,16(%rsp) call __ab_approximation_30 movq %rax,0(%rsp) movq %rbx,8(%rsp) movq $128+48,%rdi xorq %rsi,%rdi call __smulq_384_n_shift_by_30 movq 0(%rsp),%rdx movq 8(%rsp),%rcx leaq -48(%rdi),%rdi call __smulq_384_n_shift_by_30 movl 16(%rsp),%ecx xorq $128,%rsi andq 48(%rdi),%r14 shrq $1,%r14 addq %r14,%rbp subl $1,%ecx jnz .Loop_is_square movq 48(%rsi),%r9 call __inner_loop_48 movq $1,%rax andq %rbp,%rax xorq $1,%rax leaq 536(%rsp),%r8 movq 0(%r8),%r15 .cfi_restore %r15 movq 8(%r8),%r14 .cfi_restore %r14 movq 16(%r8),%r13 .cfi_restore %r13 movq 24(%r8),%r12 .cfi_restore %r12 movq 32(%r8),%rbx .cfi_restore %rbx movq 40(%r8),%rbp .cfi_restore %rbp leaq 48(%r8),%rsp .cfi_adjust_cfa_offset -536-8*6 #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .size ct_is_square_mod_384,.-ct_is_square_mod_384 .type __smulq_384_n_shift_by_30,@function .align 32 __smulq_384_n_shift_by_30: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa movq 0(%rsi),%r8 movq 8(%rsi),%r9 movq 16(%rsi),%r10 movq 24(%rsi),%r11 movq 32(%rsi),%r12 movq 40(%rsi),%r13 movq %rdx,%rbx sarq $63,%rdx xorq %rax,%rax subq %rdx,%rax xorq %rdx,%rbx addq %rax,%rbx xorq %rdx,%r8 xorq %rdx,%r9 xorq %rdx,%r10 xorq %rdx,%r11 xorq %rdx,%r12 xorq %rdx,%r13 addq %r8,%rax adcq $0,%r9 adcq $0,%r10 adcq $0,%r11 adcq $0,%r12 adcq $0,%r13 movq %rdx,%r14 andq %rbx,%r14 mulq %rbx movq %rax,%r8 movq %r9,%rax movq %rdx,%r9 mulq %rbx addq %rax,%r9 movq %r10,%rax adcq $0,%rdx movq %rdx,%r10 mulq %rbx addq %rax,%r10 movq %r11,%rax adcq $0,%rdx movq %rdx,%r11 mulq %rbx addq %rax,%r11 movq %r12,%rax adcq $0,%rdx movq %rdx,%r12 mulq %rbx addq %rax,%r12 movq %r13,%rax adcq $0,%rdx movq %rdx,%r13 negq %r14 mulq %rbx addq %rax,%r13 adcq %rdx,%r14 leaq 48(%rsi),%rsi movq %rcx,%rdx movq %r8,0(%rdi) movq %r9,8(%rdi) movq %r10,16(%rdi) movq %r11,24(%rdi) movq %r12,32(%rdi) movq %r13,40(%rdi) movq 0(%rsi),%r8 movq 8(%rsi),%r9 movq 16(%rsi),%r10 movq 24(%rsi),%r11 movq 32(%rsi),%r12 movq 40(%rsi),%r13 movq %rdx,%rbx sarq $63,%rdx xorq %rax,%rax subq %rdx,%rax xorq %rdx,%rbx addq %rax,%rbx xorq %rdx,%r8 xorq %rdx,%r9 xorq %rdx,%r10 xorq %rdx,%r11 xorq %rdx,%r12 xorq %rdx,%r13 addq %r8,%rax adcq $0,%r9 adcq $0,%r10 adcq $0,%r11 adcq $0,%r12 adcq $0,%r13 movq %rdx,%r15 andq %rbx,%r15 
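/* The mulq cascade below is a schoolbook 384x64-bit multiply: %rax walks
   the six limbs of the conditionally negated operand, the %rdx:%rax
   partial products are chained with add/adc, and the masked term staged
   in %r15 just above is negated and folded into the top limb as the
   signed correction. */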
mulq %rbx movq %rax,%r8 movq %r9,%rax movq %rdx,%r9 mulq %rbx addq %rax,%r9 movq %r10,%rax adcq $0,%rdx movq %rdx,%r10 mulq %rbx addq %rax,%r10 movq %r11,%rax adcq $0,%rdx movq %rdx,%r11 mulq %rbx addq %rax,%r11 movq %r12,%rax adcq $0,%rdx movq %rdx,%r12 mulq %rbx addq %rax,%r12 movq %r13,%rax adcq $0,%rdx movq %rdx,%r13 negq %r15 mulq %rbx addq %rax,%r13 adcq %rdx,%r15 leaq -48(%rsi),%rsi addq 0(%rdi),%r8 adcq 8(%rdi),%r9 adcq 16(%rdi),%r10 adcq 24(%rdi),%r11 adcq 32(%rdi),%r12 adcq 40(%rdi),%r13 adcq %r15,%r14 shrdq $30,%r9,%r8 shrdq $30,%r10,%r9 shrdq $30,%r11,%r10 shrdq $30,%r12,%r11 shrdq $30,%r13,%r12 shrdq $30,%r14,%r13 sarq $63,%r14 xorq %rbx,%rbx subq %r14,%rbx xorq %r14,%r8 xorq %r14,%r9 xorq %r14,%r10 xorq %r14,%r11 xorq %r14,%r12 xorq %r14,%r13 addq %rbx,%r8 adcq $0,%r9 adcq $0,%r10 adcq $0,%r11 adcq $0,%r12 adcq $0,%r13 movq %r8,0(%rdi) movq %r9,8(%rdi) movq %r10,16(%rdi) movq %r11,24(%rdi) movq %r12,32(%rdi) movq %r13,40(%rdi) #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .size __smulq_384_n_shift_by_30,.-__smulq_384_n_shift_by_30 .type __ab_approximation_30,@function .align 32 __ab_approximation_30: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa movq 88(%rsi),%rbx movq 80(%rsi),%r15 movq 72(%rsi),%r14 movq %r13,%rax orq %rbx,%rax cmovzq %r12,%r13 cmovzq %r15,%rbx cmovzq %r11,%r12 movq 64(%rsi),%r11 cmovzq %r14,%r15 movq %r13,%rax orq %rbx,%rax cmovzq %r12,%r13 cmovzq %r15,%rbx cmovzq %r10,%r12 movq 56(%rsi),%r10 cmovzq %r11,%r15 movq %r13,%rax orq %rbx,%rax cmovzq %r12,%r13 cmovzq %r15,%rbx cmovzq %r9,%r12 movq 48(%rsi),%r9 cmovzq %r10,%r15 movq %r13,%rax orq %rbx,%rax cmovzq %r12,%r13 cmovzq %r15,%rbx cmovzq %r8,%r12 cmovzq %r9,%r15 movq %r13,%rax orq %rbx,%rax bsrq %rax,%rcx leaq 1(%rcx),%rcx cmovzq %r8,%r13 cmovzq %r9,%rbx cmovzq %rax,%rcx negq %rcx shldq %cl,%r12,%r13 shldq %cl,%r15,%rbx movq $0xFFFFFFFF00000000,%rax movl %r8d,%r8d movl %r9d,%r9d andq %rax,%r13 andq %rax,%rbx orq %r13,%r8 orq %rbx,%r9 jmp __inner_loop_30 #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .size __ab_approximation_30,.-__ab_approximation_30 .type __inner_loop_30,@function .align 32 __inner_loop_30: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa movq $0x7FFFFFFF80000000,%rbx movq $0x800000007FFFFFFF,%rcx leaq -1(%rbx),%r15 movl $30,%edi .Loop_30: movq %r8,%rax andq %r9,%rax shrq $1,%rax cmpq %r9,%r8 movq %r8,%r10 movq %r9,%r11 leaq (%rax,%rbp,1),%rax movq %rbx,%r12 movq %rcx,%r13 movq %rbp,%r14 cmovbq %r9,%r8 cmovbq %r10,%r9 cmovbq %rcx,%rbx cmovbq %r12,%rcx cmovbq %rax,%rbp subq %r9,%r8 subq %rcx,%rbx addq %r15,%rbx testq $1,%r10 cmovzq %r10,%r8 cmovzq %r11,%r9 cmovzq %r12,%rbx cmovzq %r13,%rcx cmovzq %r14,%rbp leaq 2(%r9),%rax shrq $1,%r8 shrq $2,%rax addq %rcx,%rcx leaq (%rax,%rbp,1),%rbp subq %r15,%rcx subl $1,%edi jnz .Loop_30 shrq $32,%r15 movl %ebx,%eax shrq $32,%rbx movl %ecx,%edx shrq $32,%rcx subq %r15,%rax subq %r15,%rbx subq %r15,%rdx subq %r15,%rcx #ifdef __SGX_LVI_HARDENING__ popq %r8 lfence jmpq *%r8 ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .size __inner_loop_30,.-__inner_loop_30 .type __inner_loop_48,@function .align 32 __inner_loop_48: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa movl $48,%edi .Loop_48: movq %r8,%rax andq %r9,%rax shrq $1,%rax cmpq %r9,%r8 movq %r8,%r10 movq %r9,%r11 leaq (%rax,%rbp,1),%rax movq %rbp,%r12 cmovbq %r9,%r8 cmovbq %r10,%r9 cmovbq %rax,%rbp subq %r9,%r8 testq $1,%r10 cmovzq %r10,%r8 cmovzq %r11,%r9 cmovzq %r12,%rbp leaq 2(%r9),%rax shrq $1,%r8 shrq 
$2,%rax addq %rax,%rbp subl $1,%edi jnz .Loop_48 #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .size __inner_loop_48,.-__inner_loop_48 .section .note.GNU-stack,"",@progbits #ifndef __SGX_LVI_HARDENING__ .section .note.gnu.property,"a",@note .long 4,2f-1f,5 .byte 0x47,0x4E,0x55,0 1: .long 0xc0000002,4,3 .align 8 2: #endif ================================================ FILE: build/elf/ctq_inverse_mod_384-x86_64.s ================================================ .comm __blst_platform_cap,4 .text .globl ct_inverse_mod_384 .hidden ct_inverse_mod_384 .type ct_inverse_mod_384,@function .align 32 ct_inverse_mod_384: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa #ifdef __BLST_PORTABLE__ testl $1,__blst_platform_cap(%rip) jnz ct_inverse_mod_384$1 #endif pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 pushq %rbx .cfi_adjust_cfa_offset 8 .cfi_offset %rbx,-24 pushq %r12 .cfi_adjust_cfa_offset 8 .cfi_offset %r12,-32 pushq %r13 .cfi_adjust_cfa_offset 8 .cfi_offset %r13,-40 pushq %r14 .cfi_adjust_cfa_offset 8 .cfi_offset %r14,-48 pushq %r15 .cfi_adjust_cfa_offset 8 .cfi_offset %r15,-56 subq $1112,%rsp .cfi_adjust_cfa_offset 1112 leaq 88+511(%rsp),%rax andq $-512,%rax movq %rdi,32(%rsp) movq %rcx,40(%rsp) movq 0(%rsi),%r8 movq 8(%rsi),%r9 movq 16(%rsi),%r10 movq 24(%rsi),%r11 movq 32(%rsi),%r12 movq 40(%rsi),%r13 movq 0(%rdx),%r14 movq 8(%rdx),%r15 movq 16(%rdx),%rbx movq 24(%rdx),%rbp movq 32(%rdx),%rsi movq 40(%rdx),%rdi movq %r8,0(%rax) movq %r9,8(%rax) movq %r10,16(%rax) movq %r11,24(%rax) movq %r12,32(%rax) movq %r13,40(%rax) movq %r14,48(%rax) movq %r15,56(%rax) movq %rbx,64(%rax) movq %rbp,72(%rax) movq %rsi,80(%rax) movq %rax,%rsi movq %rdi,88(%rax) movl $62,%edi call __ab_approximation_62 movq %r12,72(%rsp) movq %r13,80(%rsp) movq $256,%rdi xorq %rsi,%rdi call __smulq_384_n_shift_by_62 movq %rdx,96(%rdi) movq 72(%rsp),%rdx movq 80(%rsp),%rcx leaq 48(%rdi),%rdi call __smulq_384_n_shift_by_62 movq %rdx,104(%rdi) xorq $256,%rsi movl $62,%edi call __ab_approximation_62 movq %r12,72(%rsp) movq %r13,80(%rsp) movq $256,%rdi xorq %rsi,%rdi call __smulq_384_n_shift_by_62 movq %rdx,56(%rsp) movq %rcx,64(%rsp) movq 72(%rsp),%rdx movq 80(%rsp),%rcx leaq 48(%rdi),%rdi call __smulq_384_n_shift_by_62 movq 96(%rsi),%rax movq 152(%rsi),%r11 movq %rdx,%rbx movq %rax,%r10 imulq 56(%rsp) movq %rax,%r8 movq %r11,%rax movq %rdx,%r9 imulq 64(%rsp) addq %rax,%r8 adcq %rdx,%r9 movq %r8,48(%rdi) movq %r9,56(%rdi) sarq $63,%r9 movq %r9,64(%rdi) movq %r9,72(%rdi) movq %r9,80(%rdi) movq %r9,88(%rdi) movq %r9,96(%rdi) leaq 96(%rsi),%rsi movq %r10,%rax imulq %rbx movq %rax,%r8 movq %r11,%rax movq %rdx,%r9 imulq %rcx addq %rax,%r8 adcq %rdx,%r9 movq %r8,104(%rdi) movq %r9,112(%rdi) sarq $63,%r9 movq %r9,120(%rdi) movq %r9,128(%rdi) movq %r9,136(%rdi) movq %r9,144(%rdi) movq %r9,152(%rdi) xorq $256+96,%rsi movl $62,%edi call __ab_approximation_62 movq %r12,72(%rsp) movq %r13,80(%rsp) movq $256,%rdi xorq %rsi,%rdi call __smulq_384_n_shift_by_62 movq %rdx,56(%rsp) movq %rcx,64(%rsp) movq 72(%rsp),%rdx movq 80(%rsp),%rcx leaq 48(%rdi),%rdi call __smulq_384_n_shift_by_62 movq %rdx,72(%rsp) movq %rcx,80(%rsp) movq 56(%rsp),%rdx movq 64(%rsp),%rcx leaq 96(%rsi),%rsi leaq 48(%rdi),%rdi call __smulq_384x63 movq 72(%rsp),%rdx movq 80(%rsp),%rcx leaq 56(%rdi),%rdi call __smulq_384x63 xorq $256+96,%rsi movl $62,%edi call __ab_approximation_62 movq %r12,72(%rsp) movq %r13,80(%rsp) movq $256,%rdi xorq %rsi,%rdi call __smulq_384_n_shift_by_62 movq %rdx,56(%rsp) movq %rcx,64(%rsp) 
movq 72(%rsp),%rdx movq 80(%rsp),%rcx leaq 48(%rdi),%rdi call __smulq_384_n_shift_by_62 movq %rdx,72(%rsp) movq %rcx,80(%rsp) movq 56(%rsp),%rdx movq 64(%rsp),%rcx leaq 96(%rsi),%rsi leaq 48(%rdi),%rdi call __smulq_384x63 movq 72(%rsp),%rdx movq 80(%rsp),%rcx leaq 56(%rdi),%rdi call __smulq_384x63 xorq $256+96,%rsi movl $62,%edi call __ab_approximation_62 movq %r12,72(%rsp) movq %r13,80(%rsp) movq $256,%rdi xorq %rsi,%rdi call __smulq_384_n_shift_by_62 movq %rdx,56(%rsp) movq %rcx,64(%rsp) movq 72(%rsp),%rdx movq 80(%rsp),%rcx leaq 48(%rdi),%rdi call __smulq_384_n_shift_by_62 movq %rdx,72(%rsp) movq %rcx,80(%rsp) movq 56(%rsp),%rdx movq 64(%rsp),%rcx leaq 96(%rsi),%rsi leaq 48(%rdi),%rdi call __smulq_384x63 movq 72(%rsp),%rdx movq 80(%rsp),%rcx leaq 56(%rdi),%rdi call __smulq_384x63 xorq $256+96,%rsi movl $62,%edi call __ab_approximation_62 movq %r12,72(%rsp) movq %r13,80(%rsp) movq $256,%rdi xorq %rsi,%rdi call __smulq_384_n_shift_by_62 movq %rdx,56(%rsp) movq %rcx,64(%rsp) movq 72(%rsp),%rdx movq 80(%rsp),%rcx leaq 48(%rdi),%rdi call __smulq_384_n_shift_by_62 movq %rdx,72(%rsp) movq %rcx,80(%rsp) movq 56(%rsp),%rdx movq 64(%rsp),%rcx leaq 96(%rsi),%rsi leaq 48(%rdi),%rdi call __smulq_384x63 movq 72(%rsp),%rdx movq 80(%rsp),%rcx leaq 56(%rdi),%rdi call __smulq_384x63 movq %r14,56(%rdi) movq %r14,64(%rdi) movq %r14,72(%rdi) movq %r14,80(%rdi) movq %r14,88(%rdi) xorq $256+96,%rsi movl $62,%edi call __ab_approximation_62 movq %r12,72(%rsp) movq %r13,80(%rsp) movq $256,%rdi xorq %rsi,%rdi call __smulq_384_n_shift_by_62 movq %rdx,56(%rsp) movq %rcx,64(%rsp) movq 72(%rsp),%rdx movq 80(%rsp),%rcx leaq 48(%rdi),%rdi call __smulq_384_n_shift_by_62 movq %rdx,72(%rsp) movq %rcx,80(%rsp) movq 56(%rsp),%rdx movq 64(%rsp),%rcx leaq 96(%rsi),%rsi leaq 48(%rdi),%rdi call __smulq_384x63 movq 72(%rsp),%rdx movq 80(%rsp),%rcx leaq 56(%rdi),%rdi call __smulq_768x63 xorq $256+96,%rsi movl $62,%edi call __ab_approximation_62 movq %r12,72(%rsp) movq %r13,80(%rsp) movq $256,%rdi xorq %rsi,%rdi call __smulq_384_n_shift_by_62 movq %rdx,56(%rsp) movq %rcx,64(%rsp) movq 72(%rsp),%rdx movq 80(%rsp),%rcx leaq 48(%rdi),%rdi call __smulq_384_n_shift_by_62 movq %rdx,72(%rsp) movq %rcx,80(%rsp) movq 56(%rsp),%rdx movq 64(%rsp),%rcx leaq 96(%rsi),%rsi leaq 48(%rdi),%rdi call __smulq_384x63 movq 72(%rsp),%rdx movq 80(%rsp),%rcx leaq 56(%rdi),%rdi call __smulq_768x63 xorq $256+96,%rsi movl $62,%edi call __ab_approximation_62 movq %r12,72(%rsp) movq %r13,80(%rsp) movq $256,%rdi xorq %rsi,%rdi call __smulq_384_n_shift_by_62 movq %rdx,56(%rsp) movq %rcx,64(%rsp) movq 72(%rsp),%rdx movq 80(%rsp),%rcx leaq 48(%rdi),%rdi call __smulq_384_n_shift_by_62 movq %rdx,72(%rsp) movq %rcx,80(%rsp) movq 56(%rsp),%rdx movq 64(%rsp),%rcx leaq 96(%rsi),%rsi leaq 48(%rdi),%rdi call __smulq_384x63 movq 72(%rsp),%rdx movq 80(%rsp),%rcx leaq 56(%rdi),%rdi call __smulq_768x63 xorq $256+96,%rsi movl $62,%edi call __ab_approximation_62 movq %r12,72(%rsp) movq %r13,80(%rsp) movq $256,%rdi xorq %rsi,%rdi call __smulq_384_n_shift_by_62 movq %rdx,56(%rsp) movq %rcx,64(%rsp) movq 72(%rsp),%rdx movq 80(%rsp),%rcx leaq 48(%rdi),%rdi call __smulq_384_n_shift_by_62 movq %rdx,72(%rsp) movq %rcx,80(%rsp) movq 56(%rsp),%rdx movq 64(%rsp),%rcx leaq 96(%rsi),%rsi leaq 48(%rdi),%rdi call __smulq_384x63 movq 72(%rsp),%rdx movq 80(%rsp),%rcx leaq 56(%rdi),%rdi call __smulq_768x63 xorq $256+96,%rsi movl $62,%edi call __ab_approximation_62 movq %r12,72(%rsp) movq %r13,80(%rsp) movq $256,%rdi xorq %rsi,%rdi call __smulq_384_n_shift_by_62 movq %rdx,56(%rsp) movq 
%rcx,64(%rsp) movq 72(%rsp),%rdx movq 80(%rsp),%rcx leaq 48(%rdi),%rdi call __smulq_384_n_shift_by_62 movq %rdx,72(%rsp) movq %rcx,80(%rsp) movq 56(%rsp),%rdx movq 64(%rsp),%rcx leaq 96(%rsi),%rsi leaq 48(%rdi),%rdi call __smulq_384x63 movq 72(%rsp),%rdx movq 80(%rsp),%rcx leaq 56(%rdi),%rdi call __smulq_768x63 xorq $256+96,%rsi movl $62,%edi movq 0(%rsi),%r8 movq 8(%rsi),%r9 movq 48(%rsi),%r10 movq 56(%rsi),%r11 call __inner_loop_62 movq %r12,72(%rsp) movq %r13,80(%rsp) movq $256,%rdi xorq %rsi,%rdi movq %r8,0(%rdi) movq %r10,48(%rdi) leaq 96(%rsi),%rsi leaq 96(%rdi),%rdi call __smulq_384x63 movq 72(%rsp),%rdx movq 80(%rsp),%rcx leaq 56(%rdi),%rdi call __smulq_768x63 xorq $256+96,%rsi movl $24,%edi movq 0(%rsi),%r8 xorq %r9,%r9 movq 48(%rsi),%r10 xorq %r11,%r11 call __inner_loop_62 leaq 96(%rsi),%rsi movq %r12,%rdx movq %r13,%rcx movq 32(%rsp),%rdi call __smulq_768x63 movq 40(%rsp),%rsi movq %rdx,%r13 sarq $63,%r13 movq %r13,%r8 movq %r13,%r9 movq %r13,%r10 andq 0(%rsi),%r8 andq 8(%rsi),%r9 movq %r13,%r11 andq 16(%rsi),%r10 andq 24(%rsi),%r11 movq %r13,%r12 andq 32(%rsi),%r12 andq 40(%rsi),%r13 addq %r8,%r14 adcq %r9,%r15 adcq %r10,%rbx adcq %r11,%rbp adcq %r12,%rcx adcq %r13,%rax adcq $0,%rdx movq %rdx,%r13 negq %rdx orq %rdx,%r13 sarq $63,%rdx movq %r13,%r8 movq %r13,%r9 movq %r13,%r10 andq 0(%rsi),%r8 andq 8(%rsi),%r9 movq %r13,%r11 andq 16(%rsi),%r10 andq 24(%rsi),%r11 movq %r13,%r12 andq 32(%rsi),%r12 andq 40(%rsi),%r13 xorq %rdx,%r8 xorq %rsi,%rsi xorq %rdx,%r9 subq %rdx,%rsi xorq %rdx,%r10 xorq %rdx,%r11 xorq %rdx,%r12 xorq %rdx,%r13 addq %rsi,%r8 adcq $0,%r9 adcq $0,%r10 adcq $0,%r11 adcq $0,%r12 adcq $0,%r13 addq %r8,%r14 adcq %r9,%r15 adcq %r10,%rbx adcq %r11,%rbp adcq %r12,%rcx adcq %r13,%rax movq %r14,48(%rdi) movq %r15,56(%rdi) movq %rbx,64(%rdi) movq %rbp,72(%rdi) movq %rcx,80(%rdi) movq %rax,88(%rdi) leaq 1112(%rsp),%r8 movq 0(%r8),%r15 .cfi_restore %r15 movq 8(%r8),%r14 .cfi_restore %r14 movq 16(%r8),%r13 .cfi_restore %r13 movq 24(%r8),%r12 .cfi_restore %r12 movq 32(%r8),%rbx .cfi_restore %rbx movq 40(%r8),%rbp .cfi_restore %rbp leaq 48(%r8),%rsp .cfi_adjust_cfa_offset -1112-8*6 #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .size ct_inverse_mod_384,.-ct_inverse_mod_384 .type __smulq_768x63,@function .align 32 __smulq_768x63: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa movq 0(%rsi),%r8 movq 8(%rsi),%r9 movq 16(%rsi),%r10 movq 24(%rsi),%r11 movq 32(%rsi),%r12 movq 40(%rsi),%r13 movq 48(%rsi),%r14 movq %rdx,%rbp sarq $63,%rdx xorq %rax,%rax subq %rdx,%rax movq %rdi,8(%rsp) movq %rsi,16(%rsp) leaq 56(%rsi),%rsi xorq %rdx,%rbp addq %rax,%rbp xorq %rdx,%r8 xorq %rdx,%r9 xorq %rdx,%r10 xorq %rdx,%r11 xorq %rdx,%r12 xorq %rdx,%r13 xorq %rdx,%r14 addq %r8,%rax adcq $0,%r9 adcq $0,%r10 adcq $0,%r11 adcq $0,%r12 adcq $0,%r13 adcq $0,%r14 mulq %rbp movq %rax,0(%rdi) movq %r9,%rax andq %rbp,%r14 negq %r14 movq %rdx,%r9 mulq %rbp addq %rax,%r9 movq %r10,%rax adcq $0,%rdx movq %rdx,%r10 movq %r9,8(%rdi) mulq %rbp addq %rax,%r10 movq %r11,%rax adcq $0,%rdx movq %rdx,%r11 movq %r10,16(%rdi) mulq %rbp addq %rax,%r11 movq %r12,%rax adcq $0,%rdx movq %rdx,%r12 movq %r11,24(%rdi) mulq %rbp addq %rax,%r12 movq %r13,%rax adcq $0,%rdx movq %rdx,%r13 movq %r12,32(%rdi) mulq %rbp addq %rax,%r13 adcq %rdx,%r14 movq %r13,40(%rdi) movq %r14,48(%rdi) sarq $63,%r14 movq %r14,56(%rdi) movq %rcx,%rdx movq 0(%rsi),%r8 movq 8(%rsi),%r9 movq 16(%rsi),%r10 movq 24(%rsi),%r11 movq 32(%rsi),%r12 movq 40(%rsi),%r13 movq 48(%rsi),%r14 movq 56(%rsi),%r15 movq 
64(%rsi),%rbx movq 72(%rsi),%rbp movq 80(%rsi),%rcx movq 88(%rsi),%rdi movq %rdx,%rsi sarq $63,%rdx xorq %rax,%rax subq %rdx,%rax xorq %rdx,%rsi addq %rax,%rsi xorq %rdx,%r8 xorq %rdx,%r9 xorq %rdx,%r10 xorq %rdx,%r11 xorq %rdx,%r12 xorq %rdx,%r13 xorq %rdx,%r14 xorq %rdx,%r15 xorq %rdx,%rbx xorq %rdx,%rbp xorq %rdx,%rcx xorq %rdx,%rdi addq %r8,%rax adcq $0,%r9 adcq $0,%r10 adcq $0,%r11 adcq $0,%r12 adcq $0,%r13 adcq $0,%r14 adcq $0,%r15 adcq $0,%rbx adcq $0,%rbp adcq $0,%rcx adcq $0,%rdi mulq %rsi movq %rax,%r8 movq %r9,%rax movq %rdx,%r9 mulq %rsi addq %rax,%r9 movq %r10,%rax adcq $0,%rdx movq %rdx,%r10 mulq %rsi addq %rax,%r10 movq %r11,%rax adcq $0,%rdx movq %rdx,%r11 mulq %rsi addq %rax,%r11 movq %r12,%rax adcq $0,%rdx movq %rdx,%r12 mulq %rsi addq %rax,%r12 movq %r13,%rax adcq $0,%rdx movq %rdx,%r13 mulq %rsi addq %rax,%r13 movq %r14,%rax adcq $0,%rdx movq %rdx,%r14 mulq %rsi addq %rax,%r14 movq %r15,%rax adcq $0,%rdx movq %rdx,%r15 mulq %rsi addq %rax,%r15 movq %rbx,%rax adcq $0,%rdx movq %rdx,%rbx mulq %rsi addq %rax,%rbx movq %rbp,%rax adcq $0,%rdx movq %rdx,%rbp mulq %rsi addq %rax,%rbp movq %rcx,%rax adcq $0,%rdx movq %rdx,%rcx mulq %rsi addq %rax,%rcx movq %rdi,%rax adcq $0,%rdx movq %rdx,%rdi imulq %rsi movq 8(%rsp),%rsi addq %rdi,%rax adcq $0,%rdx addq 0(%rsi),%r8 adcq 8(%rsi),%r9 adcq 16(%rsi),%r10 adcq 24(%rsi),%r11 adcq 32(%rsi),%r12 adcq 40(%rsi),%r13 adcq 48(%rsi),%r14 movq 56(%rsi),%rdi adcq %rdi,%r15 adcq %rdi,%rbx adcq %rdi,%rbp adcq %rdi,%rcx adcq %rdi,%rax adcq %rdi,%rdx leaq (%rsi),%rdi movq 16(%rsp),%rsi movq %r8,0(%rdi) movq %r9,8(%rdi) movq %r10,16(%rdi) movq %r11,24(%rdi) movq %r12,32(%rdi) movq %r13,40(%rdi) movq %r14,48(%rdi) movq %r15,56(%rdi) movq %rbx,64(%rdi) movq %rbp,72(%rdi) movq %rcx,80(%rdi) movq %rax,88(%rdi) #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .size __smulq_768x63,.-__smulq_768x63 .type __smulq_384x63,@function .align 32 __smulq_384x63: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa movq 0(%rsi),%r8 movq 8(%rsi),%r9 movq 16(%rsi),%r10 movq 24(%rsi),%r11 movq 32(%rsi),%r12 movq 40(%rsi),%r13 movq 48(%rsi),%r14 movq %rdx,%rbp sarq $63,%rdx xorq %rax,%rax subq %rdx,%rax xorq %rdx,%rbp addq %rax,%rbp xorq %rdx,%r8 xorq %rdx,%r9 xorq %rdx,%r10 xorq %rdx,%r11 xorq %rdx,%r12 xorq %rdx,%r13 xorq %rdx,%r14 addq %r8,%rax adcq $0,%r9 adcq $0,%r10 adcq $0,%r11 adcq $0,%r12 adcq $0,%r13 adcq $0,%r14 mulq %rbp movq %rax,%r8 movq %r9,%rax andq %rbp,%r14 negq %r14 movq %rdx,%r9 mulq %rbp addq %rax,%r9 movq %r10,%rax adcq $0,%rdx movq %rdx,%r10 mulq %rbp addq %rax,%r10 movq %r11,%rax adcq $0,%rdx movq %rdx,%r11 mulq %rbp addq %rax,%r11 movq %r12,%rax adcq $0,%rdx movq %rdx,%r12 mulq %rbp addq %rax,%r12 movq %r13,%rax adcq $0,%rdx movq %rdx,%r13 mulq %rbp addq %rax,%r13 adcq %rdx,%r14 leaq 56(%rsi),%rsi movq %rcx,%rdx movq %r8,0(%rdi) movq %r9,8(%rdi) movq %r10,16(%rdi) movq %r11,24(%rdi) movq %r12,32(%rdi) movq %r13,%r15 movq %r14,%rbx movq 0(%rsi),%r8 movq 8(%rsi),%r9 movq 16(%rsi),%r10 movq 24(%rsi),%r11 movq 32(%rsi),%r12 movq 40(%rsi),%r13 movq 48(%rsi),%r14 movq %rdx,%rbp sarq $63,%rdx xorq %rax,%rax subq %rdx,%rax xorq %rdx,%rbp addq %rax,%rbp xorq %rdx,%r8 xorq %rdx,%r9 xorq %rdx,%r10 xorq %rdx,%r11 xorq %rdx,%r12 xorq %rdx,%r13 xorq %rdx,%r14 addq %r8,%rax adcq $0,%r9 adcq $0,%r10 adcq $0,%r11 adcq $0,%r12 adcq $0,%r13 adcq $0,%r14 mulq %rbp movq %rax,%r8 movq %r9,%rax andq %rbp,%r14 negq %r14 movq %rdx,%r9 mulq %rbp addq %rax,%r9 movq %r10,%rax adcq $0,%rdx movq %rdx,%r10 mulq %rbp addq %rax,%r10 
movq %r11,%rax adcq $0,%rdx movq %rdx,%r11 mulq %rbp addq %rax,%r11 movq %r12,%rax adcq $0,%rdx movq %rdx,%r12 mulq %rbp addq %rax,%r12 movq %r13,%rax adcq $0,%rdx movq %rdx,%r13 mulq %rbp addq %rax,%r13 adcq %rdx,%r14 leaq -56(%rsi),%rsi addq 0(%rdi),%r8 adcq 8(%rdi),%r9 adcq 16(%rdi),%r10 adcq 24(%rdi),%r11 adcq 32(%rdi),%r12 adcq %r15,%r13 adcq %rbx,%r14 movq %r8,0(%rdi) movq %r9,8(%rdi) movq %r10,16(%rdi) movq %r11,24(%rdi) movq %r12,32(%rdi) movq %r13,40(%rdi) movq %r14,48(%rdi) #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .size __smulq_384x63,.-__smulq_384x63 .type __smulq_384_n_shift_by_62,@function .align 32 __smulq_384_n_shift_by_62: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa movq %rdx,%rbx movq 0(%rsi),%r8 movq 8(%rsi),%r9 movq 16(%rsi),%r10 movq 24(%rsi),%r11 movq 32(%rsi),%r12 movq 40(%rsi),%r13 movq %rdx,%rbp sarq $63,%rdx xorq %rax,%rax subq %rdx,%rax xorq %rdx,%rbp addq %rax,%rbp xorq %rdx,%r8 xorq %rdx,%r9 xorq %rdx,%r10 xorq %rdx,%r11 xorq %rdx,%r12 xorq %rdx,%r13 movq %rdx,%r14 addq %r8,%rax adcq $0,%r9 adcq $0,%r10 adcq $0,%r11 adcq $0,%r12 adcq $0,%r13 mulq %rbp movq %rax,%r8 movq %r9,%rax andq %rbp,%r14 negq %r14 movq %rdx,%r9 mulq %rbp addq %rax,%r9 movq %r10,%rax adcq $0,%rdx movq %rdx,%r10 mulq %rbp addq %rax,%r10 movq %r11,%rax adcq $0,%rdx movq %rdx,%r11 mulq %rbp addq %rax,%r11 movq %r12,%rax adcq $0,%rdx movq %rdx,%r12 mulq %rbp addq %rax,%r12 movq %r13,%rax adcq $0,%rdx movq %rdx,%r13 mulq %rbp addq %rax,%r13 adcq %rdx,%r14 leaq 48(%rsi),%rsi movq %rcx,%rdx movq %r8,0(%rdi) movq %r9,8(%rdi) movq %r10,16(%rdi) movq %r11,24(%rdi) movq %r12,32(%rdi) movq %r13,40(%rdi) movq 0(%rsi),%r8 movq 8(%rsi),%r9 movq 16(%rsi),%r10 movq 24(%rsi),%r11 movq 32(%rsi),%r12 movq 40(%rsi),%r13 movq %rdx,%rbp sarq $63,%rdx xorq %rax,%rax subq %rdx,%rax xorq %rdx,%rbp addq %rax,%rbp xorq %rdx,%r8 xorq %rdx,%r9 xorq %rdx,%r10 xorq %rdx,%r11 xorq %rdx,%r12 xorq %rdx,%r13 movq %rdx,%r15 addq %r8,%rax adcq $0,%r9 adcq $0,%r10 adcq $0,%r11 adcq $0,%r12 adcq $0,%r13 mulq %rbp movq %rax,%r8 movq %r9,%rax andq %rbp,%r15 negq %r15 movq %rdx,%r9 mulq %rbp addq %rax,%r9 movq %r10,%rax adcq $0,%rdx movq %rdx,%r10 mulq %rbp addq %rax,%r10 movq %r11,%rax adcq $0,%rdx movq %rdx,%r11 mulq %rbp addq %rax,%r11 movq %r12,%rax adcq $0,%rdx movq %rdx,%r12 mulq %rbp addq %rax,%r12 movq %r13,%rax adcq $0,%rdx movq %rdx,%r13 mulq %rbp addq %rax,%r13 adcq %rdx,%r15 leaq -48(%rsi),%rsi movq %rbx,%rdx addq 0(%rdi),%r8 adcq 8(%rdi),%r9 adcq 16(%rdi),%r10 adcq 24(%rdi),%r11 adcq 32(%rdi),%r12 adcq 40(%rdi),%r13 adcq %r15,%r14 shrdq $62,%r9,%r8 shrdq $62,%r10,%r9 shrdq $62,%r11,%r10 shrdq $62,%r12,%r11 shrdq $62,%r13,%r12 shrdq $62,%r14,%r13 sarq $63,%r14 xorq %rbp,%rbp subq %r14,%rbp xorq %r14,%r8 xorq %r14,%r9 xorq %r14,%r10 xorq %r14,%r11 xorq %r14,%r12 xorq %r14,%r13 addq %rbp,%r8 adcq $0,%r9 adcq $0,%r10 adcq $0,%r11 adcq $0,%r12 adcq $0,%r13 movq %r8,0(%rdi) movq %r9,8(%rdi) movq %r10,16(%rdi) movq %r11,24(%rdi) movq %r12,32(%rdi) movq %r13,40(%rdi) xorq %r14,%rdx xorq %r14,%rcx addq %rbp,%rdx addq %rbp,%rcx #ifdef __SGX_LVI_HARDENING__ popq %r8 lfence jmpq *%r8 ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .size __smulq_384_n_shift_by_62,.-__smulq_384_n_shift_by_62 .type __ab_approximation_62,@function .align 32 __ab_approximation_62: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa movq 40(%rsi),%r9 movq 88(%rsi),%r11 movq 32(%rsi),%rbx movq 80(%rsi),%rbp movq 24(%rsi),%r8 movq 72(%rsi),%r10 movq %r9,%rax orq %r11,%rax cmovzq %rbx,%r9 cmovzq %rbp,%r11 cmovzq 
%r8,%rbx cmovzq %r10,%rbp movq 16(%rsi),%r8 movq 64(%rsi),%r10 movq %r9,%rax orq %r11,%rax cmovzq %rbx,%r9 cmovzq %rbp,%r11 cmovzq %r8,%rbx cmovzq %r10,%rbp movq 8(%rsi),%r8 movq 56(%rsi),%r10 movq %r9,%rax orq %r11,%rax cmovzq %rbx,%r9 cmovzq %rbp,%r11 cmovzq %r8,%rbx cmovzq %r10,%rbp movq 0(%rsi),%r8 movq 48(%rsi),%r10 movq %r9,%rax orq %r11,%rax bsrq %rax,%rcx leaq 1(%rcx),%rcx cmovzq %rbx,%r9 cmovzq %rbp,%r11 cmovzq %rax,%rcx negq %rcx shldq %cl,%rbx,%r9 shldq %cl,%rbp,%r11 jmp __inner_loop_62 #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .size __ab_approximation_62,.-__ab_approximation_62 .type __inner_loop_62,@function .align 8 .long 0 __inner_loop_62: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa movq $1,%rdx xorq %rcx,%rcx xorq %r12,%r12 movq $1,%r13 movq %rsi,8(%rsp) .Loop_62: xorq %rax,%rax xorq %rbx,%rbx testq $1,%r8 movq %r10,%rbp movq %r11,%r14 cmovnzq %r10,%rax cmovnzq %r11,%rbx subq %r8,%rbp sbbq %r9,%r14 movq %r8,%r15 movq %r9,%rsi subq %rax,%r8 sbbq %rbx,%r9 cmovcq %rbp,%r8 cmovcq %r14,%r9 cmovcq %r15,%r10 cmovcq %rsi,%r11 movq %rdx,%rax cmovcq %r12,%rdx cmovcq %rax,%r12 movq %rcx,%rbx cmovcq %r13,%rcx cmovcq %rbx,%r13 xorq %rax,%rax xorq %rbx,%rbx shrdq $1,%r9,%r8 shrq $1,%r9 testq $1,%r15 cmovnzq %r12,%rax cmovnzq %r13,%rbx addq %r12,%r12 addq %r13,%r13 subq %rax,%rdx subq %rbx,%rcx subl $1,%edi jnz .Loop_62 movq 8(%rsp),%rsi #ifdef __SGX_LVI_HARDENING__ popq %rax lfence jmpq *%rax ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .size __inner_loop_62,.-__inner_loop_62 .section .note.GNU-stack,"",@progbits #ifndef __SGX_LVI_HARDENING__ .section .note.gnu.property,"a",@note .long 4,2f-1f,5 .byte 0x47,0x4E,0x55,0 1: .long 0xc0000002,4,3 .align 8 2: #endif ================================================ FILE: build/elf/ctx_inverse_mod_384-x86_64.s ================================================ .text .globl ctx_inverse_mod_384 .hidden ctx_inverse_mod_384 .type ctx_inverse_mod_384,@function .align 32 ctx_inverse_mod_384: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa ct_inverse_mod_384$1: pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 pushq %rbx .cfi_adjust_cfa_offset 8 .cfi_offset %rbx,-24 pushq %r12 .cfi_adjust_cfa_offset 8 .cfi_offset %r12,-32 pushq %r13 .cfi_adjust_cfa_offset 8 .cfi_offset %r13,-40 pushq %r14 .cfi_adjust_cfa_offset 8 .cfi_offset %r14,-48 pushq %r15 .cfi_adjust_cfa_offset 8 .cfi_offset %r15,-56 subq $1112,%rsp .cfi_adjust_cfa_offset 1112 leaq 88+511(%rsp),%rax andq $-512,%rax movq %rdi,32(%rsp) movq %rcx,40(%rsp) #ifdef __SGX_LVI_HARDENING__ lfence #endif movq 0(%rsi),%r8 movq 8(%rsi),%r9 movq 16(%rsi),%r10 movq 24(%rsi),%r11 movq 32(%rsi),%r12 movq 40(%rsi),%r13 movq 0(%rdx),%r14 movq 8(%rdx),%r15 movq 16(%rdx),%rbx movq 24(%rdx),%rbp movq 32(%rdx),%rsi movq 40(%rdx),%rdi movq %r8,0(%rax) movq %r9,8(%rax) movq %r10,16(%rax) movq %r11,24(%rax) movq %r12,32(%rax) movq %r13,40(%rax) movq %r14,48(%rax) movq %r15,56(%rax) movq %rbx,64(%rax) movq %rbp,72(%rax) movq %rsi,80(%rax) movq %rax,%rsi movq %rdi,88(%rax) movl $31,%edi call __ab_approximation_31 movq %r12,72(%rsp) movq %r13,80(%rsp) movq $256,%rdi xorq %rsi,%rdi call __smulx_384_n_shift_by_31 movq %rdx,96(%rdi) movq 72(%rsp),%rdx movq 80(%rsp),%rcx leaq 48(%rdi),%rdi call __smulx_384_n_shift_by_31 movq %rdx,104(%rdi) xorq $256,%rsi movl $31,%edi call __ab_approximation_31 movq %r12,72(%rsp) movq %r13,80(%rsp) movq $256,%rdi xorq %rsi,%rdi call __smulx_384_n_shift_by_31 movq %rdx,56(%rsp) movq %rcx,64(%rsp) movq 72(%rsp),%rdx movq 80(%rsp),%rcx leaq 
48(%rdi),%rdi call __smulx_384_n_shift_by_31 movq 96(%rsi),%rax movq 152(%rsi),%r11 movq %rdx,%rbx movq %rax,%r10 imulq 56(%rsp) movq %rax,%r8 movq %r11,%rax movq %rdx,%r9 imulq 64(%rsp) addq %rax,%r8 adcq %rdx,%r9 movq %r8,48(%rdi) movq %r9,56(%rdi) sarq $63,%r9 movq %r9,64(%rdi) movq %r9,72(%rdi) movq %r9,80(%rdi) movq %r9,88(%rdi) movq %r9,96(%rdi) leaq 96(%rsi),%rsi movq %r10,%rax imulq %rbx movq %rax,%r8 movq %r11,%rax movq %rdx,%r9 imulq %rcx addq %rax,%r8 adcq %rdx,%r9 movq %r8,104(%rdi) movq %r9,112(%rdi) sarq $63,%r9 movq %r9,120(%rdi) movq %r9,128(%rdi) movq %r9,136(%rdi) movq %r9,144(%rdi) movq %r9,152(%rdi) xorq $256+96,%rsi movl $31,%edi call __ab_approximation_31 movq %r12,72(%rsp) movq %r13,80(%rsp) movq $256,%rdi xorq %rsi,%rdi call __smulx_384_n_shift_by_31 movq %rdx,56(%rsp) movq %rcx,64(%rsp) movq 72(%rsp),%rdx movq 80(%rsp),%rcx leaq 48(%rdi),%rdi call __smulx_384_n_shift_by_31 movq %rdx,72(%rsp) movq %rcx,80(%rsp) movq 56(%rsp),%rdx movq 64(%rsp),%rcx leaq 96(%rsi),%rsi leaq 48(%rdi),%rdi call __smulx_384x63 movq 72(%rsp),%rdx movq 80(%rsp),%rcx leaq 56(%rdi),%rdi call __smulx_384x63 xorq $256+96,%rsi movl $31,%edi call __ab_approximation_31 movq %r12,72(%rsp) movq %r13,80(%rsp) movq $256,%rdi xorq %rsi,%rdi call __smulx_384_n_shift_by_31 movq %rdx,56(%rsp) movq %rcx,64(%rsp) movq 72(%rsp),%rdx movq 80(%rsp),%rcx leaq 48(%rdi),%rdi call __smulx_384_n_shift_by_31 movq %rdx,72(%rsp) movq %rcx,80(%rsp) movq 56(%rsp),%rdx movq 64(%rsp),%rcx leaq 96(%rsi),%rsi leaq 48(%rdi),%rdi call __smulx_384x63 movq 72(%rsp),%rdx movq 80(%rsp),%rcx leaq 56(%rdi),%rdi call __smulx_384x63 xorq $256+96,%rsi movl $31,%edi call __ab_approximation_31 movq %r12,72(%rsp) movq %r13,80(%rsp) movq $256,%rdi xorq %rsi,%rdi call __smulx_384_n_shift_by_31 movq %rdx,56(%rsp) movq %rcx,64(%rsp) movq 72(%rsp),%rdx movq 80(%rsp),%rcx leaq 48(%rdi),%rdi call __smulx_384_n_shift_by_31 movq %rdx,72(%rsp) movq %rcx,80(%rsp) movq 56(%rsp),%rdx movq 64(%rsp),%rcx leaq 96(%rsi),%rsi leaq 48(%rdi),%rdi call __smulx_384x63 movq 72(%rsp),%rdx movq 80(%rsp),%rcx leaq 56(%rdi),%rdi call __smulx_384x63 xorq $256+96,%rsi movl $31,%edi call __ab_approximation_31 movq %r12,72(%rsp) movq %r13,80(%rsp) movq $256,%rdi xorq %rsi,%rdi call __smulx_384_n_shift_by_31 movq %rdx,56(%rsp) movq %rcx,64(%rsp) movq 72(%rsp),%rdx movq 80(%rsp),%rcx leaq 48(%rdi),%rdi call __smulx_384_n_shift_by_31 movq %rdx,72(%rsp) movq %rcx,80(%rsp) movq 56(%rsp),%rdx movq 64(%rsp),%rcx leaq 96(%rsi),%rsi leaq 48(%rdi),%rdi call __smulx_384x63 movq 72(%rsp),%rdx movq 80(%rsp),%rcx leaq 56(%rdi),%rdi call __smulx_384x63 xorq $256+96,%rsi movl $31,%edi call __ab_approximation_31 movq %r12,72(%rsp) movq %r13,80(%rsp) movq $256,%rdi xorq %rsi,%rdi call __smulx_384_n_shift_by_31 movq %rdx,56(%rsp) movq %rcx,64(%rsp) movq 72(%rsp),%rdx movq 80(%rsp),%rcx leaq 48(%rdi),%rdi call __smulx_384_n_shift_by_31 movq %rdx,72(%rsp) movq %rcx,80(%rsp) movq 56(%rsp),%rdx movq 64(%rsp),%rcx leaq 96(%rsi),%rsi leaq 48(%rdi),%rdi call __smulx_384x63 movq 72(%rsp),%rdx movq 80(%rsp),%rcx leaq 56(%rdi),%rdi call __smulx_384x63 xorq $256+96,%rsi movl $31,%edi call __ab_approximation_31 movq %r12,72(%rsp) movq %r13,80(%rsp) movq $256,%rdi xorq %rsi,%rdi call __smulx_384_n_shift_by_31 movq %rdx,56(%rsp) movq %rcx,64(%rsp) movq 72(%rsp),%rdx movq 80(%rsp),%rcx leaq 48(%rdi),%rdi call __smulx_384_n_shift_by_31 movq %rdx,72(%rsp) movq %rcx,80(%rsp) movq 56(%rsp),%rdx movq 64(%rsp),%rcx leaq 96(%rsi),%rsi leaq 48(%rdi),%rdi call __smulx_384x63 movq 72(%rsp),%rdx movq 
80(%rsp),%rcx leaq 56(%rdi),%rdi call __smulx_384x63 xorq $256+96,%rsi movl $31,%edi call __ab_approximation_31 movq %r12,72(%rsp) movq %r13,80(%rsp) movq $256,%rdi xorq %rsi,%rdi call __smulx_384_n_shift_by_31 movq %rdx,56(%rsp) movq %rcx,64(%rsp) movq 72(%rsp),%rdx movq 80(%rsp),%rcx leaq 48(%rdi),%rdi call __smulx_384_n_shift_by_31 movq %rdx,72(%rsp) movq %rcx,80(%rsp) movq 56(%rsp),%rdx movq 64(%rsp),%rcx leaq 96(%rsi),%rsi leaq 48(%rdi),%rdi call __smulx_384x63 movq 72(%rsp),%rdx movq 80(%rsp),%rcx leaq 56(%rdi),%rdi call __smulx_384x63 xorq $256+96,%rsi movl $31,%edi call __ab_approximation_31 movq %r12,72(%rsp) movq %r13,80(%rsp) movq $256,%rdi xorq %rsi,%rdi call __smulx_384_n_shift_by_31 movq %rdx,56(%rsp) movq %rcx,64(%rsp) movq 72(%rsp),%rdx movq 80(%rsp),%rcx leaq 48(%rdi),%rdi call __smulx_384_n_shift_by_31 movq %rdx,72(%rsp) movq %rcx,80(%rsp) movq 56(%rsp),%rdx movq 64(%rsp),%rcx leaq 96(%rsi),%rsi leaq 48(%rdi),%rdi call __smulx_384x63 movq 72(%rsp),%rdx movq 80(%rsp),%rcx leaq 56(%rdi),%rdi call __smulx_384x63 xorq $256+96,%rsi movl $31,%edi call __ab_approximation_31 movq %r12,72(%rsp) movq %r13,80(%rsp) movq $256,%rdi xorq %rsi,%rdi call __smulx_384_n_shift_by_31 movq %rdx,56(%rsp) movq %rcx,64(%rsp) movq 72(%rsp),%rdx movq 80(%rsp),%rcx leaq 48(%rdi),%rdi call __smulx_384_n_shift_by_31 movq %rdx,72(%rsp) movq %rcx,80(%rsp) movq 56(%rsp),%rdx movq 64(%rsp),%rcx leaq 96(%rsi),%rsi leaq 48(%rdi),%rdi call __smulx_384x63 movq 72(%rsp),%rdx movq 80(%rsp),%rcx leaq 56(%rdi),%rdi call __smulx_384x63 xorq $256+96,%rsi movl $31,%edi call __ab_approximation_31 movq %r12,72(%rsp) movq %r13,80(%rsp) movq $256,%rdi xorq %rsi,%rdi call __smulx_384_n_shift_by_31 movq %rdx,56(%rsp) movq %rcx,64(%rsp) movq 72(%rsp),%rdx movq 80(%rsp),%rcx leaq 48(%rdi),%rdi call __smulx_384_n_shift_by_31 movq %rdx,72(%rsp) movq %rcx,80(%rsp) movq 56(%rsp),%rdx movq 64(%rsp),%rcx leaq 96(%rsi),%rsi leaq 48(%rdi),%rdi call __smulx_384x63 movq 72(%rsp),%rdx movq 80(%rsp),%rcx leaq 56(%rdi),%rdi call __smulx_384x63 movq %r14,56(%rdi) movq %r14,64(%rdi) movq %r14,72(%rdi) movq %r14,80(%rdi) movq %r14,88(%rdi) xorq $256+96,%rsi movl $31,%edi call __ab_approximation_31 movq %r12,72(%rsp) movq %r13,80(%rsp) movq $256,%rdi xorq %rsi,%rdi call __smulx_384_n_shift_by_31 movq %rdx,56(%rsp) movq %rcx,64(%rsp) movq 72(%rsp),%rdx movq 80(%rsp),%rcx leaq 48(%rdi),%rdi call __smulx_384_n_shift_by_31 movq %rdx,72(%rsp) movq %rcx,80(%rsp) movq 56(%rsp),%rdx movq 64(%rsp),%rcx leaq 96(%rsi),%rsi leaq 48(%rdi),%rdi call __smulx_384x63 movq 72(%rsp),%rdx movq 80(%rsp),%rcx leaq 56(%rdi),%rdi call __smulx_768x63 xorq $256+96,%rsi movl $31,%edi call __ab_approximation_31 movq %r12,72(%rsp) movq %r13,80(%rsp) movq $256,%rdi xorq %rsi,%rdi call __smulx_384_n_shift_by_31 movq %rdx,56(%rsp) movq %rcx,64(%rsp) movq 72(%rsp),%rdx movq 80(%rsp),%rcx leaq 48(%rdi),%rdi call __smulx_384_n_shift_by_31 movq %rdx,72(%rsp) movq %rcx,80(%rsp) movq 56(%rsp),%rdx movq 64(%rsp),%rcx leaq 96(%rsi),%rsi leaq 48(%rdi),%rdi call __smulx_384x63 movq 72(%rsp),%rdx movq 80(%rsp),%rcx leaq 56(%rdi),%rdi call __smulx_768x63 xorq $256+96,%rsi movl $31,%edi call __ab_approximation_31 movq %r12,72(%rsp) movq %r13,80(%rsp) movq $256,%rdi xorq %rsi,%rdi call __smulx_384_n_shift_by_31 movq %rdx,56(%rsp) movq %rcx,64(%rsp) movq 72(%rsp),%rdx movq 80(%rsp),%rcx leaq 48(%rdi),%rdi call __smulx_384_n_shift_by_31 movq %rdx,72(%rsp) movq %rcx,80(%rsp) movq 56(%rsp),%rdx movq 64(%rsp),%rcx leaq 96(%rsi),%rsi leaq 48(%rdi),%rdi call __smulx_384x63 movq 72(%rsp),%rdx 
movq 80(%rsp),%rcx leaq 56(%rdi),%rdi call __smulx_768x63 xorq $256+96,%rsi movl $31,%edi call __ab_approximation_31 movq %r12,72(%rsp) movq %r13,80(%rsp) movq $256,%rdi xorq %rsi,%rdi call __smulx_384_n_shift_by_31 movq %rdx,56(%rsp) movq %rcx,64(%rsp) movq 72(%rsp),%rdx movq 80(%rsp),%rcx leaq 48(%rdi),%rdi call __smulx_384_n_shift_by_31 movq %rdx,72(%rsp) movq %rcx,80(%rsp) movq 56(%rsp),%rdx movq 64(%rsp),%rcx leaq 96(%rsi),%rsi leaq 48(%rdi),%rdi call __smulx_384x63 movq 72(%rsp),%rdx movq 80(%rsp),%rcx leaq 56(%rdi),%rdi call __smulx_768x63 xorq $256+96,%rsi movl $31,%edi call __ab_approximation_31 movq %r12,72(%rsp) movq %r13,80(%rsp) movq $256,%rdi xorq %rsi,%rdi call __smulx_384_n_shift_by_31 movq %rdx,56(%rsp) movq %rcx,64(%rsp) movq 72(%rsp),%rdx movq 80(%rsp),%rcx leaq 48(%rdi),%rdi call __smulx_384_n_shift_by_31 movq %rdx,72(%rsp) movq %rcx,80(%rsp) movq 56(%rsp),%rdx movq 64(%rsp),%rcx leaq 96(%rsi),%rsi leaq 48(%rdi),%rdi call __smulx_384x63 movq 72(%rsp),%rdx movq 80(%rsp),%rcx leaq 56(%rdi),%rdi call __smulx_768x63 xorq $256+96,%rsi movl $31,%edi call __ab_approximation_31 movq %r12,72(%rsp) movq %r13,80(%rsp) movq $256,%rdi xorq %rsi,%rdi call __smulx_384_n_shift_by_31 movq %rdx,56(%rsp) movq %rcx,64(%rsp) movq 72(%rsp),%rdx movq 80(%rsp),%rcx leaq 48(%rdi),%rdi call __smulx_384_n_shift_by_31 movq %rdx,72(%rsp) movq %rcx,80(%rsp) movq 56(%rsp),%rdx movq 64(%rsp),%rcx leaq 96(%rsi),%rsi leaq 48(%rdi),%rdi call __smulx_384x63 movq 72(%rsp),%rdx movq 80(%rsp),%rcx leaq 56(%rdi),%rdi call __smulx_768x63 xorq $256+96,%rsi movl $31,%edi call __ab_approximation_31 movq %r12,72(%rsp) movq %r13,80(%rsp) movq $256,%rdi xorq %rsi,%rdi call __smulx_384_n_shift_by_31 movq %rdx,56(%rsp) movq %rcx,64(%rsp) movq 72(%rsp),%rdx movq 80(%rsp),%rcx leaq 48(%rdi),%rdi call __smulx_384_n_shift_by_31 movq %rdx,72(%rsp) movq %rcx,80(%rsp) movq 56(%rsp),%rdx movq 64(%rsp),%rcx leaq 96(%rsi),%rsi leaq 48(%rdi),%rdi call __smulx_384x63 movq 72(%rsp),%rdx movq 80(%rsp),%rcx leaq 56(%rdi),%rdi call __smulx_768x63 xorq $256+96,%rsi movl $31,%edi call __ab_approximation_31 movq %r12,72(%rsp) movq %r13,80(%rsp) movq $256,%rdi xorq %rsi,%rdi call __smulx_191_n_shift_by_31 movq %rdx,56(%rsp) movq %rcx,64(%rsp) movq 72(%rsp),%rdx movq 80(%rsp),%rcx leaq 48(%rdi),%rdi call __smulx_191_n_shift_by_31 movq %rdx,72(%rsp) movq %rcx,80(%rsp) movq 56(%rsp),%rdx movq 64(%rsp),%rcx leaq 96(%rsi),%rsi leaq 48(%rdi),%rdi call __smulx_384x63 movq 72(%rsp),%rdx movq 80(%rsp),%rcx leaq 56(%rdi),%rdi call __smulx_768x63 xorq $256+96,%rsi movl $31,%edi call __ab_approximation_31 movq %r12,72(%rsp) movq %r13,80(%rsp) movq $256,%rdi xorq %rsi,%rdi call __smulx_191_n_shift_by_31 movq %rdx,56(%rsp) movq %rcx,64(%rsp) movq 72(%rsp),%rdx movq 80(%rsp),%rcx leaq 48(%rdi),%rdi call __smulx_191_n_shift_by_31 movq %rdx,72(%rsp) movq %rcx,80(%rsp) movq 56(%rsp),%rdx movq 64(%rsp),%rcx leaq 96(%rsi),%rsi leaq 48(%rdi),%rdi call __smulx_384x63 movq 72(%rsp),%rdx movq 80(%rsp),%rcx leaq 56(%rdi),%rdi call __smulx_768x63 xorq $256+96,%rsi movl $31,%edi call __ab_approximation_31 movq %r12,72(%rsp) movq %r13,80(%rsp) movq $256,%rdi xorq %rsi,%rdi call __smulx_191_n_shift_by_31 movq %rdx,56(%rsp) movq %rcx,64(%rsp) movq 72(%rsp),%rdx movq 80(%rsp),%rcx leaq 48(%rdi),%rdi call __smulx_191_n_shift_by_31 movq %rdx,72(%rsp) movq %rcx,80(%rsp) movq 56(%rsp),%rdx movq 64(%rsp),%rcx leaq 96(%rsi),%rsi leaq 48(%rdi),%rdi call __smulx_384x63 movq 72(%rsp),%rdx movq 80(%rsp),%rcx leaq 56(%rdi),%rdi call __smulx_768x63 xorq $256+96,%rsi movl $31,%edi 
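/* Another of the identical unrolled rounds: __ab_approximation_31
   condenses |a| and |b| and returns the transition factors in
   %rdx, %rcx, %r12, %r13; the two __smulx_191_n_shift_by_31 calls apply
   them to the (by now short) |a| and |b|, and the following
   __smulx_384x63/__smulx_768x63 calls apply the same factors to the
   full-width |u| and |v|. */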
call __ab_approximation_31 movq %r12,72(%rsp) movq %r13,80(%rsp) movq $256,%rdi xorq %rsi,%rdi call __smulx_191_n_shift_by_31 movq %rdx,56(%rsp) movq %rcx,64(%rsp) movq 72(%rsp),%rdx movq 80(%rsp),%rcx leaq 48(%rdi),%rdi call __smulx_191_n_shift_by_31 movq %rdx,72(%rsp) movq %rcx,80(%rsp) movq 56(%rsp),%rdx movq 64(%rsp),%rcx leaq 96(%rsi),%rsi leaq 48(%rdi),%rdi call __smulx_384x63 movq 72(%rsp),%rdx movq 80(%rsp),%rcx leaq 56(%rdi),%rdi call __smulx_768x63 xorq $256+96,%rsi movl $55,%edi movq 0(%rsi),%r8 movq 48(%rsi),%r10 call __tail_loop_55 leaq 96(%rsi),%rsi movq %r12,%rdx movq %r13,%rcx movq 32(%rsp),%rdi call __smulx_768x63 movq 40(%rsp),%rsi movq %rdx,%r13 sarq $63,%r13 movq %r13,%r8 movq %r13,%r9 movq %r13,%r10 #ifdef __SGX_LVI_HARDENING__ lfence #endif andq 0(%rsi),%r8 andq 8(%rsi),%r9 movq %r13,%r11 andq 16(%rsi),%r10 andq 24(%rsi),%r11 movq %r13,%r12 andq 32(%rsi),%r12 andq 40(%rsi),%r13 addq %r8,%r14 adcq %r9,%r15 adcq %r10,%rbx adcq %r11,%rbp adcq %r12,%rcx adcq %r13,%rax adcq $0,%rdx movq %rdx,%r13 negq %rdx orq %rdx,%r13 sarq $63,%rdx movq %r13,%r8 movq %r13,%r9 movq %r13,%r10 andq 0(%rsi),%r8 andq 8(%rsi),%r9 movq %r13,%r11 andq 16(%rsi),%r10 andq 24(%rsi),%r11 movq %r13,%r12 andq 32(%rsi),%r12 andq 40(%rsi),%r13 xorq %rdx,%r8 xorq %rsi,%rsi xorq %rdx,%r9 subq %rdx,%rsi xorq %rdx,%r10 xorq %rdx,%r11 xorq %rdx,%r12 xorq %rdx,%r13 addq %rsi,%r8 adcq $0,%r9 adcq $0,%r10 adcq $0,%r11 adcq $0,%r12 adcq $0,%r13 addq %r8,%r14 adcq %r9,%r15 adcq %r10,%rbx adcq %r11,%rbp adcq %r12,%rcx adcq %r13,%rax movq %r14,48(%rdi) movq %r15,56(%rdi) movq %rbx,64(%rdi) movq %rbp,72(%rdi) movq %rcx,80(%rdi) movq %rax,88(%rdi) leaq 1112(%rsp),%r8 movq 0(%r8),%r15 .cfi_restore %r15 movq 8(%r8),%r14 .cfi_restore %r14 movq 16(%r8),%r13 .cfi_restore %r13 movq 24(%r8),%r12 .cfi_restore %r12 movq 32(%r8),%rbx .cfi_restore %rbx movq 40(%r8),%rbp .cfi_restore %rbp leaq 48(%r8),%rsp .cfi_adjust_cfa_offset -1112-8*6 #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .size ctx_inverse_mod_384,.-ctx_inverse_mod_384 .type __smulx_768x63,@function .align 32 __smulx_768x63: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa movq 0(%rsi),%r8 movq 8(%rsi),%r9 movq 16(%rsi),%r10 movq 24(%rsi),%r11 movq 32(%rsi),%r12 movq 40(%rsi),%r13 movq 48(%rsi),%r14 movq %rdx,%rax sarq $63,%rax xorq %rbp,%rbp subq %rax,%rbp movq %rdi,8(%rsp) movq %rsi,16(%rsp) leaq 56(%rsi),%rsi xorq %rax,%rdx addq %rbp,%rdx xorq %rax,%r8 xorq %rax,%r9 xorq %rax,%r10 xorq %rax,%r11 xorq %rax,%r12 xorq %rax,%r13 xorq %rax,%r14 addq %rbp,%r8 adcq $0,%r9 adcq $0,%r10 adcq $0,%r11 adcq $0,%r12 adcq $0,%r13 adcq $0,%r14 andq %rdx,%r14 negq %r14 mulxq %r8,%r8,%rbp mulxq %r9,%r9,%rax addq %rbp,%r9 mulxq %r10,%r10,%rbp adcq %rax,%r10 mulxq %r11,%r11,%rax adcq %rbp,%r11 mulxq %r12,%r12,%rbp adcq %rax,%r12 mulxq %r13,%r13,%rax adcq %rbp,%r13 adcq %rax,%r14 movq %r8,0(%rdi) movq %r9,8(%rdi) movq %r10,16(%rdi) movq %r11,24(%rdi) movq %r12,32(%rdi) movq %r13,40(%rdi) movq %r14,48(%rdi) sarq $63,%r14 movq %r14,56(%rdi) movq %rcx,%rdx movq %rcx,%rax movq 0(%rsi),%r8 movq 8(%rsi),%r9 movq 16(%rsi),%r10 movq 24(%rsi),%r11 movq 32(%rsi),%r12 movq 40(%rsi),%r13 movq 48(%rsi),%r14 movq 56(%rsi),%r15 movq 64(%rsi),%rbx movq 72(%rsi),%rbp movq 80(%rsi),%rcx movq 88(%rsi),%rdi sarq $63,%rax xorq %rsi,%rsi subq %rax,%rsi xorq %rax,%rdx addq %rsi,%rdx xorq %rax,%r8 xorq %rax,%r9 xorq %rax,%r10 xorq %rax,%r11 xorq %rax,%r12 xorq %rax,%r13 xorq %rax,%r14 xorq %rax,%r15 xorq %rax,%rbx xorq %rax,%rbp xorq %rax,%rcx xorq %rdi,%rax 
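/* Second half of __smulx_768x63: all twelve limbs of |v| are
   conditionally negated (note the top limb is folded straight into
   %rax) and then multiplied by the 63-bit factor with a mulx chain;
   mulx leaves the flags intact, so the add/adc carry chain of the
   accumulation can run uninterrupted. */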
addq %rsi,%r8 adcq $0,%r9 adcq $0,%r10 adcq $0,%r11 adcq $0,%r12 adcq $0,%r13 adcq $0,%r14 adcq $0,%r15 adcq $0,%rbx adcq $0,%rbp adcq $0,%rcx adcq $0,%rax mulxq %r8,%r8,%rsi mulxq %r9,%r9,%rdi addq %rsi,%r9 mulxq %r10,%r10,%rsi adcq %rdi,%r10 mulxq %r11,%r11,%rdi adcq %rsi,%r11 mulxq %r12,%r12,%rsi adcq %rdi,%r12 mulxq %r13,%r13,%rdi adcq %rsi,%r13 mulxq %r14,%r14,%rsi adcq %rdi,%r14 mulxq %r15,%r15,%rdi adcq %rsi,%r15 mulxq %rbx,%rbx,%rsi adcq %rdi,%rbx mulxq %rbp,%rbp,%rdi adcq %rsi,%rbp mulxq %rcx,%rcx,%rsi adcq %rdi,%rcx movq 8(%rsp),%rdi adcq $0,%rsi imulq %rdx addq %rsi,%rax adcq $0,%rdx addq 0(%rdi),%r8 adcq 8(%rdi),%r9 adcq 16(%rdi),%r10 adcq 24(%rdi),%r11 adcq 32(%rdi),%r12 adcq 40(%rdi),%r13 adcq 48(%rdi),%r14 movq 56(%rdi),%rsi adcq %rsi,%r15 adcq %rsi,%rbx adcq %rsi,%rbp adcq %rsi,%rcx adcq %rsi,%rax adcq %rsi,%rdx movq 16(%rsp),%rsi movq %r8,0(%rdi) movq %r9,8(%rdi) movq %r10,16(%rdi) movq %r11,24(%rdi) movq %r12,32(%rdi) movq %r13,40(%rdi) movq %r14,48(%rdi) movq %r15,56(%rdi) movq %rbx,64(%rdi) movq %rbp,72(%rdi) movq %rcx,80(%rdi) movq %rax,88(%rdi) #ifdef __SGX_LVI_HARDENING__ popq %r8 lfence jmpq *%r8 ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .size __smulx_768x63,.-__smulx_768x63 .type __smulx_384x63,@function .align 32 __smulx_384x63: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa movq 0+0(%rsi),%r8 movq 0+8(%rsi),%r9 movq 0+16(%rsi),%r10 movq 0+24(%rsi),%r11 movq 0+32(%rsi),%r12 movq 0+40(%rsi),%r13 movq 0+48(%rsi),%r14 movq %rdx,%rbp sarq $63,%rbp xorq %rax,%rax subq %rbp,%rax xorq %rbp,%rdx addq %rax,%rdx xorq %rbp,%r8 xorq %rbp,%r9 xorq %rbp,%r10 xorq %rbp,%r11 xorq %rbp,%r12 xorq %rbp,%r13 xorq %rbp,%r14 addq %rax,%r8 adcq $0,%r9 adcq $0,%r10 adcq $0,%r11 adcq $0,%r12 adcq $0,%r13 adcq $0,%r14 andq %rdx,%r14 negq %r14 mulxq %r8,%r8,%rbp mulxq %r9,%r9,%rax addq %rbp,%r9 mulxq %r10,%r10,%rbp adcq %rax,%r10 mulxq %r11,%r11,%rax adcq %rbp,%r11 mulxq %r12,%r12,%rbp adcq %rax,%r12 mulxq %r13,%r13,%rax movq %rcx,%rdx adcq %rbp,%r13 adcq %rax,%r14 movq %r8,0(%rdi) movq %r9,8(%rdi) movq %r10,16(%rdi) movq %r11,24(%rdi) movq %r12,32(%rdi) movq %r13,%r15 movq %r14,%rbx movq 56+0(%rsi),%r8 movq 56+8(%rsi),%r9 movq 56+16(%rsi),%r10 movq 56+24(%rsi),%r11 movq 56+32(%rsi),%r12 movq 56+40(%rsi),%r13 movq 56+48(%rsi),%r14 movq %rdx,%rbp sarq $63,%rbp xorq %rax,%rax subq %rbp,%rax xorq %rbp,%rdx addq %rax,%rdx xorq %rbp,%r8 xorq %rbp,%r9 xorq %rbp,%r10 xorq %rbp,%r11 xorq %rbp,%r12 xorq %rbp,%r13 xorq %rbp,%r14 addq %rax,%r8 adcq $0,%r9 adcq $0,%r10 adcq $0,%r11 adcq $0,%r12 adcq $0,%r13 adcq $0,%r14 andq %rdx,%r14 negq %r14 mulxq %r8,%r8,%rbp mulxq %r9,%r9,%rax addq %rbp,%r9 mulxq %r10,%r10,%rbp adcq %rax,%r10 mulxq %r11,%r11,%rax adcq %rbp,%r11 mulxq %r12,%r12,%rbp adcq %rax,%r12 mulxq %r13,%r13,%rax adcq %rbp,%r13 adcq %rax,%r14 addq 0(%rdi),%r8 adcq 8(%rdi),%r9 adcq 16(%rdi),%r10 adcq 24(%rdi),%r11 adcq 32(%rdi),%r12 adcq %r15,%r13 adcq %rbx,%r14 movq %r8,0(%rdi) movq %r9,8(%rdi) movq %r10,16(%rdi) movq %r11,24(%rdi) movq %r12,32(%rdi) movq %r13,40(%rdi) movq %r14,48(%rdi) #ifdef __SGX_LVI_HARDENING__ popq %r8 lfence jmpq *%r8 ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .size __smulx_384x63,.-__smulx_384x63 .type __smulx_384_n_shift_by_31,@function .align 32 __smulx_384_n_shift_by_31: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa movq %rdx,%rbx movq 0+0(%rsi),%r8 movq 0+8(%rsi),%r9 movq 0+16(%rsi),%r10 movq 0+24(%rsi),%r11 movq 0+32(%rsi),%r12 movq 0+40(%rsi),%r13 movq %rdx,%rax sarq $63,%rax xorq %rbp,%rbp subq %rax,%rbp xorq %rax,%rdx addq %rbp,%rdx xorq %rax,%r8 xorq %rax,%r9 xorq 
%rax,%r10 xorq %rax,%r11 xorq %rax,%r12 xorq %rax,%r13 addq %rbp,%r8 adcq $0,%r9 adcq $0,%r10 adcq $0,%r11 adcq $0,%r12 adcq $0,%r13 andq %rdx,%rax negq %rax mulxq %r8,%r8,%rbp mulxq %r9,%r9,%r14 addq %rbp,%r9 mulxq %r10,%r10,%rbp adcq %r14,%r10 mulxq %r11,%r11,%r14 adcq %rbp,%r11 mulxq %r12,%r12,%rbp adcq %r14,%r12 mulxq %r13,%r13,%r14 adcq %rbp,%r13 adcq %rax,%r14 movq %rcx,%rdx movq %r8,0(%rdi) movq %r9,8(%rdi) movq %r10,16(%rdi) movq %r11,24(%rdi) movq %r12,32(%rdi) movq %r13,40(%rdi) movq %r14,%r15 movq 48+0(%rsi),%r8 movq 48+8(%rsi),%r9 movq 48+16(%rsi),%r10 movq 48+24(%rsi),%r11 movq 48+32(%rsi),%r12 movq 48+40(%rsi),%r13 movq %rdx,%rax sarq $63,%rax xorq %rbp,%rbp subq %rax,%rbp xorq %rax,%rdx addq %rbp,%rdx xorq %rax,%r8 xorq %rax,%r9 xorq %rax,%r10 xorq %rax,%r11 xorq %rax,%r12 xorq %rax,%r13 addq %rbp,%r8 adcq $0,%r9 adcq $0,%r10 adcq $0,%r11 adcq $0,%r12 adcq $0,%r13 andq %rdx,%rax negq %rax mulxq %r8,%r8,%rbp mulxq %r9,%r9,%r14 addq %rbp,%r9 mulxq %r10,%r10,%rbp adcq %r14,%r10 mulxq %r11,%r11,%r14 adcq %rbp,%r11 mulxq %r12,%r12,%rbp adcq %r14,%r12 mulxq %r13,%r13,%r14 adcq %rbp,%r13 adcq %rax,%r14 addq 0(%rdi),%r8 adcq 8(%rdi),%r9 adcq 16(%rdi),%r10 adcq 24(%rdi),%r11 adcq 32(%rdi),%r12 adcq 40(%rdi),%r13 adcq %r15,%r14 movq %rbx,%rdx shrdq $31,%r9,%r8 shrdq $31,%r10,%r9 shrdq $31,%r11,%r10 shrdq $31,%r12,%r11 shrdq $31,%r13,%r12 shrdq $31,%r14,%r13 sarq $63,%r14 xorq %rbp,%rbp subq %r14,%rbp xorq %r14,%r8 xorq %r14,%r9 xorq %r14,%r10 xorq %r14,%r11 xorq %r14,%r12 xorq %r14,%r13 addq %rbp,%r8 adcq $0,%r9 adcq $0,%r10 adcq $0,%r11 adcq $0,%r12 adcq $0,%r13 movq %r8,0(%rdi) movq %r9,8(%rdi) movq %r10,16(%rdi) movq %r11,24(%rdi) movq %r12,32(%rdi) movq %r13,40(%rdi) xorq %r14,%rdx xorq %r14,%rcx addq %rbp,%rdx addq %rbp,%rcx #ifdef __SGX_LVI_HARDENING__ popq %r8 lfence jmpq *%r8 ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .size __smulx_384_n_shift_by_31,.-__smulx_384_n_shift_by_31 .type __smulx_191_n_shift_by_31,@function .align 32 __smulx_191_n_shift_by_31: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa movq %rdx,%rbx movq 0+0(%rsi),%r8 movq 0+8(%rsi),%r9 movq 0+16(%rsi),%r10 movq %rdx,%rax sarq $63,%rax xorq %rbp,%rbp subq %rax,%rbp xorq %rax,%rdx addq %rbp,%rdx xorq %rax,%r8 xorq %rax,%r9 xorq %r10,%rax addq %rbp,%r8 adcq $0,%r9 adcq $0,%rax mulxq %r8,%r8,%rbp mulxq %r9,%r9,%r10 addq %rbp,%r9 adcq $0,%r10 imulq %rdx addq %rax,%r10 adcq $0,%rdx movq %rdx,%r14 movq %rcx,%rdx movq 48+0(%rsi),%r11 movq 48+8(%rsi),%r12 movq 48+16(%rsi),%r13 movq %rdx,%rax sarq $63,%rax xorq %rbp,%rbp subq %rax,%rbp xorq %rax,%rdx addq %rbp,%rdx xorq %rax,%r11 xorq %rax,%r12 xorq %r13,%rax addq %rbp,%r11 adcq $0,%r12 adcq $0,%rax mulxq %r11,%r11,%rbp mulxq %r12,%r12,%r13 addq %rbp,%r12 adcq $0,%r13 imulq %rdx addq %rax,%r13 adcq $0,%rdx addq %r8,%r11 adcq %r9,%r12 adcq %r10,%r13 adcq %rdx,%r14 movq %rbx,%rdx shrdq $31,%r12,%r11 shrdq $31,%r13,%r12 shrdq $31,%r14,%r13 sarq $63,%r14 xorq %rbp,%rbp subq %r14,%rbp xorq %r14,%r11 xorq %r14,%r12 xorq %r14,%r13 addq %rbp,%r11 adcq $0,%r12 adcq $0,%r13 movq %r11,0(%rdi) movq %r12,8(%rdi) movq %r13,16(%rdi) xorq %r14,%rdx xorq %r14,%rcx addq %rbp,%rdx addq %rbp,%rcx #ifdef __SGX_LVI_HARDENING__ popq %r8 lfence jmpq *%r8 ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .size __smulx_191_n_shift_by_31,.-__smulx_191_n_shift_by_31 .type __ab_approximation_31,@function .align 32 __ab_approximation_31: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa movq 40(%rsi),%r9 movq 88(%rsi),%r11 movq 32(%rsi),%rbx movq 80(%rsi),%rbp movq 24(%rsi),%r8 movq 72(%rsi),%r10 movq %r9,%rax orq 
%r11,%rax cmovzq %rbx,%r9 cmovzq %rbp,%r11 cmovzq %r8,%rbx movq 16(%rsi),%r8 cmovzq %r10,%rbp movq 64(%rsi),%r10 movq %r9,%rax orq %r11,%rax cmovzq %rbx,%r9 cmovzq %rbp,%r11 cmovzq %r8,%rbx movq 8(%rsi),%r8 cmovzq %r10,%rbp movq 56(%rsi),%r10 movq %r9,%rax orq %r11,%rax cmovzq %rbx,%r9 cmovzq %rbp,%r11 cmovzq %r8,%rbx movq 0(%rsi),%r8 cmovzq %r10,%rbp movq 48(%rsi),%r10 movq %r9,%rax orq %r11,%rax cmovzq %rbx,%r9 cmovzq %rbp,%r11 cmovzq %r8,%rbx cmovzq %r10,%rbp movq %r9,%rax orq %r11,%rax bsrq %rax,%rcx leaq 1(%rcx),%rcx cmovzq %r8,%r9 cmovzq %r10,%r11 cmovzq %rax,%rcx negq %rcx shldq %cl,%rbx,%r9 shldq %cl,%rbp,%r11 movl $0x7FFFFFFF,%eax andq %rax,%r8 andq %rax,%r10 andnq %r9,%rax,%r9 andnq %r11,%rax,%r11 orq %r9,%r8 orq %r11,%r10 jmp __inner_loop_31 #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .size __ab_approximation_31,.-__ab_approximation_31 .type __inner_loop_31,@function .align 32 __inner_loop_31: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa movq $0x7FFFFFFF80000000,%rcx movq $0x800000007FFFFFFF,%r13 movq $0x7FFFFFFF7FFFFFFF,%r15 .Loop_31: cmpq %r10,%r8 movq %r8,%rax movq %r10,%rbx movq %rcx,%rbp movq %r13,%r14 cmovbq %r10,%r8 cmovbq %rax,%r10 cmovbq %r13,%rcx cmovbq %rbp,%r13 subq %r10,%r8 subq %r13,%rcx addq %r15,%rcx testq $1,%rax cmovzq %rax,%r8 cmovzq %rbx,%r10 cmovzq %rbp,%rcx cmovzq %r14,%r13 shrq $1,%r8 addq %r13,%r13 subq %r15,%r13 subl $1,%edi jnz .Loop_31 shrq $32,%r15 movl %ecx,%edx movl %r13d,%r12d shrq $32,%rcx shrq $32,%r13 subq %r15,%rdx subq %r15,%rcx subq %r15,%r12 subq %r15,%r13 #ifdef __SGX_LVI_HARDENING__ popq %r8 lfence jmpq *%r8 ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .size __inner_loop_31,.-__inner_loop_31 .type __tail_loop_55,@function .align 32 __tail_loop_55: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa movq $1,%rdx xorq %rcx,%rcx xorq %r12,%r12 movq $1,%r13 .Loop_55: xorq %rax,%rax testq $1,%r8 movq %r10,%rbx cmovnzq %r10,%rax subq %r8,%rbx movq %r8,%rbp subq %rax,%r8 cmovcq %rbx,%r8 cmovcq %rbp,%r10 movq %rdx,%rax cmovcq %r12,%rdx cmovcq %rax,%r12 movq %rcx,%rbx cmovcq %r13,%rcx cmovcq %rbx,%r13 xorq %rax,%rax xorq %rbx,%rbx shrq $1,%r8 testq $1,%rbp cmovnzq %r12,%rax cmovnzq %r13,%rbx addq %r12,%r12 addq %r13,%r13 subq %rax,%rdx subq %rbx,%rcx subl $1,%edi jnz .Loop_55 #ifdef __SGX_LVI_HARDENING__ popq %r8 lfence jmpq *%r8 ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .size __tail_loop_55,.-__tail_loop_55 .section .note.GNU-stack,"",@progbits #ifndef __SGX_LVI_HARDENING__ .section .note.gnu.property,"a",@note .long 4,2f-1f,5 .byte 0x47,0x4E,0x55,0 1: .long 0xc0000002,4,3 .align 8 2: #endif ================================================ FILE: build/elf/div3w-armv8.S ================================================ #if defined(__ARM_FEATURE_PAC_DEFAULT) && __ARM_FEATURE_PAC_DEFAULT==2 # define PACI_HINT 27 # define AUTI_HINT 31 #else # define PACI_HINT 25 # define AUTI_HINT 29 #endif .text .globl div_3_limbs .hidden div_3_limbs .type div_3_limbs,%function .align 5 div_3_limbs: hint #34 ldp x4,x5,[x0] // load R eor x0,x0,x0 // Q = 0 mov x3,#64 // loop counter nop .Loop: subs x6,x4,x1 // R - D add x0,x0,x0 // Q <<= 1 sbcs x7,x5,x2 add x0,x0,#1 // Q + speculative bit csel x4,x4,x6,lo // select between R and R - D extr x1,x2,x1,#1 // D >>= 1 csel x5,x5,x7,lo lsr x2,x2,#1 sbc x0,x0,xzr // subtract speculative bit sub x3,x3,#1 cbnz x3,.Loop asr x3,x0,#63 // top bit -> mask add x0,x0,x0 // Q <<= 1 subs x6,x4,x1 // R - D add x0,x0,#1 // Q + speculative bit sbcs x7,x5,x2 sbc x0,x0,xzr // subtract 
// speculative bit
	orr	x0,x0,x3		// all ones if overflow
	ret
.size	div_3_limbs,.-div_3_limbs
.globl	quot_rem_128
.hidden	quot_rem_128
.type	quot_rem_128,%function
.align	5
quot_rem_128:
	hint	#34
	ldp	x3,x4,[x1]

	mul	x5,x3,x2		// divisor[0:1] * quotient
	umulh	x6,x3,x2
	mul	x11,x4,x2
	umulh	x7,x4,x2

	ldp	x8,x9,[x0]		// load 3 limbs of the dividend
	ldr	x10,[x0,#16]

	adds	x6,x6,x11
	adc	x7,x7,xzr

	subs	x8,x8,x5		// dividend - divisor * quotient
	sbcs	x9,x9,x6
	sbcs	x10,x10,x7
	sbc	x5,xzr,xzr		// borrow -> mask

	add	x2,x2,x5		// if borrowed, adjust the quotient ...
	and	x3,x3,x5
	and	x4,x4,x5
	adds	x8,x8,x3		// ... and add divisor
	adc	x9,x9,x4

	stp	x8,x9,[x0]		// save 2 limbs of the remainder
	str	x2,[x0,#16]		// and one limb of the quotient

	mov	x0,x2			// return adjusted quotient
	ret
.size	quot_rem_128,.-quot_rem_128

.globl	quot_rem_64
.hidden	quot_rem_64
.type	quot_rem_64,%function
.align	5
quot_rem_64:
	hint	#34
	ldr	x3,[x1]
	ldr	x8,[x0]		// load 1 limb of the dividend

	mul	x5,x3,x2		// divisor * quotient
	sub	x8,x8,x5		// dividend - divisor * quotient

	stp	x8,x2,[x0]		// save remainder and quotient

	mov	x0,x2			// return quotient
	ret
.size	quot_rem_64,.-quot_rem_64
#if defined(__ARM_FEATURE_BTI_DEFAULT) || defined(__ARM_FEATURE_PAC_DEFAULT)
.section	.note.GNU-stack,"",@progbits
.section	.note.gnu.property,"a",@note
.long	4,2f-1f,5
.byte	0x47,0x4E,0x55,0
1:	.long	0xc0000000,4,3
.align	3
2:
#endif

================================================
FILE: build/elf/div3w-x86_64.s
================================================

.text
.globl	div_3_limbs
.hidden	div_3_limbs
.type	div_3_limbs,@function
.align	32
div_3_limbs:
.cfi_startproc
	.byte	0xf3,0x0f,0x1e,0xfa
#ifdef	__SGX_LVI_HARDENING__
	lfence
#endif
	movq	(%rdi),%r8
	movq	8(%rdi),%r9
	xorq	%rax,%rax
	movl	$64,%ecx
.Loop:
	movq	%r8,%r10
	subq	%rsi,%r8
	movq	%r9,%r11
	sbbq	%rdx,%r9
	leaq	1(%rax,%rax,1),%rax
	movq	%rdx,%rdi
	cmovcq	%r10,%r8
	cmovcq	%r11,%r9
	sbbq	$0,%rax
	shlq	$63,%rdi
	shrq	$1,%rsi
	shrq	$1,%rdx
	orq	%rdi,%rsi
	subl	$1,%ecx
	jnz	.Loop

	leaq	1(%rax,%rax,1),%rcx
	sarq	$63,%rax

	subq	%rsi,%r8
	sbbq	%rdx,%r9
	sbbq	$0,%rcx

	orq	%rcx,%rax
#ifdef	__SGX_LVI_HARDENING__
	popq	%rdx
	lfence
	jmpq	*%rdx
	ud2
#else
	.byte	0xf3,0xc3
#endif
.cfi_endproc
.size	div_3_limbs,.-div_3_limbs
.globl	quot_rem_128
.hidden	quot_rem_128
.type	quot_rem_128,@function
.align	32
quot_rem_128:
.cfi_startproc
	.byte	0xf3,0x0f,0x1e,0xfa
#ifdef	__SGX_LVI_HARDENING__
	lfence
#endif
	movq	%rdx,%rax
	movq	%rdx,%rcx

	mulq	0(%rsi)
	movq	%rax,%r8
	movq	%rcx,%rax
	movq	%rdx,%r9

	mulq	8(%rsi)
	addq	%rax,%r9
	adcq	$0,%rdx

	movq	0(%rdi),%r10
	movq	8(%rdi),%r11
	movq	16(%rdi),%rax

	subq	%r8,%r10
	sbbq	%r9,%r11
	sbbq	%rdx,%rax
	sbbq	%r8,%r8

	addq	%r8,%rcx
	movq	%r8,%r9
	andq	0(%rsi),%r8
	andq	8(%rsi),%r9
	addq	%r8,%r10
	adcq	%r9,%r11

	movq	%r10,0(%rdi)
	movq	%r11,8(%rdi)
	movq	%rcx,16(%rdi)

	movq	%rcx,%rax
#ifdef	__SGX_LVI_HARDENING__
	popq	%rdx
	lfence
	jmpq	*%rdx
	ud2
#else
	.byte	0xf3,0xc3
#endif
.cfi_endproc
.size	quot_rem_128,.-quot_rem_128

.globl	quot_rem_64
.hidden	quot_rem_64
.type	quot_rem_64,@function
.align	32
quot_rem_64:
.cfi_startproc
	.byte	0xf3,0x0f,0x1e,0xfa
#ifdef	__SGX_LVI_HARDENING__
	lfence
#endif
	movq	%rdx,%rax
	imulq	0(%rsi),%rdx

	movq	0(%rdi),%r10

	subq	%rdx,%r10

	movq	%r10,0(%rdi)
	movq	%rax,8(%rdi)
#ifdef	__SGX_LVI_HARDENING__
	popq	%rdx
	lfence
	jmpq	*%rdx
	ud2
#else
	.byte	0xf3,0xc3
#endif
.cfi_endproc
.size	quot_rem_64,.-quot_rem_64
.section	.note.GNU-stack,"",@progbits
#ifndef	__SGX_LVI_HARDENING__
.section	.note.gnu.property,"a",@note
.long	4,2f-1f,5
.byte	0x47,0x4E,0x55,0
1:	.long	0xc0000002,4,3
.align	8
2:
#endif
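The two div3w implementations above share one algorithm, which is easier to follow in C. Below is a rough, illustrative model of div_3_limbs (identifiers and types are this sketch's own, assuming a compiler with unsigned __int128; this is not the library's reference code): 64 restoring-division rounds build a one-limb quotient estimate, which is saturated to all-ones on overflow, mirroring the .Loop bodies in both files.

#include <stdint.h>

typedef unsigned __int128 u128;

/* Rough model of div_3_limbs(): r_top holds the two most significant
 * limbs of the dividend, (d_hi:d_lo) the two-limb divisor. */
static uint64_t div_3_limbs_model(const uint64_t r_top[2],
                                  uint64_t d_lo, uint64_t d_hi)
{
    u128 R = ((u128)r_top[1] << 64) | r_top[0];
    u128 D = ((u128)d_hi << 64) | d_lo;
    uint64_t Q = 0, overflow;

    for (int i = 0; i < 64; i++) {
        Q <<= 1;                        /* speculative quotient bit */
        if (R >= D) { R -= D; Q |= 1; } /* keep it only if R-D fits */
        D >>= 1;
    }

    overflow = (uint64_t)((int64_t)Q >> 63); /* quotient too wide? */
    Q = (Q << 1) | (R >= D);                 /* one last bit       */

    return Q | overflow;                     /* all-ones on overflow */
}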
================================================ FILE: build/elf/mul_mont_256-armv8.S ================================================ #if defined(__ARM_FEATURE_PAC_DEFAULT) && __ARM_FEATURE_PAC_DEFAULT==2 # define PACI_HINT 27 # define AUTI_HINT 31 #else # define PACI_HINT 25 # define AUTI_HINT 29 #endif .text .globl mul_mont_sparse_256 .hidden mul_mont_sparse_256 .type mul_mont_sparse_256,%function .align 5 mul_mont_sparse_256: hint #34 stp x29,x30,[sp,#-8*__SIZEOF_POINTER__]! add x29,sp,#0 stp x19,x20,[sp,#2*__SIZEOF_POINTER__] stp x21,x22,[sp,#4*__SIZEOF_POINTER__] stp x23,x24,[sp,#6*__SIZEOF_POINTER__] ldp x10,x11,[x1] ldr x9, [x2] ldp x12,x13,[x1,#16] mul x19,x10,x9 ldp x5,x6,[x3] mul x20,x11,x9 ldp x7,x8,[x3,#16] mul x21,x12,x9 mul x22,x13,x9 umulh x14,x10,x9 umulh x15,x11,x9 mul x3,x4,x19 umulh x16,x12,x9 umulh x17,x13,x9 adds x20,x20,x14 //mul x14,x5,x3 adcs x21,x21,x15 mul x15,x6,x3 adcs x22,x22,x16 mul x16,x7,x3 adc x23,xzr, x17 mul x17,x8,x3 ldr x9,[x2,8*1] subs xzr,x19,#1 //adds x19,x19,x14 umulh x14,x5,x3 adcs x20,x20,x15 umulh x15,x6,x3 adcs x21,x21,x16 umulh x16,x7,x3 adcs x22,x22,x17 umulh x17,x8,x3 adc x23,x23,xzr adds x19,x20,x14 mul x14,x10,x9 adcs x20,x21,x15 mul x15,x11,x9 adcs x21,x22,x16 mul x16,x12,x9 adcs x22,x23,x17 mul x17,x13,x9 adc x23,xzr,xzr adds x19,x19,x14 umulh x14,x10,x9 adcs x20,x20,x15 umulh x15,x11,x9 adcs x21,x21,x16 mul x3,x4,x19 umulh x16,x12,x9 adcs x22,x22,x17 umulh x17,x13,x9 adc x23,x23,xzr adds x20,x20,x14 //mul x14,x5,x3 adcs x21,x21,x15 mul x15,x6,x3 adcs x22,x22,x16 mul x16,x7,x3 adc x23,x23,x17 mul x17,x8,x3 ldr x9,[x2,8*2] subs xzr,x19,#1 //adds x19,x19,x14 umulh x14,x5,x3 adcs x20,x20,x15 umulh x15,x6,x3 adcs x21,x21,x16 umulh x16,x7,x3 adcs x22,x22,x17 umulh x17,x8,x3 adc x23,x23,xzr adds x19,x20,x14 mul x14,x10,x9 adcs x20,x21,x15 mul x15,x11,x9 adcs x21,x22,x16 mul x16,x12,x9 adcs x22,x23,x17 mul x17,x13,x9 adc x23,xzr,xzr adds x19,x19,x14 umulh x14,x10,x9 adcs x20,x20,x15 umulh x15,x11,x9 adcs x21,x21,x16 mul x3,x4,x19 umulh x16,x12,x9 adcs x22,x22,x17 umulh x17,x13,x9 adc x23,x23,xzr adds x20,x20,x14 //mul x14,x5,x3 adcs x21,x21,x15 mul x15,x6,x3 adcs x22,x22,x16 mul x16,x7,x3 adc x23,x23,x17 mul x17,x8,x3 ldr x9,[x2,8*3] subs xzr,x19,#1 //adds x19,x19,x14 umulh x14,x5,x3 adcs x20,x20,x15 umulh x15,x6,x3 adcs x21,x21,x16 umulh x16,x7,x3 adcs x22,x22,x17 umulh x17,x8,x3 adc x23,x23,xzr adds x19,x20,x14 mul x14,x10,x9 adcs x20,x21,x15 mul x15,x11,x9 adcs x21,x22,x16 mul x16,x12,x9 adcs x22,x23,x17 mul x17,x13,x9 adc x23,xzr,xzr adds x19,x19,x14 umulh x14,x10,x9 adcs x20,x20,x15 umulh x15,x11,x9 adcs x21,x21,x16 mul x3,x4,x19 umulh x16,x12,x9 adcs x22,x22,x17 umulh x17,x13,x9 adc x23,x23,xzr adds x20,x20,x14 //mul x14,x5,x3 adcs x21,x21,x15 mul x15,x6,x3 adcs x22,x22,x16 mul x16,x7,x3 adc x23,x23,x17 mul x17,x8,x3 subs xzr,x19,#1 //adds x19,x19,x14 umulh x14,x5,x3 adcs x20,x20,x15 umulh x15,x6,x3 adcs x21,x21,x16 umulh x16,x7,x3 adcs x22,x22,x17 umulh x17,x8,x3 adc x23,x23,xzr adds x19,x20,x14 adcs x20,x21,x15 adcs x21,x22,x16 adcs x22,x23,x17 adc x23,xzr,xzr subs x14,x19,x5 sbcs x15,x20,x6 sbcs x16,x21,x7 sbcs x17,x22,x8 sbcs xzr, x23,xzr csel x19,x19,x14,lo csel x20,x20,x15,lo csel x21,x21,x16,lo csel x22,x22,x17,lo stp x19,x20,[x0] stp x21,x22,[x0,#16] ldp x19,x20,[x29,#2*__SIZEOF_POINTER__] ldp x21,x22,[x29,#4*__SIZEOF_POINTER__] ldp x23,x24,[x29,#6*__SIZEOF_POINTER__] ldr x29,[sp],#8*__SIZEOF_POINTER__ ret .size mul_mont_sparse_256,.-mul_mont_sparse_256 .globl sqr_mont_sparse_256 .hidden sqr_mont_sparse_256 .type 
sqr_mont_sparse_256,%function
.align	5
sqr_mont_sparse_256:
	hint	#PACI_HINT
	stp	x29,x30,[sp,#-6*__SIZEOF_POINTER__]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#2*__SIZEOF_POINTER__]
	stp	x21,x22,[sp,#4*__SIZEOF_POINTER__]

	ldp	x5,x6,[x1]
	ldp	x7,x8,[x1,#16]
	mov	x4,x3

////////////////////////////////////////////////////////////////
//  |  |  |  |  |  |a1*a0|  |
//  |  |  |  |  |a2*a0|  |  |
//  |  |a3*a2|a3*a0|  |  |  |
//  |  |  |  |a2*a1|  |  |  |
//  |  |  |a3*a1|  |  |  |  |
// *|  |  |  |  |  |  |  | 2|
// +|a3*a3|a2*a2|a1*a1|a0*a0|
//  |--+--+--+--+--+--+--+--|
//  |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax is the x-th accumulator limb
//                             (A0 lives in x10)
//
//  The "can't overflow" notes below mark carries into the high part
//  of a multiplication result, which can't overflow because that
//  part can never be all ones.

	mul	x11,x6,x5	// a[1]*a[0]
	umulh	x15,x6,x5
	mul	x12,x7,x5	// a[2]*a[0]
	umulh	x16,x7,x5
	mul	x13,x8,x5	// a[3]*a[0]
	umulh	x19,x8,x5

	adds	x12,x12,x15	// accumulate high parts of multiplication
	mul	x14,x7,x6	// a[2]*a[1]
	umulh	x15,x7,x6
	adcs	x13,x13,x16
	mul	x16,x8,x6	// a[3]*a[1]
	umulh	x17,x8,x6
	adc	x19,x19,xzr	// can't overflow

	mul	x20,x8,x7	// a[3]*a[2]
	umulh	x21,x8,x7

	adds	x15,x15,x16	// accumulate high parts of multiplication
	mul	x10,x5,x5	// a[0]*a[0]
	adc	x16,x17,xzr	// can't overflow

	adds	x13,x13,x14	// accumulate low parts of multiplication
	umulh	x5,x5,x5
	adcs	x19,x19,x15
	mul	x15,x6,x6	// a[1]*a[1]
	adcs	x20,x20,x16
	umulh	x6,x6,x6
	adc	x21,x21,xzr	// can't overflow

	adds	x11,x11,x11	// acc[1-6]*=2
	mul	x16,x7,x7	// a[2]*a[2]
	adcs	x12,x12,x12
	umulh	x7,x7,x7
	adcs	x13,x13,x13
	mul	x17,x8,x8	// a[3]*a[3]
	adcs	x19,x19,x19
	umulh	x8,x8,x8
	adcs	x20,x20,x20
	adcs	x21,x21,x21
	adc	x22,xzr,xzr

	adds	x11,x11,x5	// +a[i]*a[i]
	adcs	x12,x12,x15
	adcs	x13,x13,x6
	adcs	x19,x19,x16
	adcs	x20,x20,x7
	adcs	x21,x21,x17
	adc	x22,x22,x8

	bl	__mul_by_1_mont_256
	ldr	x30,[x29,#__SIZEOF_POINTER__]

	adds	x10,x10,x19	// accumulate upper half
	adcs	x11,x11,x20
	adcs	x12,x12,x21
	adcs	x13,x13,x22
	adc	x19,xzr,xzr

	subs	x14,x10,x5
	sbcs	x15,x11,x6
	sbcs	x16,x12,x7
	sbcs	x17,x13,x8
	sbcs	xzr,x19,xzr

	csel	x10,x10,x14,lo
	csel	x11,x11,x15,lo
	csel	x12,x12,x16,lo
	csel	x13,x13,x17,lo

	stp	x10,x11,[x0]
	stp	x12,x13,[x0,#16]

	ldp	x19,x20,[x29,#2*__SIZEOF_POINTER__]
	ldp	x21,x22,[x29,#4*__SIZEOF_POINTER__]
	ldr	x29,[sp],#6*__SIZEOF_POINTER__
	hint	#AUTI_HINT
	ret
.size	sqr_mont_sparse_256,.-sqr_mont_sparse_256

.globl	from_mont_256
.hidden	from_mont_256
.type	from_mont_256,%function
.align	5
from_mont_256:
	hint	#PACI_HINT
	stp	x29,x30,[sp,#-2*__SIZEOF_POINTER__]!
	add	x29,sp,#0

	mov	x4,x3
	ldp	x10,x11,[x1]
	ldp	x12,x13,[x1,#16]
	bl	__mul_by_1_mont_256
	ldr	x30,[x29,#__SIZEOF_POINTER__]

	subs	x14,x10,x5
	sbcs	x15,x11,x6
	sbcs	x16,x12,x7
	sbcs	x17,x13,x8

	csel	x10,x10,x14,lo
	csel	x11,x11,x15,lo
	csel	x12,x12,x16,lo
	csel	x13,x13,x17,lo

	stp	x10,x11,[x0]
	stp	x12,x13,[x0,#16]

	ldr	x29,[sp],#2*__SIZEOF_POINTER__
	hint	#AUTI_HINT
	ret
.size	from_mont_256,.-from_mont_256

.globl	redc_mont_256
.hidden	redc_mont_256
.type	redc_mont_256,%function
.align	5
redc_mont_256:
	hint	#PACI_HINT
	stp	x29,x30,[sp,#-2*__SIZEOF_POINTER__]!
add x29,sp,#0 mov x4,x3 ldp x10,x11,[x1] ldp x12,x13,[x1,#16] bl __mul_by_1_mont_256 ldr x30,[x29,#__SIZEOF_POINTER__] ldp x14,x15,[x1,#32] ldp x16,x17,[x1,#48] adds x10,x10,x14 adcs x11,x11,x15 adcs x12,x12,x16 adcs x13,x13,x17 adc x9,xzr,xzr subs x14,x10,x5 sbcs x15,x11,x6 sbcs x16,x12,x7 sbcs x17,x13,x8 sbcs xzr, x9,xzr csel x10,x10,x14,lo csel x11,x11,x15,lo csel x12,x12,x16,lo csel x13,x13,x17,lo stp x10,x11,[x0] stp x12,x13,[x0,#16] ldr x29,[sp],#2*__SIZEOF_POINTER__ hint #AUTI_HINT ret .size redc_mont_256,.-redc_mont_256 .type __mul_by_1_mont_256,%function .align 5 __mul_by_1_mont_256: mul x3,x4,x10 ldp x5,x6,[x2] ldp x7,x8,[x2,#16] //mul x14,x5,x3 mul x15,x6,x3 mul x16,x7,x3 mul x17,x8,x3 subs xzr,x10,#1 //adds x10,x10,x14 umulh x14,x5,x3 adcs x11,x11,x15 umulh x15,x6,x3 adcs x12,x12,x16 umulh x16,x7,x3 adcs x13,x13,x17 umulh x17,x8,x3 adc x9,xzr,xzr adds x10,x11,x14 adcs x11,x12,x15 adcs x12,x13,x16 mul x3,x4,x10 adc x13,x9,x17 //mul x14,x5,x3 mul x15,x6,x3 mul x16,x7,x3 mul x17,x8,x3 subs xzr,x10,#1 //adds x10,x10,x14 umulh x14,x5,x3 adcs x11,x11,x15 umulh x15,x6,x3 adcs x12,x12,x16 umulh x16,x7,x3 adcs x13,x13,x17 umulh x17,x8,x3 adc x9,xzr,xzr adds x10,x11,x14 adcs x11,x12,x15 adcs x12,x13,x16 mul x3,x4,x10 adc x13,x9,x17 //mul x14,x5,x3 mul x15,x6,x3 mul x16,x7,x3 mul x17,x8,x3 subs xzr,x10,#1 //adds x10,x10,x14 umulh x14,x5,x3 adcs x11,x11,x15 umulh x15,x6,x3 adcs x12,x12,x16 umulh x16,x7,x3 adcs x13,x13,x17 umulh x17,x8,x3 adc x9,xzr,xzr adds x10,x11,x14 adcs x11,x12,x15 adcs x12,x13,x16 mul x3,x4,x10 adc x13,x9,x17 //mul x14,x5,x3 mul x15,x6,x3 mul x16,x7,x3 mul x17,x8,x3 subs xzr,x10,#1 //adds x10,x10,x14 umulh x14,x5,x3 adcs x11,x11,x15 umulh x15,x6,x3 adcs x12,x12,x16 umulh x16,x7,x3 adcs x13,x13,x17 umulh x17,x8,x3 adc x9,xzr,xzr adds x10,x11,x14 adcs x11,x12,x15 adcs x12,x13,x16 adc x13,x9,x17 ret .size __mul_by_1_mont_256,.-__mul_by_1_mont_256 #if defined(__ARM_FEATURE_BTI_DEFAULT) || defined(__ARM_FEATURE_PAC_DEFAULT) .section .note.GNU-stack,"",@progbits .section .note.gnu.property,"a",@note .long 4,2f-1f,5 .byte 0x47,0x4E,0x55,0 1: .long 0xc0000000,4,3 .align 3 2: #endif ================================================ FILE: build/elf/mul_mont_384-armv8.S ================================================ #if defined(__ARM_FEATURE_PAC_DEFAULT) && __ARM_FEATURE_PAC_DEFAULT==2 # define PACI_HINT 27 # define AUTI_HINT 31 #else # define PACI_HINT 25 # define AUTI_HINT 29 #endif .text .globl add_mod_384x384 .hidden add_mod_384x384 .type add_mod_384x384,%function .align 5 add_mod_384x384: hint #PACI_HINT stp x29,x30,[sp,#-8*__SIZEOF_POINTER__]! 
add x29,sp,#0 stp x19,x20,[sp,#2*__SIZEOF_POINTER__] stp x21,x22,[sp,#4*__SIZEOF_POINTER__] stp x23,x24,[sp,#6*__SIZEOF_POINTER__] ldp x5,x6,[x3] ldp x7,x8,[x3,#16] ldp x9,x10,[x3,#32] bl __add_mod_384x384 ldr x30,[x29,#__SIZEOF_POINTER__] ldp x19,x20,[x29,#2*__SIZEOF_POINTER__] ldp x21,x22,[x29,#4*__SIZEOF_POINTER__] ldp x23,x24,[x29,#6*__SIZEOF_POINTER__] ldr x29,[sp],#8*__SIZEOF_POINTER__ hint #AUTI_HINT ret .size add_mod_384x384,.-add_mod_384x384 .type __add_mod_384x384,%function .align 5 __add_mod_384x384: ldp x11, x12, [x1] ldp x19,x20,[x2] ldp x13, x14, [x1,#16] adds x11,x11,x19 ldp x21,x22,[x2,#16] adcs x12,x12,x20 ldp x15, x16, [x1,#32] adcs x13,x13,x21 ldp x23,x24,[x2,#32] adcs x14,x14,x22 stp x11, x12, [x0] adcs x15,x15,x23 ldp x11, x12, [x1,#48] adcs x16,x16,x24 ldp x19,x20,[x2,#48] stp x13, x14, [x0,#16] ldp x13, x14, [x1,#64] ldp x21,x22,[x2,#64] adcs x11,x11,x19 stp x15, x16, [x0,#32] adcs x12,x12,x20 ldp x15, x16, [x1,#80] adcs x13,x13,x21 ldp x23,x24,[x2,#80] adcs x14,x14,x22 adcs x15,x15,x23 adcs x16,x16,x24 adc x17,xzr,xzr subs x19,x11,x5 sbcs x20,x12,x6 sbcs x21,x13,x7 sbcs x22,x14,x8 sbcs x23,x15,x9 sbcs x24,x16,x10 sbcs xzr,x17,xzr csel x11,x11,x19,lo csel x12,x12,x20,lo csel x13,x13,x21,lo csel x14,x14,x22,lo stp x11,x12,[x0,#48] csel x15,x15,x23,lo stp x13,x14,[x0,#64] csel x16,x16,x24,lo stp x15,x16,[x0,#80] ret .size __add_mod_384x384,.-__add_mod_384x384 .globl sub_mod_384x384 .hidden sub_mod_384x384 .type sub_mod_384x384,%function .align 5 sub_mod_384x384: hint #PACI_HINT stp x29,x30,[sp,#-8*__SIZEOF_POINTER__]! add x29,sp,#0 stp x19,x20,[sp,#2*__SIZEOF_POINTER__] stp x21,x22,[sp,#4*__SIZEOF_POINTER__] stp x23,x24,[sp,#6*__SIZEOF_POINTER__] ldp x5,x6,[x3] ldp x7,x8,[x3,#16] ldp x9,x10,[x3,#32] bl __sub_mod_384x384 ldr x30,[x29,#__SIZEOF_POINTER__] ldp x19,x20,[x29,#2*__SIZEOF_POINTER__] ldp x21,x22,[x29,#4*__SIZEOF_POINTER__] ldp x23,x24,[x29,#6*__SIZEOF_POINTER__] ldr x29,[sp],#8*__SIZEOF_POINTER__ hint #AUTI_HINT ret .size sub_mod_384x384,.-sub_mod_384x384 .type __sub_mod_384x384,%function .align 5 __sub_mod_384x384: ldp x11, x12, [x1] ldp x19,x20,[x2] ldp x13, x14, [x1,#16] subs x11,x11,x19 ldp x21,x22,[x2,#16] sbcs x12,x12,x20 ldp x15, x16, [x1,#32] sbcs x13,x13,x21 ldp x23,x24,[x2,#32] sbcs x14,x14,x22 stp x11, x12, [x0] sbcs x15,x15,x23 ldp x11, x12, [x1,#48] sbcs x16,x16,x24 ldp x19,x20,[x2,#48] stp x13, x14, [x0,#16] ldp x13, x14, [x1,#64] ldp x21,x22,[x2,#64] sbcs x11,x11,x19 stp x15, x16, [x0,#32] sbcs x12,x12,x20 ldp x15, x16, [x1,#80] sbcs x13,x13,x21 ldp x23,x24,[x2,#80] sbcs x14,x14,x22 sbcs x15,x15,x23 sbcs x16,x16,x24 sbc x17,xzr,xzr and x19,x5,x17 and x20,x6,x17 adds x11,x11,x19 and x21,x7,x17 adcs x12,x12,x20 and x22,x8,x17 adcs x13,x13,x21 and x23,x9,x17 adcs x14,x14,x22 and x24,x10,x17 adcs x15,x15,x23 stp x11,x12,[x0,#48] adc x16,x16,x24 stp x13,x14,[x0,#64] stp x15,x16,[x0,#80] ret .size __sub_mod_384x384,.-__sub_mod_384x384 .type __add_mod_384,%function .align 5 __add_mod_384: ldp x11, x12, [x1] ldp x19,x20,[x2] ldp x13, x14, [x1,#16] adds x11,x11,x19 ldp x21,x22,[x2,#16] adcs x12,x12,x20 ldp x15, x16, [x1,#32] adcs x13,x13,x21 ldp x23,x24,[x2,#32] adcs x14,x14,x22 adcs x15,x15,x23 adcs x16,x16,x24 adc x17,xzr,xzr subs x19,x11,x5 sbcs x20,x12,x6 sbcs x21,x13,x7 sbcs x22,x14,x8 sbcs x23,x15,x9 sbcs x24,x16,x10 sbcs xzr,x17,xzr csel x11,x11,x19,lo csel x12,x12,x20,lo csel x13,x13,x21,lo csel x14,x14,x22,lo csel x15,x15,x23,lo stp x11,x12,[x0] csel x16,x16,x24,lo stp x13,x14,[x0,#16] stp x15,x16,[x0,#32] ret .size __add_mod_384,.-__add_mod_384 
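The add/sub helpers in this file all follow the same branch-free shape as __add_mod_384 above: a full-width add (or subtract), a trial subtraction of (or masked add of) the modulus, and a flag-based select. As a reading aid, here is a hedged C model of __add_mod_384 (identifiers are this sketch's own, assuming unsigned __int128; inputs are little-endian 6-limb values with a, b < p):

#include <stdint.h>

typedef unsigned __int128 u128;

static void add_mod_384_model(uint64_t ret[6], const uint64_t a[6],
                              const uint64_t b[6], const uint64_t p[6])
{
    uint64_t sum[6], dif[6], carry = 0, borrow = 0, keep_sum;
    u128 t;
    int i;

    for (i = 0; i < 6; i++) {            /* sum = a + b */
        t = (u128)a[i] + b[i] + carry;
        sum[i] = (uint64_t)t;
        carry = (uint64_t)(t >> 64);
    }
    for (i = 0; i < 6; i++) {            /* dif = sum - p */
        t = (u128)sum[i] - p[i] - borrow;
        dif[i] = (uint64_t)t;
        borrow = (uint64_t)(t >> 64) & 1;
    }
    /* the asm's "sbcs xzr,x17,xzr / csel ...,lo" pair: keep the
     * unreduced sum only if subtracting p borrowed past the carry */
    keep_sum = 0 - (uint64_t)(borrow > carry);
    for (i = 0; i < 6; i++)
        ret[i] = (sum[i] & keep_sum) | (dif[i] & ~keep_sum);
}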
.type	__sub_mod_384,%function
.align	5
__sub_mod_384:
	ldp	x11,x12,[x1]
	ldp	x19,x20,[x2]
	ldp	x13,x14,[x1,#16]
	subs	x11,x11,x19
	ldp	x21,x22,[x2,#16]
	sbcs	x12,x12,x20
	ldp	x15,x16,[x1,#32]
	sbcs	x13,x13,x21
	ldp	x23,x24,[x2,#32]
	sbcs	x14,x14,x22
	sbcs	x15,x15,x23
	sbcs	x16,x16,x24
	sbc	x17,xzr,xzr

	and	x19,x5,x17
	and	x20,x6,x17
	adds	x11,x11,x19
	and	x21,x7,x17
	adcs	x12,x12,x20
	and	x22,x8,x17
	adcs	x13,x13,x21
	and	x23,x9,x17
	adcs	x14,x14,x22
	and	x24,x10,x17
	adcs	x15,x15,x23
	stp	x11,x12,[x0]
	adc	x16,x16,x24
	stp	x13,x14,[x0,#16]
	stp	x15,x16,[x0,#32]
	ret
.size	__sub_mod_384,.-__sub_mod_384

.globl	mul_mont_384x
.hidden	mul_mont_384x
.type	mul_mont_384x,%function
.align	5
mul_mont_384x:
	hint	#PACI_HINT
	stp	x29,x30,[sp,#-16*__SIZEOF_POINTER__]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#2*__SIZEOF_POINTER__]
	stp	x21,x22,[sp,#4*__SIZEOF_POINTER__]
	stp	x23,x24,[sp,#6*__SIZEOF_POINTER__]
	stp	x25,x26,[sp,#8*__SIZEOF_POINTER__]
	stp	x27,x28,[sp,#10*__SIZEOF_POINTER__]
	sub	sp,sp,#288		// space for 3 768-bit vectors

	mov	x26,x0			// save r_ptr
	mov	x27,x1			// save a_ptr
	mov	x28,x2			// save b_ptr

	add	x0,sp,#0
	bl	__mul_384		// mul_384(t0, a->re, b->re)

	add	x1,x1,#48
	add	x2,x2,#48
	add	x0,sp,#96
	bl	__mul_384		// mul_384(t1, a->im, b->im)

	ldp	x5,x6,[x3]
	ldp	x7,x8,[x3,#16]
	ldp	x9,x10,[x3,#32]

	sub	x2,x1,#48
	add	x0,sp,#240
	bl	__add_mod_384		// a->re + a->im

	add	x1,x28,#0
	add	x2,x28,#48
	add	x0,sp,#192
	bl	__add_mod_384		// b->re + b->im

	add	x1,x0,#0
	add	x2,x0,#48
	bl	__mul_384		// mul_384(t2, a->re+a->im, b->re+b->im)

	ldp	x5,x6,[x3]
	ldp	x7,x8,[x3,#16]
	ldp	x9,x10,[x3,#32]

	mov	x1,x0
	add	x2,sp,#0
	bl	__sub_mod_384x384
	add	x2,sp,#96
	bl	__sub_mod_384x384	// t2 = t2-t0-t1

	add	x1,sp,#0
	add	x2,sp,#96
	add	x0,sp,#0
	bl	__sub_mod_384x384	// t0 = t0-t1

	add	x1,sp,#0
	add	x0,x26,#0
	bl	__mul_by_1_mont_384
	bl	__redc_tail_mont_384

	add	x1,sp,#192
	add	x0,x0,#48
	bl	__mul_by_1_mont_384
	bl	__redc_tail_mont_384

	ldr	x30,[x29,#__SIZEOF_POINTER__]
	add	sp,sp,#288
	ldp	x19,x20,[x29,#2*__SIZEOF_POINTER__]
	ldp	x21,x22,[x29,#4*__SIZEOF_POINTER__]
	ldp	x23,x24,[x29,#6*__SIZEOF_POINTER__]
	ldp	x25,x26,[x29,#8*__SIZEOF_POINTER__]
	ldp	x27,x28,[x29,#10*__SIZEOF_POINTER__]
	ldr	x29,[sp],#16*__SIZEOF_POINTER__
	hint	#AUTI_HINT
	ret
.size	mul_mont_384x,.-mul_mont_384x

.globl	sqr_mont_384x
.hidden	sqr_mont_384x
.type	sqr_mont_384x,%function
.align	5
sqr_mont_384x:
	hint	#PACI_HINT
	stp	x29,x30,[sp,#-16*__SIZEOF_POINTER__]!
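// Fp2 squaring via the identity (re+im)*(re-im) = re^2 - im^2 and
// 2*re*im: one Montgomery multiplication per output half instead of
// a full complex multiplication (see the t0/t1 setup and the two
// __mul_mont_384 calls below).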
add x29,sp,#0 stp x19,x20,[sp,#2*__SIZEOF_POINTER__] stp x21,x22,[sp,#4*__SIZEOF_POINTER__] stp x23,x24,[sp,#6*__SIZEOF_POINTER__] stp x25,x26,[sp,#8*__SIZEOF_POINTER__] stp x27,x28,[sp,#10*__SIZEOF_POINTER__] stp x3,x0,[sp,#12*__SIZEOF_POINTER__] // __mul_mont_384 wants them there sub sp,sp,#96 // space for 2 384-bit vectors mov x4,x3 // adjust for missing b_ptr ldp x5,x6,[x2] ldp x7,x8,[x2,#16] ldp x9,x10,[x2,#32] add x2,x1,#48 add x0,sp,#0 bl __add_mod_384 // t0 = a->re + a->im add x0,sp,#48 bl __sub_mod_384 // t1 = a->re - a->im ldp x11,x12,[x1] ldr x17, [x2] ldp x13,x14,[x1,#16] ldp x15,x16,[x1,#32] bl __mul_mont_384 // mul_mont_384(ret->im, a->re, a->im) adds x11,x11,x11 // add with itself adcs x12,x12,x12 adcs x13,x13,x13 adcs x14,x14,x14 adcs x15,x15,x15 adcs x16,x16,x16 adc x25,xzr,xzr subs x19,x11,x5 sbcs x20,x12,x6 sbcs x21,x13,x7 sbcs x22,x14,x8 sbcs x23,x15,x9 sbcs x24,x16,x10 sbcs xzr,x25,xzr csel x19,x11,x19,lo csel x20,x12,x20,lo csel x21,x13,x21,lo ldp x11,x12,[sp] csel x22,x14,x22,lo ldr x17, [sp,#48] csel x23,x15,x23,lo ldp x13,x14,[sp,#16] csel x24,x16,x24,lo ldp x15,x16,[sp,#32] stp x19,x20,[x2,#48] stp x21,x22,[x2,#64] stp x23,x24,[x2,#80] add x2,sp,#48 bl __mul_mont_384 // mul_mont_384(ret->re, t0, t1) ldr x30,[x29,#__SIZEOF_POINTER__] stp x11,x12,[x2] stp x13,x14,[x2,#16] stp x15,x16,[x2,#32] add sp,sp,#96 ldp x19,x20,[x29,#2*__SIZEOF_POINTER__] ldp x21,x22,[x29,#4*__SIZEOF_POINTER__] ldp x23,x24,[x29,#6*__SIZEOF_POINTER__] ldp x25,x26,[x29,#8*__SIZEOF_POINTER__] ldp x27,x28,[x29,#10*__SIZEOF_POINTER__] ldr x29,[sp],#16*__SIZEOF_POINTER__ hint #AUTI_HINT ret .size sqr_mont_384x,.-sqr_mont_384x .globl mul_mont_384 .hidden mul_mont_384 .type mul_mont_384,%function .align 5 mul_mont_384: hint #PACI_HINT stp x29,x30,[sp,#-16*__SIZEOF_POINTER__]! 
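// Word-by-word Montgomery multiplication: __mul_mont_384 below folds
// one reduction step (n0 in x4) into the multiply-accumulate chain
// for each limb of b, so no full 768-bit intermediate is ever stored.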
add x29,sp,#0 stp x19,x20,[sp,#2*__SIZEOF_POINTER__] stp x21,x22,[sp,#4*__SIZEOF_POINTER__] stp x23,x24,[sp,#6*__SIZEOF_POINTER__] stp x25,x26,[sp,#8*__SIZEOF_POINTER__] stp x27,x28,[sp,#10*__SIZEOF_POINTER__] stp x4,x0,[sp,#12*__SIZEOF_POINTER__] // __mul_mont_384 wants them there ldp x11,x12,[x1] ldr x17, [x2] ldp x13,x14,[x1,#16] ldp x15,x16,[x1,#32] ldp x5,x6,[x3] ldp x7,x8,[x3,#16] ldp x9,x10,[x3,#32] bl __mul_mont_384 ldr x30,[x29,#__SIZEOF_POINTER__] stp x11,x12,[x2] stp x13,x14,[x2,#16] stp x15,x16,[x2,#32] ldp x19,x20,[x29,#2*__SIZEOF_POINTER__] ldp x21,x22,[x29,#4*__SIZEOF_POINTER__] ldp x23,x24,[x29,#6*__SIZEOF_POINTER__] ldp x25,x26,[x29,#8*__SIZEOF_POINTER__] ldp x27,x28,[x29,#10*__SIZEOF_POINTER__] ldr x29,[sp],#16*__SIZEOF_POINTER__ hint #AUTI_HINT ret .size mul_mont_384,.-mul_mont_384 .type __mul_mont_384,%function .align 5 __mul_mont_384: mul x19,x11,x17 mul x20,x12,x17 mul x21,x13,x17 mul x22,x14,x17 mul x23,x15,x17 mul x24,x16,x17 mul x4,x4,x19 umulh x26,x11,x17 umulh x27,x12,x17 umulh x28,x13,x17 umulh x0,x14,x17 umulh x1,x15,x17 umulh x3,x16,x17 adds x20,x20,x26 // mul x26,x5,x4 adcs x21,x21,x27 mul x27,x6,x4 adcs x22,x22,x28 mul x28,x7,x4 adcs x23,x23,x0 mul x0,x8,x4 adcs x24,x24,x1 mul x1,x9,x4 adc x25,xzr, x3 mul x3,x10,x4 mov x17,xzr subs xzr,x19,#1 // adds x19,x19,x26 umulh x26,x5,x4 adcs x20,x20,x27 umulh x27,x6,x4 adcs x21,x21,x28 umulh x28,x7,x4 adcs x22,x22,x0 umulh x0,x8,x4 adcs x23,x23,x1 umulh x1,x9,x4 adcs x24,x24,x3 umulh x3,x10,x4 adcs x25,x25,xzr adc x4,x17,xzr ldr x17,[x2,8*1] adds x19,x20,x26 mul x26,x11,x17 adcs x20,x21,x27 mul x27,x12,x17 adcs x21,x22,x28 mul x28,x13,x17 adcs x22,x23,x0 mul x0,x14,x17 adcs x23,x24,x1 mul x1,x15,x17 adcs x24,x25,x3 mul x3,x16,x17 adc x25,x4,xzr ldr x4,[x29,#12*__SIZEOF_POINTER__] adds x19,x19,x26 umulh x26,x11,x17 adcs x20,x20,x27 umulh x27,x12,x17 adcs x21,x21,x28 mul x4,x4,x19 umulh x28,x13,x17 adcs x22,x22,x0 umulh x0,x14,x17 adcs x23,x23,x1 umulh x1,x15,x17 adcs x24,x24,x3 umulh x3,x16,x17 adcs x25,x25,xzr adc x17,xzr,xzr adds x20,x20,x26 // mul x26,x5,x4 adcs x21,x21,x27 mul x27,x6,x4 adcs x22,x22,x28 mul x28,x7,x4 adcs x23,x23,x0 mul x0,x8,x4 adcs x24,x24,x1 mul x1,x9,x4 adcs x25,x25,x3 mul x3,x10,x4 adc x17,x17,xzr subs xzr,x19,#1 // adds x19,x19,x26 umulh x26,x5,x4 adcs x20,x20,x27 umulh x27,x6,x4 adcs x21,x21,x28 umulh x28,x7,x4 adcs x22,x22,x0 umulh x0,x8,x4 adcs x23,x23,x1 umulh x1,x9,x4 adcs x24,x24,x3 umulh x3,x10,x4 adcs x25,x25,xzr adc x4,x17,xzr ldr x17,[x2,8*2] adds x19,x20,x26 mul x26,x11,x17 adcs x20,x21,x27 mul x27,x12,x17 adcs x21,x22,x28 mul x28,x13,x17 adcs x22,x23,x0 mul x0,x14,x17 adcs x23,x24,x1 mul x1,x15,x17 adcs x24,x25,x3 mul x3,x16,x17 adc x25,x4,xzr ldr x4,[x29,#12*__SIZEOF_POINTER__] adds x19,x19,x26 umulh x26,x11,x17 adcs x20,x20,x27 umulh x27,x12,x17 adcs x21,x21,x28 mul x4,x4,x19 umulh x28,x13,x17 adcs x22,x22,x0 umulh x0,x14,x17 adcs x23,x23,x1 umulh x1,x15,x17 adcs x24,x24,x3 umulh x3,x16,x17 adcs x25,x25,xzr adc x17,xzr,xzr adds x20,x20,x26 // mul x26,x5,x4 adcs x21,x21,x27 mul x27,x6,x4 adcs x22,x22,x28 mul x28,x7,x4 adcs x23,x23,x0 mul x0,x8,x4 adcs x24,x24,x1 mul x1,x9,x4 adcs x25,x25,x3 mul x3,x10,x4 adc x17,x17,xzr subs xzr,x19,#1 // adds x19,x19,x26 umulh x26,x5,x4 adcs x20,x20,x27 umulh x27,x6,x4 adcs x21,x21,x28 umulh x28,x7,x4 adcs x22,x22,x0 umulh x0,x8,x4 adcs x23,x23,x1 umulh x1,x9,x4 adcs x24,x24,x3 umulh x3,x10,x4 adcs x25,x25,xzr adc x4,x17,xzr ldr x17,[x2,8*3] adds x19,x20,x26 mul x26,x11,x17 adcs x20,x21,x27 mul x27,x12,x17 adcs x21,x22,x28 mul x28,x13,x17 adcs 
x22,x23,x0 mul x0,x14,x17 adcs x23,x24,x1 mul x1,x15,x17 adcs x24,x25,x3 mul x3,x16,x17 adc x25,x4,xzr ldr x4,[x29,#12*__SIZEOF_POINTER__] adds x19,x19,x26 umulh x26,x11,x17 adcs x20,x20,x27 umulh x27,x12,x17 adcs x21,x21,x28 mul x4,x4,x19 umulh x28,x13,x17 adcs x22,x22,x0 umulh x0,x14,x17 adcs x23,x23,x1 umulh x1,x15,x17 adcs x24,x24,x3 umulh x3,x16,x17 adcs x25,x25,xzr adc x17,xzr,xzr adds x20,x20,x26 // mul x26,x5,x4 adcs x21,x21,x27 mul x27,x6,x4 adcs x22,x22,x28 mul x28,x7,x4 adcs x23,x23,x0 mul x0,x8,x4 adcs x24,x24,x1 mul x1,x9,x4 adcs x25,x25,x3 mul x3,x10,x4 adc x17,x17,xzr subs xzr,x19,#1 // adds x19,x19,x26 umulh x26,x5,x4 adcs x20,x20,x27 umulh x27,x6,x4 adcs x21,x21,x28 umulh x28,x7,x4 adcs x22,x22,x0 umulh x0,x8,x4 adcs x23,x23,x1 umulh x1,x9,x4 adcs x24,x24,x3 umulh x3,x10,x4 adcs x25,x25,xzr adc x4,x17,xzr ldr x17,[x2,8*4] adds x19,x20,x26 mul x26,x11,x17 adcs x20,x21,x27 mul x27,x12,x17 adcs x21,x22,x28 mul x28,x13,x17 adcs x22,x23,x0 mul x0,x14,x17 adcs x23,x24,x1 mul x1,x15,x17 adcs x24,x25,x3 mul x3,x16,x17 adc x25,x4,xzr ldr x4,[x29,#12*__SIZEOF_POINTER__] adds x19,x19,x26 umulh x26,x11,x17 adcs x20,x20,x27 umulh x27,x12,x17 adcs x21,x21,x28 mul x4,x4,x19 umulh x28,x13,x17 adcs x22,x22,x0 umulh x0,x14,x17 adcs x23,x23,x1 umulh x1,x15,x17 adcs x24,x24,x3 umulh x3,x16,x17 adcs x25,x25,xzr adc x17,xzr,xzr adds x20,x20,x26 // mul x26,x5,x4 adcs x21,x21,x27 mul x27,x6,x4 adcs x22,x22,x28 mul x28,x7,x4 adcs x23,x23,x0 mul x0,x8,x4 adcs x24,x24,x1 mul x1,x9,x4 adcs x25,x25,x3 mul x3,x10,x4 adc x17,x17,xzr subs xzr,x19,#1 // adds x19,x19,x26 umulh x26,x5,x4 adcs x20,x20,x27 umulh x27,x6,x4 adcs x21,x21,x28 umulh x28,x7,x4 adcs x22,x22,x0 umulh x0,x8,x4 adcs x23,x23,x1 umulh x1,x9,x4 adcs x24,x24,x3 umulh x3,x10,x4 adcs x25,x25,xzr adc x4,x17,xzr ldr x17,[x2,8*5] adds x19,x20,x26 mul x26,x11,x17 adcs x20,x21,x27 mul x27,x12,x17 adcs x21,x22,x28 mul x28,x13,x17 adcs x22,x23,x0 mul x0,x14,x17 adcs x23,x24,x1 mul x1,x15,x17 adcs x24,x25,x3 mul x3,x16,x17 adc x25,x4,xzr ldr x4,[x29,#12*__SIZEOF_POINTER__] adds x19,x19,x26 umulh x26,x11,x17 adcs x20,x20,x27 umulh x27,x12,x17 adcs x21,x21,x28 mul x4,x4,x19 umulh x28,x13,x17 adcs x22,x22,x0 umulh x0,x14,x17 adcs x23,x23,x1 umulh x1,x15,x17 adcs x24,x24,x3 umulh x3,x16,x17 adcs x25,x25,xzr adc x17,xzr,xzr adds x20,x20,x26 // mul x26,x5,x4 adcs x21,x21,x27 mul x27,x6,x4 adcs x22,x22,x28 mul x28,x7,x4 adcs x23,x23,x0 mul x0,x8,x4 adcs x24,x24,x1 mul x1,x9,x4 adcs x25,x25,x3 mul x3,x10,x4 adc x17,x17,xzr subs xzr,x19,#1 // adds x19,x19,x26 umulh x26,x5,x4 adcs x20,x20,x27 umulh x27,x6,x4 adcs x21,x21,x28 umulh x28,x7,x4 adcs x22,x22,x0 umulh x0,x8,x4 adcs x23,x23,x1 umulh x1,x9,x4 adcs x24,x24,x3 umulh x3,x10,x4 adcs x25,x25,xzr ldp x4,x2,[x29,#12*__SIZEOF_POINTER__] // pull r_ptr adc x17,x17,xzr adds x19,x20,x26 adcs x20,x21,x27 adcs x21,x22,x28 adcs x22,x23,x0 adcs x23,x24,x1 adcs x24,x25,x3 adc x25,x17,xzr subs x26,x19,x5 sbcs x27,x20,x6 sbcs x28,x21,x7 sbcs x0,x22,x8 sbcs x1,x23,x9 sbcs x3,x24,x10 sbcs xzr, x25,xzr csel x11,x19,x26,lo csel x12,x20,x27,lo csel x13,x21,x28,lo csel x14,x22,x0,lo csel x15,x23,x1,lo csel x16,x24,x3,lo ret .size __mul_mont_384,.-__mul_mont_384 .globl sqr_mont_384 .hidden sqr_mont_384 .type sqr_mont_384,%function .align 5 sqr_mont_384: hint #PACI_HINT stp x29,x30,[sp,#-16*__SIZEOF_POINTER__]! 
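// Montgomery squaring in two phases: widen to 768 bits with __sqr_384,
// then reduce with __mul_by_1_mont_384 followed by __redc_tail_mont_384.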
add x29,sp,#0 stp x19,x20,[sp,#2*__SIZEOF_POINTER__] stp x21,x22,[sp,#4*__SIZEOF_POINTER__] stp x23,x24,[sp,#6*__SIZEOF_POINTER__] stp x25,x26,[sp,#8*__SIZEOF_POINTER__] stp x27,x28,[sp,#10*__SIZEOF_POINTER__] sub sp,sp,#96 // space for 768-bit vector mov x4,x3 // adjust for missing b_ptr mov x3,x0 // save r_ptr mov x0,sp ldp x11,x12,[x1] ldp x13,x14,[x1,#16] ldp x15,x16,[x1,#32] bl __sqr_384 ldp x5,x6,[x2] ldp x7,x8,[x2,#16] ldp x9,x10,[x2,#32] mov x1,sp mov x0,x3 // restore r_ptr bl __mul_by_1_mont_384 bl __redc_tail_mont_384 ldr x30,[x29,#__SIZEOF_POINTER__] add sp,sp,#96 ldp x19,x20,[x29,#2*__SIZEOF_POINTER__] ldp x21,x22,[x29,#4*__SIZEOF_POINTER__] ldp x23,x24,[x29,#6*__SIZEOF_POINTER__] ldp x25,x26,[x29,#8*__SIZEOF_POINTER__] ldp x27,x28,[x29,#10*__SIZEOF_POINTER__] ldr x29,[sp],#16*__SIZEOF_POINTER__ hint #AUTI_HINT ret .size sqr_mont_384,.-sqr_mont_384 .globl sqr_n_mul_mont_383 .hidden sqr_n_mul_mont_383 .type sqr_n_mul_mont_383,%function .align 5 sqr_n_mul_mont_383: hint #PACI_HINT stp x29,x30,[sp,#-16*__SIZEOF_POINTER__]! add x29,sp,#0 stp x19,x20,[sp,#2*__SIZEOF_POINTER__] stp x21,x22,[sp,#4*__SIZEOF_POINTER__] stp x23,x24,[sp,#6*__SIZEOF_POINTER__] stp x25,x26,[sp,#8*__SIZEOF_POINTER__] stp x27,x28,[sp,#10*__SIZEOF_POINTER__] stp x4,x0,[sp,#12*__SIZEOF_POINTER__] // __mul_mont_384 wants them there sub sp,sp,#96 // space for 768-bit vector mov x17,x5 // save b_ptr ldp x11,x12,[x1] ldp x13,x14,[x1,#16] ldp x15,x16,[x1,#32] mov x0,sp .Loop_sqr_383: bl __sqr_384 sub x2,x2,#1 // counter ldp x5,x6,[x3] ldp x7,x8,[x3,#16] ldp x9,x10,[x3,#32] mov x1,sp bl __mul_by_1_mont_384 ldp x19,x20,[x1,#48] ldp x21,x22,[x1,#64] ldp x23,x24,[x1,#80] adds x11,x11,x19 // just accumulate upper half adcs x12,x12,x20 adcs x13,x13,x21 adcs x14,x14,x22 adcs x15,x15,x23 adc x16,x16,x24 cbnz x2,.Loop_sqr_383 mov x2,x17 ldr x17,[x17] bl __mul_mont_384 ldr x30,[x29,#__SIZEOF_POINTER__] stp x11,x12,[x2] stp x13,x14,[x2,#16] stp x15,x16,[x2,#32] add sp,sp,#96 ldp x19,x20,[x29,#2*__SIZEOF_POINTER__] ldp x21,x22,[x29,#4*__SIZEOF_POINTER__] ldp x23,x24,[x29,#6*__SIZEOF_POINTER__] ldp x25,x26,[x29,#8*__SIZEOF_POINTER__] ldp x27,x28,[x29,#10*__SIZEOF_POINTER__] ldr x29,[sp],#16*__SIZEOF_POINTER__ hint #AUTI_HINT ret .size sqr_n_mul_mont_383,.-sqr_n_mul_mont_383 .type __sqr_384,%function .align 5 __sqr_384: mul x19,x12,x11 mul x20,x13,x11 mul x21,x14,x11 mul x22,x15,x11 mul x23,x16,x11 umulh x6,x12,x11 umulh x7,x13,x11 umulh x8,x14,x11 umulh x9,x15,x11 adds x20,x20,x6 umulh x10,x16,x11 adcs x21,x21,x7 mul x7,x13,x12 adcs x22,x22,x8 mul x8,x14,x12 adcs x23,x23,x9 mul x9,x15,x12 adc x24,xzr, x10 mul x10,x16,x12 adds x21,x21,x7 umulh x7,x13,x12 adcs x22,x22,x8 umulh x8,x14,x12 adcs x23,x23,x9 umulh x9,x15,x12 adcs x24,x24,x10 umulh x10,x16,x12 adc x25,xzr,xzr mul x5,x11,x11 adds x22,x22,x7 umulh x11, x11,x11 adcs x23,x23,x8 mul x8,x14,x13 adcs x24,x24,x9 mul x9,x15,x13 adc x25,x25,x10 mul x10,x16,x13 adds x23,x23,x8 umulh x8,x14,x13 adcs x24,x24,x9 umulh x9,x15,x13 adcs x25,x25,x10 umulh x10,x16,x13 adc x26,xzr,xzr mul x6,x12,x12 adds x24,x24,x8 umulh x12, x12,x12 adcs x25,x25,x9 mul x9,x15,x14 adc x26,x26,x10 mul x10,x16,x14 adds x25,x25,x9 umulh x9,x15,x14 adcs x26,x26,x10 umulh x10,x16,x14 adc x27,xzr,xzr mul x7,x13,x13 adds x26,x26,x9 umulh x13, x13,x13 adc x27,x27,x10 mul x8,x14,x14 mul x10,x16,x15 umulh x14, x14,x14 adds x27,x27,x10 umulh x10,x16,x15 mul x9,x15,x15 adc x28,x10,xzr adds x19,x19,x19 adcs x20,x20,x20 adcs x21,x21,x21 adcs x22,x22,x22 adcs x23,x23,x23 adcs x24,x24,x24 adcs x25,x25,x25 adcs x26,x26,x26 
umulh x15, x15,x15 adcs x27,x27,x27 mul x10,x16,x16 adcs x28,x28,x28 umulh x16, x16,x16 adc x1,xzr,xzr adds x19,x19,x11 adcs x20,x20,x6 adcs x21,x21,x12 adcs x22,x22,x7 adcs x23,x23,x13 adcs x24,x24,x8 adcs x25,x25,x14 stp x5,x19,[x0] adcs x26,x26,x9 stp x20,x21,[x0,#16] adcs x27,x27,x15 stp x22,x23,[x0,#32] adcs x28,x28,x10 stp x24,x25,[x0,#48] adc x16,x16,x1 stp x26,x27,[x0,#64] stp x28,x16,[x0,#80] ret .size __sqr_384,.-__sqr_384 .globl sqr_384 .hidden sqr_384 .type sqr_384,%function .align 5 sqr_384: hint #PACI_HINT stp x29,x30,[sp,#-16*__SIZEOF_POINTER__]! add x29,sp,#0 stp x19,x20,[sp,#2*__SIZEOF_POINTER__] stp x21,x22,[sp,#4*__SIZEOF_POINTER__] stp x23,x24,[sp,#6*__SIZEOF_POINTER__] stp x25,x26,[sp,#8*__SIZEOF_POINTER__] stp x27,x28,[sp,#10*__SIZEOF_POINTER__] ldp x11,x12,[x1] ldp x13,x14,[x1,#16] ldp x15,x16,[x1,#32] bl __sqr_384 ldr x30,[x29,#__SIZEOF_POINTER__] ldp x19,x20,[x29,#2*__SIZEOF_POINTER__] ldp x21,x22,[x29,#4*__SIZEOF_POINTER__] ldp x23,x24,[x29,#6*__SIZEOF_POINTER__] ldp x25,x26,[x29,#8*__SIZEOF_POINTER__] ldp x27,x28,[x29,#10*__SIZEOF_POINTER__] ldr x29,[sp],#16*__SIZEOF_POINTER__ hint #AUTI_HINT ret .size sqr_384,.-sqr_384 .globl redc_mont_384 .hidden redc_mont_384 .type redc_mont_384,%function .align 5 redc_mont_384: hint #PACI_HINT stp x29,x30,[sp,#-16*__SIZEOF_POINTER__]! add x29,sp,#0 stp x19,x20,[sp,#2*__SIZEOF_POINTER__] stp x21,x22,[sp,#4*__SIZEOF_POINTER__] stp x23,x24,[sp,#6*__SIZEOF_POINTER__] stp x25,x26,[sp,#8*__SIZEOF_POINTER__] stp x27,x28,[sp,#10*__SIZEOF_POINTER__] mov x4,x3 // adjust for missing b_ptr ldp x5,x6,[x2] ldp x7,x8,[x2,#16] ldp x9,x10,[x2,#32] bl __mul_by_1_mont_384 bl __redc_tail_mont_384 ldr x30,[x29,#__SIZEOF_POINTER__] ldp x19,x20,[x29,#2*__SIZEOF_POINTER__] ldp x21,x22,[x29,#4*__SIZEOF_POINTER__] ldp x23,x24,[x29,#6*__SIZEOF_POINTER__] ldp x25,x26,[x29,#8*__SIZEOF_POINTER__] ldp x27,x28,[x29,#10*__SIZEOF_POINTER__] ldr x29,[sp],#16*__SIZEOF_POINTER__ hint #AUTI_HINT ret .size redc_mont_384,.-redc_mont_384 .globl from_mont_384 .hidden from_mont_384 .type from_mont_384,%function .align 5 from_mont_384: hint #PACI_HINT stp x29,x30,[sp,#-16*__SIZEOF_POINTER__]! 
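// Conversion out of Montgomery form: Montgomery-multiply by 1
// (__mul_by_1_mont_384), then a final conditional subtraction of
// the modulus.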
add x29,sp,#0 stp x19,x20,[sp,#2*__SIZEOF_POINTER__] stp x21,x22,[sp,#4*__SIZEOF_POINTER__] stp x23,x24,[sp,#6*__SIZEOF_POINTER__] stp x25,x26,[sp,#8*__SIZEOF_POINTER__] stp x27,x28,[sp,#10*__SIZEOF_POINTER__] mov x4,x3 // adjust for missing b_ptr ldp x5,x6,[x2] ldp x7,x8,[x2,#16] ldp x9,x10,[x2,#32] bl __mul_by_1_mont_384 ldr x30,[x29,#__SIZEOF_POINTER__] subs x19,x11,x5 sbcs x20,x12,x6 sbcs x21,x13,x7 sbcs x22,x14,x8 sbcs x23,x15,x9 sbcs x24,x16,x10 csel x11,x11,x19,lo csel x12,x12,x20,lo csel x13,x13,x21,lo csel x14,x14,x22,lo csel x15,x15,x23,lo csel x16,x16,x24,lo stp x11,x12,[x0] stp x13,x14,[x0,#16] stp x15,x16,[x0,#32] ldp x19,x20,[x29,#2*__SIZEOF_POINTER__] ldp x21,x22,[x29,#4*__SIZEOF_POINTER__] ldp x23,x24,[x29,#6*__SIZEOF_POINTER__] ldp x25,x26,[x29,#8*__SIZEOF_POINTER__] ldp x27,x28,[x29,#10*__SIZEOF_POINTER__] ldr x29,[sp],#16*__SIZEOF_POINTER__ hint #AUTI_HINT ret .size from_mont_384,.-from_mont_384 .type __mul_by_1_mont_384,%function .align 5 __mul_by_1_mont_384: ldp x11,x12,[x1] ldp x13,x14,[x1,#16] mul x26,x4,x11 ldp x15,x16,[x1,#32] // mul x19,x5,x26 mul x20,x6,x26 mul x21,x7,x26 mul x22,x8,x26 mul x23,x9,x26 mul x24,x10,x26 subs xzr,x11,#1 // adds x19,x19,x11 umulh x11,x5,x26 adcs x20,x20,x12 umulh x12,x6,x26 adcs x21,x21,x13 umulh x13,x7,x26 adcs x22,x22,x14 umulh x14,x8,x26 adcs x23,x23,x15 umulh x15,x9,x26 adcs x24,x24,x16 umulh x16,x10,x26 adc x25,xzr,xzr adds x11,x11,x20 adcs x12,x12,x21 adcs x13,x13,x22 mul x26,x4,x11 adcs x14,x14,x23 adcs x15,x15,x24 adc x16,x16,x25 // mul x19,x5,x26 mul x20,x6,x26 mul x21,x7,x26 mul x22,x8,x26 mul x23,x9,x26 mul x24,x10,x26 subs xzr,x11,#1 // adds x19,x19,x11 umulh x11,x5,x26 adcs x20,x20,x12 umulh x12,x6,x26 adcs x21,x21,x13 umulh x13,x7,x26 adcs x22,x22,x14 umulh x14,x8,x26 adcs x23,x23,x15 umulh x15,x9,x26 adcs x24,x24,x16 umulh x16,x10,x26 adc x25,xzr,xzr adds x11,x11,x20 adcs x12,x12,x21 adcs x13,x13,x22 mul x26,x4,x11 adcs x14,x14,x23 adcs x15,x15,x24 adc x16,x16,x25 // mul x19,x5,x26 mul x20,x6,x26 mul x21,x7,x26 mul x22,x8,x26 mul x23,x9,x26 mul x24,x10,x26 subs xzr,x11,#1 // adds x19,x19,x11 umulh x11,x5,x26 adcs x20,x20,x12 umulh x12,x6,x26 adcs x21,x21,x13 umulh x13,x7,x26 adcs x22,x22,x14 umulh x14,x8,x26 adcs x23,x23,x15 umulh x15,x9,x26 adcs x24,x24,x16 umulh x16,x10,x26 adc x25,xzr,xzr adds x11,x11,x20 adcs x12,x12,x21 adcs x13,x13,x22 mul x26,x4,x11 adcs x14,x14,x23 adcs x15,x15,x24 adc x16,x16,x25 // mul x19,x5,x26 mul x20,x6,x26 mul x21,x7,x26 mul x22,x8,x26 mul x23,x9,x26 mul x24,x10,x26 subs xzr,x11,#1 // adds x19,x19,x11 umulh x11,x5,x26 adcs x20,x20,x12 umulh x12,x6,x26 adcs x21,x21,x13 umulh x13,x7,x26 adcs x22,x22,x14 umulh x14,x8,x26 adcs x23,x23,x15 umulh x15,x9,x26 adcs x24,x24,x16 umulh x16,x10,x26 adc x25,xzr,xzr adds x11,x11,x20 adcs x12,x12,x21 adcs x13,x13,x22 mul x26,x4,x11 adcs x14,x14,x23 adcs x15,x15,x24 adc x16,x16,x25 // mul x19,x5,x26 mul x20,x6,x26 mul x21,x7,x26 mul x22,x8,x26 mul x23,x9,x26 mul x24,x10,x26 subs xzr,x11,#1 // adds x19,x19,x11 umulh x11,x5,x26 adcs x20,x20,x12 umulh x12,x6,x26 adcs x21,x21,x13 umulh x13,x7,x26 adcs x22,x22,x14 umulh x14,x8,x26 adcs x23,x23,x15 umulh x15,x9,x26 adcs x24,x24,x16 umulh x16,x10,x26 adc x25,xzr,xzr adds x11,x11,x20 adcs x12,x12,x21 adcs x13,x13,x22 mul x26,x4,x11 adcs x14,x14,x23 adcs x15,x15,x24 adc x16,x16,x25 // mul x19,x5,x26 mul x20,x6,x26 mul x21,x7,x26 mul x22,x8,x26 mul x23,x9,x26 mul x24,x10,x26 subs xzr,x11,#1 // adds x19,x19,x11 umulh x11,x5,x26 adcs x20,x20,x12 umulh x12,x6,x26 adcs x21,x21,x13 umulh x13,x7,x26 adcs x22,x22,x14 
umulh x14,x8,x26 adcs x23,x23,x15 umulh x15,x9,x26 adcs x24,x24,x16 umulh x16,x10,x26 adc x25,xzr,xzr adds x11,x11,x20 adcs x12,x12,x21 adcs x13,x13,x22 adcs x14,x14,x23 adcs x15,x15,x24 adc x16,x16,x25 ret .size __mul_by_1_mont_384,.-__mul_by_1_mont_384 .type __redc_tail_mont_384,%function .align 5 __redc_tail_mont_384: ldp x19,x20,[x1,#48] ldp x21,x22,[x1,#64] ldp x23,x24,[x1,#80] adds x11,x11,x19 // accumulate upper half adcs x12,x12,x20 adcs x13,x13,x21 adcs x14,x14,x22 adcs x15,x15,x23 adcs x16,x16,x24 adc x25,xzr,xzr subs x19,x11,x5 sbcs x20,x12,x6 sbcs x21,x13,x7 sbcs x22,x14,x8 sbcs x23,x15,x9 sbcs x24,x16,x10 sbcs xzr,x25,xzr csel x11,x11,x19,lo csel x12,x12,x20,lo csel x13,x13,x21,lo csel x14,x14,x22,lo csel x15,x15,x23,lo csel x16,x16,x24,lo stp x11,x12,[x0] stp x13,x14,[x0,#16] stp x15,x16,[x0,#32] ret .size __redc_tail_mont_384,.-__redc_tail_mont_384 .globl mul_384 .hidden mul_384 .type mul_384,%function .align 5 mul_384: hint #PACI_HINT stp x29,x30,[sp,#-16*__SIZEOF_POINTER__]! add x29,sp,#0 stp x19,x20,[sp,#2*__SIZEOF_POINTER__] stp x21,x22,[sp,#4*__SIZEOF_POINTER__] stp x23,x24,[sp,#6*__SIZEOF_POINTER__] stp x25,x26,[sp,#8*__SIZEOF_POINTER__] stp x27,x28,[sp,#10*__SIZEOF_POINTER__] bl __mul_384 ldr x30,[x29,#__SIZEOF_POINTER__] ldp x19,x20,[x29,#2*__SIZEOF_POINTER__] ldp x21,x22,[x29,#4*__SIZEOF_POINTER__] ldp x23,x24,[x29,#6*__SIZEOF_POINTER__] ldp x25,x26,[x29,#8*__SIZEOF_POINTER__] ldp x27,x28,[x29,#10*__SIZEOF_POINTER__] ldr x29,[sp],#16*__SIZEOF_POINTER__ hint #AUTI_HINT ret .size mul_384,.-mul_384 .type __mul_384,%function .align 5 __mul_384: ldp x11,x12,[x1] ldr x17, [x2] ldp x13,x14,[x1,#16] ldp x15,x16,[x1,#32] mul x19,x11,x17 mul x20,x12,x17 mul x21,x13,x17 mul x22,x14,x17 mul x23,x15,x17 mul x24,x16,x17 umulh x5,x11,x17 umulh x6,x12,x17 umulh x7,x13,x17 umulh x8,x14,x17 umulh x9,x15,x17 umulh x10,x16,x17 ldr x17,[x2,8*1] str x19,[x0] adds x19,x20,x5 mul x5,x11,x17 adcs x20,x21,x6 mul x6,x12,x17 adcs x21,x22,x7 mul x7,x13,x17 adcs x22,x23,x8 mul x8,x14,x17 adcs x23,x24,x9 mul x9,x15,x17 adc x24,xzr, x10 mul x10,x16,x17 adds x19,x19,x5 umulh x5,x11,x17 adcs x20,x20,x6 umulh x6,x12,x17 adcs x21,x21,x7 umulh x7,x13,x17 adcs x22,x22,x8 umulh x8,x14,x17 adcs x23,x23,x9 umulh x9,x15,x17 adcs x24,x24,x10 umulh x10,x16,x17 ldr x17,[x2,#8*(1+1)] adc x25,xzr,xzr str x19,[x0,8*1] adds x19,x20,x5 mul x5,x11,x17 adcs x20,x21,x6 mul x6,x12,x17 adcs x21,x22,x7 mul x7,x13,x17 adcs x22,x23,x8 mul x8,x14,x17 adcs x23,x24,x9 mul x9,x15,x17 adc x24,x25,x10 mul x10,x16,x17 adds x19,x19,x5 umulh x5,x11,x17 adcs x20,x20,x6 umulh x6,x12,x17 adcs x21,x21,x7 umulh x7,x13,x17 adcs x22,x22,x8 umulh x8,x14,x17 adcs x23,x23,x9 umulh x9,x15,x17 adcs x24,x24,x10 umulh x10,x16,x17 ldr x17,[x2,#8*(2+1)] adc x25,xzr,xzr str x19,[x0,8*2] adds x19,x20,x5 mul x5,x11,x17 adcs x20,x21,x6 mul x6,x12,x17 adcs x21,x22,x7 mul x7,x13,x17 adcs x22,x23,x8 mul x8,x14,x17 adcs x23,x24,x9 mul x9,x15,x17 adc x24,x25,x10 mul x10,x16,x17 adds x19,x19,x5 umulh x5,x11,x17 adcs x20,x20,x6 umulh x6,x12,x17 adcs x21,x21,x7 umulh x7,x13,x17 adcs x22,x22,x8 umulh x8,x14,x17 adcs x23,x23,x9 umulh x9,x15,x17 adcs x24,x24,x10 umulh x10,x16,x17 ldr x17,[x2,#8*(3+1)] adc x25,xzr,xzr str x19,[x0,8*3] adds x19,x20,x5 mul x5,x11,x17 adcs x20,x21,x6 mul x6,x12,x17 adcs x21,x22,x7 mul x7,x13,x17 adcs x22,x23,x8 mul x8,x14,x17 adcs x23,x24,x9 mul x9,x15,x17 adc x24,x25,x10 mul x10,x16,x17 adds x19,x19,x5 umulh x5,x11,x17 adcs x20,x20,x6 umulh x6,x12,x17 adcs x21,x21,x7 umulh x7,x13,x17 adcs x22,x22,x8 umulh x8,x14,x17 adcs x23,x23,x9 
umulh x9,x15,x17 adcs x24,x24,x10 umulh x10,x16,x17 ldr x17,[x2,#8*(4+1)] adc x25,xzr,xzr str x19,[x0,8*4] adds x19,x20,x5 mul x5,x11,x17 adcs x20,x21,x6 mul x6,x12,x17 adcs x21,x22,x7 mul x7,x13,x17 adcs x22,x23,x8 mul x8,x14,x17 adcs x23,x24,x9 mul x9,x15,x17 adc x24,x25,x10 mul x10,x16,x17 adds x19,x19,x5 umulh x5,x11,x17 adcs x20,x20,x6 umulh x6,x12,x17 adcs x21,x21,x7 umulh x7,x13,x17 adcs x22,x22,x8 umulh x8,x14,x17 adcs x23,x23,x9 umulh x9,x15,x17 adcs x24,x24,x10 umulh x10,x16,x17 adc x25,xzr,xzr str x19,[x0,8*5] adds x19,x20,x5 adcs x20,x21,x6 adcs x21,x22,x7 adcs x22,x23,x8 adcs x23,x24,x9 adc x24,x25,x10 stp x19,x20,[x0,#48] stp x21,x22,[x0,#64] stp x23,x24,[x0,#80] ret .size __mul_384,.-__mul_384 .globl mul_382x .hidden mul_382x .type mul_382x,%function .align 5 mul_382x: hint #PACI_HINT stp x29,x30,[sp,#-16*__SIZEOF_POINTER__]! add x29,sp,#0 stp x19,x20,[sp,#2*__SIZEOF_POINTER__] stp x21,x22,[sp,#4*__SIZEOF_POINTER__] stp x23,x24,[sp,#6*__SIZEOF_POINTER__] stp x25,x26,[sp,#8*__SIZEOF_POINTER__] stp x27,x28,[sp,#10*__SIZEOF_POINTER__] sub sp,sp,#96 // space for two 384-bit vectors ldp x11,x12,[x1] mov x26,x0 // save r_ptr ldp x19,x20,[x1,#48] mov x27,x1 // save a_ptr ldp x13,x14,[x1,#16] mov x28,x2 // save b_ptr ldp x21,x22,[x1,#64] ldp x15,x16,[x1,#32] adds x5,x11,x19 // t0 = a->re + a->im ldp x23,x24,[x1,#80] adcs x6,x12,x20 ldp x11,x12,[x2] adcs x7,x13,x21 ldp x19,x20,[x2,#48] adcs x8,x14,x22 ldp x13,x14,[x2,#16] adcs x9,x15,x23 ldp x21,x22,[x2,#64] adc x10,x16,x24 ldp x15,x16,[x2,#32] stp x5,x6,[sp] adds x5,x11,x19 // t1 = b->re + b->im ldp x23,x24,[x2,#80] adcs x6,x12,x20 stp x7,x8,[sp,#16] adcs x7,x13,x21 adcs x8,x14,x22 stp x9,x10,[sp,#32] adcs x9,x15,x23 stp x5,x6,[sp,#48] adc x10,x16,x24 stp x7,x8,[sp,#64] stp x9,x10,[sp,#80] bl __mul_384 // mul_384(ret->re, a->re, b->re) add x1,sp,#0 add x2,sp,#48 add x0,x26,#96 bl __mul_384 add x1,x27,#48 add x2,x28,#48 add x0,sp,#0 bl __mul_384 ldp x5,x6,[x3] ldp x7,x8,[x3,#16] ldp x9,x10,[x3,#32] add x1,x26,#96 add x2,sp,#0 add x0,x26,#96 bl __sub_mod_384x384 add x2,x26,#0 bl __sub_mod_384x384 add x1,x26,#0 add x2,sp,#0 add x0,x26,#0 bl __sub_mod_384x384 ldr x30,[x29,#__SIZEOF_POINTER__] add sp,sp,#96 ldp x19,x20,[x29,#2*__SIZEOF_POINTER__] ldp x21,x22,[x29,#4*__SIZEOF_POINTER__] ldp x23,x24,[x29,#6*__SIZEOF_POINTER__] ldp x25,x26,[x29,#8*__SIZEOF_POINTER__] ldp x27,x28,[x29,#10*__SIZEOF_POINTER__] ldr x29,[sp],#16*__SIZEOF_POINTER__ hint #AUTI_HINT ret .size mul_382x,.-mul_382x .globl sqr_382x .hidden sqr_382x .type sqr_382x,%function .align 5 sqr_382x: hint #PACI_HINT stp x29,x30,[sp,#-16*__SIZEOF_POINTER__]! 
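// Non-reduced Fp2 squaring: forms re+im and re-im (patching the
// subtraction with a masked modulus add on borrow), multiplies them
// into the double-width halves, and doubles the cross product in place.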
add x29,sp,#0 stp x19,x20,[sp,#2*__SIZEOF_POINTER__] stp x21,x22,[sp,#4*__SIZEOF_POINTER__] stp x23,x24,[sp,#6*__SIZEOF_POINTER__] stp x25,x26,[sp,#8*__SIZEOF_POINTER__] stp x27,x28,[sp,#10*__SIZEOF_POINTER__] ldp x11,x12,[x1] ldp x19,x20,[x1,#48] ldp x13,x14,[x1,#16] adds x5,x11,x19 // t0 = a->re + a->im ldp x21,x22,[x1,#64] adcs x6,x12,x20 ldp x15,x16,[x1,#32] adcs x7,x13,x21 ldp x23,x24,[x1,#80] adcs x8,x14,x22 stp x5,x6,[x0] adcs x9,x15,x23 ldp x5,x6,[x2] adc x10,x16,x24 stp x7,x8,[x0,#16] subs x11,x11,x19 // t1 = a->re - a->im ldp x7,x8,[x2,#16] sbcs x12,x12,x20 stp x9,x10,[x0,#32] sbcs x13,x13,x21 ldp x9,x10,[x2,#32] sbcs x14,x14,x22 sbcs x15,x15,x23 sbcs x16,x16,x24 sbc x25,xzr,xzr and x19,x5,x25 and x20,x6,x25 adds x11,x11,x19 and x21,x7,x25 adcs x12,x12,x20 and x22,x8,x25 adcs x13,x13,x21 and x23,x9,x25 adcs x14,x14,x22 and x24,x10,x25 adcs x15,x15,x23 stp x11,x12,[x0,#48] adc x16,x16,x24 stp x13,x14,[x0,#64] stp x15,x16,[x0,#80] mov x4,x1 // save a_ptr add x1,x0,#0 add x2,x0,#48 bl __mul_384 add x1,x4,#0 add x2,x4,#48 add x0,x0,#96 bl __mul_384 ldr x30,[x29,#__SIZEOF_POINTER__] ldp x11,x12,[x0] ldp x13,x14,[x0,#16] adds x11,x11,x11 // add with itself ldp x15,x16,[x0,#32] adcs x12,x12,x12 adcs x13,x13,x13 adcs x14,x14,x14 adcs x15,x15,x15 adcs x16,x16,x16 adcs x19,x19,x19 adcs x20,x20,x20 stp x11,x12,[x0] adcs x21,x21,x21 stp x13,x14,[x0,#16] adcs x22,x22,x22 stp x15,x16,[x0,#32] adcs x23,x23,x23 stp x19,x20,[x0,#48] adc x24,x24,x24 stp x21,x22,[x0,#64] stp x23,x24,[x0,#80] ldp x19,x20,[x29,#2*__SIZEOF_POINTER__] ldp x21,x22,[x29,#4*__SIZEOF_POINTER__] ldp x23,x24,[x29,#6*__SIZEOF_POINTER__] ldp x25,x26,[x29,#8*__SIZEOF_POINTER__] ldp x27,x28,[x29,#10*__SIZEOF_POINTER__] ldr x29,[sp],#16*__SIZEOF_POINTER__ hint #AUTI_HINT ret .size sqr_382x,.-sqr_382x .globl sqr_mont_382x .hidden sqr_mont_382x .type sqr_mont_382x,%function .align 5 sqr_mont_382x: hint #PACI_HINT stp x29,x30,[sp,#-16*__SIZEOF_POINTER__]! 
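// Same (re+im)*(re-im) / 2*re*im split as sqr_mont_384x, but with
// non-reduced inner products (__mul_mont_383_nonred); the borrow from
// re-im is saved at [sp,#96] and compensated after the second multiply.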
	add	x29,sp,#0
	stp	x19,x20,[sp,#2*__SIZEOF_POINTER__]
	stp	x21,x22,[sp,#4*__SIZEOF_POINTER__]
	stp	x23,x24,[sp,#6*__SIZEOF_POINTER__]
	stp	x25,x26,[sp,#8*__SIZEOF_POINTER__]
	stp	x27,x28,[sp,#10*__SIZEOF_POINTER__]
	stp	x3,x0,[sp,#12*__SIZEOF_POINTER__]	// __mul_mont_384 wants them there
	sub	sp,sp,#112		// space for two 384-bit vectors + word
	mov	x4,x3			// adjust for missing b_ptr

	ldp	x11,x12,[x1]
	ldp	x13,x14,[x1,#16]
	ldp	x15,x16,[x1,#32]

	ldp	x17,x20,[x1,#48]
	ldp	x21,x22,[x1,#64]
	ldp	x23,x24,[x1,#80]

	adds	x5,x11,x17	// t0 = a->re + a->im
	adcs	x6,x12,x20
	adcs	x7,x13,x21
	adcs	x8,x14,x22
	adcs	x9,x15,x23
	adc	x10,x16,x24

	subs	x19,x11,x17	// t1 = a->re - a->im
	sbcs	x20,x12,x20
	sbcs	x21,x13,x21
	sbcs	x22,x14,x22
	sbcs	x23,x15,x23
	sbcs	x24,x16,x24
	sbc	x25,xzr,xzr	// borrow flag as mask

	stp	x5,x6,[sp]
	stp	x7,x8,[sp,#16]
	stp	x9,x10,[sp,#32]
	stp	x19,x20,[sp,#48]
	stp	x21,x22,[sp,#64]
	stp	x23,x24,[sp,#80]
	str	x25,[sp,#96]

	ldp	x5,x6,[x2]
	ldp	x7,x8,[x2,#16]
	ldp	x9,x10,[x2,#32]

	add	x2,x1,#48
	bl	__mul_mont_383_nonred	// mul_mont_384(ret->im, a->re, a->im)

	adds	x19,x11,x11	// add with itself
	adcs	x20,x12,x12
	adcs	x21,x13,x13
	adcs	x22,x14,x14
	adcs	x23,x15,x15
	adc	x24,x16,x16

	stp	x19,x20,[x2,#48]
	stp	x21,x22,[x2,#64]
	stp	x23,x24,[x2,#80]

	ldp	x11,x12,[sp]
	ldr	x17,[sp,#48]
	ldp	x13,x14,[sp,#16]
	ldp	x15,x16,[sp,#32]

	add	x2,sp,#48
	bl	__mul_mont_383_nonred	// mul_mont_384(ret->re, t0, t1)
	ldr	x30,[x29,#__SIZEOF_POINTER__]

	ldr	x25,[sp,#96]	// account for sign from a->re - a->im
	ldp	x19,x20,[sp]
	ldp	x21,x22,[sp,#16]
	ldp	x23,x24,[sp,#32]

	and	x19,x19,x25
	and	x20,x20,x25
	and	x21,x21,x25
	and	x22,x22,x25
	and	x23,x23,x25
	and	x24,x24,x25

	subs	x11,x11,x19
	sbcs	x12,x12,x20
	sbcs	x13,x13,x21
	sbcs	x14,x14,x22
	sbcs	x15,x15,x23
	sbcs	x16,x16,x24
	sbc	x25,xzr,xzr

	and	x19,x5,x25
	and	x20,x6,x25
	and	x21,x7,x25
	and	x22,x8,x25
	and	x23,x9,x25
	and	x24,x10,x25

	adds	x11,x11,x19
	adcs	x12,x12,x20
	adcs	x13,x13,x21
	adcs	x14,x14,x22
	adcs	x15,x15,x23
	adc	x16,x16,x24

	stp	x11,x12,[x2]
	stp	x13,x14,[x2,#16]
	stp	x15,x16,[x2,#32]

	add	sp,sp,#112
	ldp	x19,x20,[x29,#2*__SIZEOF_POINTER__]
	ldp	x21,x22,[x29,#4*__SIZEOF_POINTER__]
	ldp	x23,x24,[x29,#6*__SIZEOF_POINTER__]
	ldp	x25,x26,[x29,#8*__SIZEOF_POINTER__]
	ldp	x27,x28,[x29,#10*__SIZEOF_POINTER__]
	ldr	x29,[sp],#16*__SIZEOF_POINTER__
	hint	#AUTI_HINT
	ret
.size	sqr_mont_382x,.-sqr_mont_382x

.type	__mul_mont_383_nonred,%function
.align	5
__mul_mont_383_nonred:
	mul	x19,x11,x17
	mul	x20,x12,x17
	mul	x21,x13,x17
	mul	x22,x14,x17
	mul	x23,x15,x17
	mul	x24,x16,x17
	mul	x4,x4,x19

	umulh	x26,x11,x17
	umulh	x27,x12,x17
	umulh	x28,x13,x17
	umulh	x0,x14,x17
	umulh	x1,x15,x17
	umulh	x3,x16,x17

	adds	x20,x20,x26
	mul	x26,x5,x4
	adcs	x21,x21,x27
	mul	x27,x6,x4
	adcs	x22,x22,x28
	mul	x28,x7,x4
	adcs	x23,x23,x0
	mul	x0,x8,x4
	adcs	x24,x24,x1
	mul	x1,x9,x4
	adc	x25,xzr,x3
	mul	x3,x10,x4
	ldr	x17,[x2,8*1]

	adds	x19,x19,x26
	umulh	x26,x5,x4
	adcs	x20,x20,x27
	umulh	x27,x6,x4
	adcs	x21,x21,x28
	umulh	x28,x7,x4
	adcs	x22,x22,x0
	umulh	x0,x8,x4
	adcs	x23,x23,x1
	umulh	x1,x9,x4
	adcs	x24,x24,x3
	umulh	x3,x10,x4
	adc	x25,x25,xzr
	ldr	x4,[x29,#12*__SIZEOF_POINTER__]

	adds	x19,x20,x26
	mul	x26,x11,x17
	adcs	x20,x21,x27
	mul	x27,x12,x17
	adcs	x21,x22,x28
	mul	x28,x13,x17
	adcs	x22,x23,x0
	mul	x0,x14,x17
	adcs	x23,x24,x1
	mul	x1,x15,x17
	adcs	x24,x25,x3
	mul	x3,x16,x17
	adc	x25,xzr,xzr

	adds	x19,x19,x26
	umulh	x26,x11,x17
	adcs	x20,x20,x27
	umulh	x27,x12,x17
	adcs	x21,x21,x28
	mul	x4,x4,x19
	umulh	x28,x13,x17
	adcs	x22,x22,x0
	umulh	x0,x14,x17
	adcs	x23,x23,x1
	umulh	x1,x15,x17
	adcs	x24,x24,x3
	umulh	x3,x16,x17
	adc	x25,x25,xzr

	adds	x20,x20,x26
	mul	x26,x5,x4
	adcs	x21,x21,x27
	mul	x27,x6,x4
	adcs	x22,x22,x28
	mul
x28,x7,x4 adcs x23,x23,x0 mul x0,x8,x4 adcs x24,x24,x1 mul x1,x9,x4 adc x25,x25,x3 mul x3,x10,x4 ldr x17,[x2,8*2] adds x19,x19,x26 umulh x26,x5,x4 adcs x20,x20,x27 umulh x27,x6,x4 adcs x21,x21,x28 umulh x28,x7,x4 adcs x22,x22,x0 umulh x0,x8,x4 adcs x23,x23,x1 umulh x1,x9,x4 adcs x24,x24,x3 umulh x3,x10,x4 adc x25,x25,xzr ldr x4,[x29,#12*__SIZEOF_POINTER__] adds x19,x20,x26 mul x26,x11,x17 adcs x20,x21,x27 mul x27,x12,x17 adcs x21,x22,x28 mul x28,x13,x17 adcs x22,x23,x0 mul x0,x14,x17 adcs x23,x24,x1 mul x1,x15,x17 adcs x24,x25,x3 mul x3,x16,x17 adc x25,xzr,xzr adds x19,x19,x26 umulh x26,x11,x17 adcs x20,x20,x27 umulh x27,x12,x17 adcs x21,x21,x28 mul x4,x4,x19 umulh x28,x13,x17 adcs x22,x22,x0 umulh x0,x14,x17 adcs x23,x23,x1 umulh x1,x15,x17 adcs x24,x24,x3 umulh x3,x16,x17 adc x25,x25,xzr adds x20,x20,x26 mul x26,x5,x4 adcs x21,x21,x27 mul x27,x6,x4 adcs x22,x22,x28 mul x28,x7,x4 adcs x23,x23,x0 mul x0,x8,x4 adcs x24,x24,x1 mul x1,x9,x4 adc x25,x25,x3 mul x3,x10,x4 ldr x17,[x2,8*3] adds x19,x19,x26 umulh x26,x5,x4 adcs x20,x20,x27 umulh x27,x6,x4 adcs x21,x21,x28 umulh x28,x7,x4 adcs x22,x22,x0 umulh x0,x8,x4 adcs x23,x23,x1 umulh x1,x9,x4 adcs x24,x24,x3 umulh x3,x10,x4 adc x25,x25,xzr ldr x4,[x29,#12*__SIZEOF_POINTER__] adds x19,x20,x26 mul x26,x11,x17 adcs x20,x21,x27 mul x27,x12,x17 adcs x21,x22,x28 mul x28,x13,x17 adcs x22,x23,x0 mul x0,x14,x17 adcs x23,x24,x1 mul x1,x15,x17 adcs x24,x25,x3 mul x3,x16,x17 adc x25,xzr,xzr adds x19,x19,x26 umulh x26,x11,x17 adcs x20,x20,x27 umulh x27,x12,x17 adcs x21,x21,x28 mul x4,x4,x19 umulh x28,x13,x17 adcs x22,x22,x0 umulh x0,x14,x17 adcs x23,x23,x1 umulh x1,x15,x17 adcs x24,x24,x3 umulh x3,x16,x17 adc x25,x25,xzr adds x20,x20,x26 mul x26,x5,x4 adcs x21,x21,x27 mul x27,x6,x4 adcs x22,x22,x28 mul x28,x7,x4 adcs x23,x23,x0 mul x0,x8,x4 adcs x24,x24,x1 mul x1,x9,x4 adc x25,x25,x3 mul x3,x10,x4 ldr x17,[x2,8*4] adds x19,x19,x26 umulh x26,x5,x4 adcs x20,x20,x27 umulh x27,x6,x4 adcs x21,x21,x28 umulh x28,x7,x4 adcs x22,x22,x0 umulh x0,x8,x4 adcs x23,x23,x1 umulh x1,x9,x4 adcs x24,x24,x3 umulh x3,x10,x4 adc x25,x25,xzr ldr x4,[x29,#12*__SIZEOF_POINTER__] adds x19,x20,x26 mul x26,x11,x17 adcs x20,x21,x27 mul x27,x12,x17 adcs x21,x22,x28 mul x28,x13,x17 adcs x22,x23,x0 mul x0,x14,x17 adcs x23,x24,x1 mul x1,x15,x17 adcs x24,x25,x3 mul x3,x16,x17 adc x25,xzr,xzr adds x19,x19,x26 umulh x26,x11,x17 adcs x20,x20,x27 umulh x27,x12,x17 adcs x21,x21,x28 mul x4,x4,x19 umulh x28,x13,x17 adcs x22,x22,x0 umulh x0,x14,x17 adcs x23,x23,x1 umulh x1,x15,x17 adcs x24,x24,x3 umulh x3,x16,x17 adc x25,x25,xzr adds x20,x20,x26 mul x26,x5,x4 adcs x21,x21,x27 mul x27,x6,x4 adcs x22,x22,x28 mul x28,x7,x4 adcs x23,x23,x0 mul x0,x8,x4 adcs x24,x24,x1 mul x1,x9,x4 adc x25,x25,x3 mul x3,x10,x4 ldr x17,[x2,8*5] adds x19,x19,x26 umulh x26,x5,x4 adcs x20,x20,x27 umulh x27,x6,x4 adcs x21,x21,x28 umulh x28,x7,x4 adcs x22,x22,x0 umulh x0,x8,x4 adcs x23,x23,x1 umulh x1,x9,x4 adcs x24,x24,x3 umulh x3,x10,x4 adc x25,x25,xzr ldr x4,[x29,#12*__SIZEOF_POINTER__] adds x19,x20,x26 mul x26,x11,x17 adcs x20,x21,x27 mul x27,x12,x17 adcs x21,x22,x28 mul x28,x13,x17 adcs x22,x23,x0 mul x0,x14,x17 adcs x23,x24,x1 mul x1,x15,x17 adcs x24,x25,x3 mul x3,x16,x17 adc x25,xzr,xzr adds x19,x19,x26 umulh x26,x11,x17 adcs x20,x20,x27 umulh x27,x12,x17 adcs x21,x21,x28 mul x4,x4,x19 umulh x28,x13,x17 adcs x22,x22,x0 umulh x0,x14,x17 adcs x23,x23,x1 umulh x1,x15,x17 adcs x24,x24,x3 umulh x3,x16,x17 adc x25,x25,xzr adds x20,x20,x26 mul x26,x5,x4 adcs x21,x21,x27 mul x27,x6,x4 adcs x22,x22,x28 mul x28,x7,x4 adcs 
x23,x23,x0 mul x0,x8,x4 adcs x24,x24,x1 mul x1,x9,x4 adc x25,x25,x3 mul x3,x10,x4 adds x19,x19,x26 umulh x26,x5,x4 adcs x20,x20,x27 umulh x27,x6,x4 adcs x21,x21,x28 umulh x28,x7,x4 adcs x22,x22,x0 umulh x0,x8,x4 adcs x23,x23,x1 umulh x1,x9,x4 adcs x24,x24,x3 umulh x3,x10,x4 adc x25,x25,xzr ldp x4,x2,[x29,#12*__SIZEOF_POINTER__] // pull r_ptr adds x11,x20,x26 adcs x12,x21,x27 adcs x13,x22,x28 adcs x14,x23,x0 adcs x15,x24,x1 adcs x16,x25,x3 ret .size __mul_mont_383_nonred,.-__mul_mont_383_nonred .globl sgn0_pty_mont_384 .hidden sgn0_pty_mont_384 .type sgn0_pty_mont_384,%function .align 5 sgn0_pty_mont_384: hint #PACI_HINT stp x29,x30,[sp,#-16*__SIZEOF_POINTER__]! add x29,sp,#0 stp x19,x20,[sp,#2*__SIZEOF_POINTER__] stp x21,x22,[sp,#4*__SIZEOF_POINTER__] stp x23,x24,[sp,#6*__SIZEOF_POINTER__] stp x25,x26,[sp,#8*__SIZEOF_POINTER__] stp x27,x28,[sp,#10*__SIZEOF_POINTER__] mov x4,x2 ldp x5,x6,[x1] ldp x7,x8,[x1,#16] ldp x9,x10,[x1,#32] mov x1,x0 bl __mul_by_1_mont_384 ldr x30,[x29,#__SIZEOF_POINTER__] and x0,x11,#1 adds x11,x11,x11 adcs x12,x12,x12 adcs x13,x13,x13 adcs x14,x14,x14 adcs x15,x15,x15 adcs x16,x16,x16 adc x17,xzr,xzr subs x11,x11,x5 sbcs x12,x12,x6 sbcs x13,x13,x7 sbcs x14,x14,x8 sbcs x15,x15,x9 sbcs x16,x16,x10 sbc x17,x17,xzr mvn x17,x17 and x17,x17,#2 orr x0,x0,x17 ldp x19,x20,[x29,#2*__SIZEOF_POINTER__] ldp x21,x22,[x29,#4*__SIZEOF_POINTER__] ldp x23,x24,[x29,#6*__SIZEOF_POINTER__] ldp x25,x26,[x29,#8*__SIZEOF_POINTER__] ldp x27,x28,[x29,#10*__SIZEOF_POINTER__] ldr x29,[sp],#16*__SIZEOF_POINTER__ hint #AUTI_HINT ret .size sgn0_pty_mont_384,.-sgn0_pty_mont_384 .globl sgn0_pty_mont_384x .hidden sgn0_pty_mont_384x .type sgn0_pty_mont_384x,%function .align 5 sgn0_pty_mont_384x: hint #PACI_HINT stp x29,x30,[sp,#-16*__SIZEOF_POINTER__]! add x29,sp,#0 stp x19,x20,[sp,#2*__SIZEOF_POINTER__] stp x21,x22,[sp,#4*__SIZEOF_POINTER__] stp x23,x24,[sp,#6*__SIZEOF_POINTER__] stp x25,x26,[sp,#8*__SIZEOF_POINTER__] stp x27,x28,[sp,#10*__SIZEOF_POINTER__] mov x4,x2 ldp x5,x6,[x1] ldp x7,x8,[x1,#16] ldp x9,x10,[x1,#32] mov x1,x0 bl __mul_by_1_mont_384 add x1,x1,#48 and x2,x11,#1 orr x3,x11,x12 adds x11,x11,x11 orr x3,x3,x13 adcs x12,x12,x12 orr x3,x3,x14 adcs x13,x13,x13 orr x3,x3,x15 adcs x14,x14,x14 orr x3,x3,x16 adcs x15,x15,x15 adcs x16,x16,x16 adc x17,xzr,xzr subs x11,x11,x5 sbcs x12,x12,x6 sbcs x13,x13,x7 sbcs x14,x14,x8 sbcs x15,x15,x9 sbcs x16,x16,x10 sbc x17,x17,xzr mvn x17,x17 and x17,x17,#2 orr x2,x2,x17 bl __mul_by_1_mont_384 ldr x30,[x29,#__SIZEOF_POINTER__] and x0,x11,#1 orr x1,x11,x12 adds x11,x11,x11 orr x1,x1,x13 adcs x12,x12,x12 orr x1,x1,x14 adcs x13,x13,x13 orr x1,x1,x15 adcs x14,x14,x14 orr x1,x1,x16 adcs x15,x15,x15 adcs x16,x16,x16 adc x17,xzr,xzr subs x11,x11,x5 sbcs x12,x12,x6 sbcs x13,x13,x7 sbcs x14,x14,x8 sbcs x15,x15,x9 sbcs x16,x16,x10 sbc x17,x17,xzr mvn x17,x17 and x17,x17,#2 orr x0,x0,x17 cmp x3,#0 csel x3,x0,x2,eq // a->re==0? prty(a->im) : prty(a->re) cmp x1,#0 csel x1,x0,x2,ne // a->im!=0? 
sgn0(a->im) : sgn0(a->re) and x3,x3,#1 and x1,x1,#2 orr x0,x1,x3 // pack sign and parity ldp x19,x20,[x29,#2*__SIZEOF_POINTER__] ldp x21,x22,[x29,#4*__SIZEOF_POINTER__] ldp x23,x24,[x29,#6*__SIZEOF_POINTER__] ldp x25,x26,[x29,#8*__SIZEOF_POINTER__] ldp x27,x28,[x29,#10*__SIZEOF_POINTER__] ldr x29,[sp],#16*__SIZEOF_POINTER__ hint #AUTI_HINT ret .size sgn0_pty_mont_384x,.-sgn0_pty_mont_384x #if defined(__ARM_FEATURE_BTI_DEFAULT) || defined(__ARM_FEATURE_PAC_DEFAULT) .section .note.GNU-stack,"",@progbits .section .note.gnu.property,"a",@note .long 4,2f-1f,5 .byte 0x47,0x4E,0x55,0 1: .long 0xc0000000,4,3 .align 3 2: #endif ================================================ FILE: build/elf/mulq_mont_256-x86_64.s ================================================ .comm __blst_platform_cap,4 .text .globl mul_mont_sparse_256 .hidden mul_mont_sparse_256 .type mul_mont_sparse_256,@function .align 32 mul_mont_sparse_256: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa #ifdef __BLST_PORTABLE__ testl $1,__blst_platform_cap(%rip) jnz mul_mont_sparse_256$1 #endif pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 pushq %rbx .cfi_adjust_cfa_offset 8 .cfi_offset %rbx,-24 pushq %r12 .cfi_adjust_cfa_offset 8 .cfi_offset %r12,-32 pushq %r13 .cfi_adjust_cfa_offset 8 .cfi_offset %r13,-40 pushq %r14 .cfi_adjust_cfa_offset 8 .cfi_offset %r14,-48 pushq %r15 .cfi_adjust_cfa_offset 8 .cfi_offset %r15,-56 pushq %rdi .cfi_adjust_cfa_offset 8 movq 0(%rdx),%rax movq 0(%rsi),%r13 movq 8(%rsi),%r14 movq 16(%rsi),%r12 movq 24(%rsi),%rbp movq %rdx,%rbx movq %rax,%r15 mulq %r13 movq %rax,%r9 movq %r15,%rax movq %rdx,%r10 call __mulq_mont_sparse_256 movq 8(%rsp),%r15 .cfi_restore %r15 movq 16(%rsp),%r14 .cfi_restore %r14 movq 24(%rsp),%r13 .cfi_restore %r13 movq 32(%rsp),%r12 .cfi_restore %r12 movq 40(%rsp),%rbx .cfi_restore %rbx movq 48(%rsp),%rbp .cfi_restore %rbp leaq 56(%rsp),%rsp .cfi_adjust_cfa_offset -56 #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .size mul_mont_sparse_256,.-mul_mont_sparse_256 .globl sqr_mont_sparse_256 .hidden sqr_mont_sparse_256 .type sqr_mont_sparse_256,@function .align 32 sqr_mont_sparse_256: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa #ifdef __BLST_PORTABLE__ testl $1,__blst_platform_cap(%rip) jnz sqr_mont_sparse_256$1 #endif pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 pushq %rbx .cfi_adjust_cfa_offset 8 .cfi_offset %rbx,-24 pushq %r12 .cfi_adjust_cfa_offset 8 .cfi_offset %r12,-32 pushq %r13 .cfi_adjust_cfa_offset 8 .cfi_offset %r13,-40 pushq %r14 .cfi_adjust_cfa_offset 8 .cfi_offset %r14,-48 pushq %r15 .cfi_adjust_cfa_offset 8 .cfi_offset %r15,-56 pushq %rdi .cfi_adjust_cfa_offset 8 movq 0(%rsi),%rax movq %rcx,%r8 movq 8(%rsi),%r14 movq %rdx,%rcx movq 16(%rsi),%r12 leaq (%rsi),%rbx movq 24(%rsi),%rbp movq %rax,%r15 mulq %rax movq %rax,%r9 movq %r15,%rax movq %rdx,%r10 call __mulq_mont_sparse_256 movq 8(%rsp),%r15 .cfi_restore %r15 movq 16(%rsp),%r14 .cfi_restore %r14 movq 24(%rsp),%r13 .cfi_restore %r13 movq 32(%rsp),%r12 .cfi_restore %r12 movq 40(%rsp),%rbx .cfi_restore %rbx movq 48(%rsp),%rbp .cfi_restore %rbp leaq 56(%rsp),%rsp .cfi_adjust_cfa_offset -56 #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .size sqr_mont_sparse_256,.-sqr_mont_sparse_256 .type __mulq_mont_sparse_256,@function .align 32 __mulq_mont_sparse_256: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa mulq %r14 addq %rax,%r10 movq %r15,%rax adcq $0,%rdx movq %rdx,%r11 mulq %r12 addq %rax,%r11 movq %r15,%rax adcq 
$0,%rdx movq %rdx,%r12 mulq %rbp addq %rax,%r12 movq 8(%rbx),%rax adcq $0,%rdx xorq %r14,%r14 movq %rdx,%r13 movq %r9,%rdi imulq %r8,%r9 movq %rax,%r15 mulq 0(%rsi) addq %rax,%r10 movq %r15,%rax adcq $0,%rdx movq %rdx,%rbp mulq 8(%rsi) addq %rax,%r11 movq %r15,%rax adcq $0,%rdx addq %rbp,%r11 adcq $0,%rdx movq %rdx,%rbp mulq 16(%rsi) addq %rax,%r12 movq %r15,%rax adcq $0,%rdx addq %rbp,%r12 adcq $0,%rdx movq %rdx,%rbp mulq 24(%rsi) addq %rax,%r13 movq %r9,%rax adcq $0,%rdx addq %rbp,%r13 adcq %rdx,%r14 xorq %r15,%r15 mulq 0(%rcx) addq %rax,%rdi movq %r9,%rax adcq %rdx,%rdi mulq 8(%rcx) addq %rax,%r10 movq %r9,%rax adcq $0,%rdx addq %rdi,%r10 adcq $0,%rdx movq %rdx,%rbp mulq 16(%rcx) addq %rax,%r11 movq %r9,%rax adcq $0,%rdx addq %rbp,%r11 adcq $0,%rdx movq %rdx,%rbp mulq 24(%rcx) addq %rax,%r12 movq 16(%rbx),%rax adcq $0,%rdx addq %rbp,%r12 adcq $0,%rdx addq %rdx,%r13 adcq $0,%r14 adcq $0,%r15 movq %r10,%rdi imulq %r8,%r10 movq %rax,%r9 mulq 0(%rsi) addq %rax,%r11 movq %r9,%rax adcq $0,%rdx movq %rdx,%rbp mulq 8(%rsi) addq %rax,%r12 movq %r9,%rax adcq $0,%rdx addq %rbp,%r12 adcq $0,%rdx movq %rdx,%rbp mulq 16(%rsi) addq %rax,%r13 movq %r9,%rax adcq $0,%rdx addq %rbp,%r13 adcq $0,%rdx movq %rdx,%rbp mulq 24(%rsi) addq %rax,%r14 movq %r10,%rax adcq $0,%rdx addq %rbp,%r14 adcq %rdx,%r15 xorq %r9,%r9 mulq 0(%rcx) addq %rax,%rdi movq %r10,%rax adcq %rdx,%rdi mulq 8(%rcx) addq %rax,%r11 movq %r10,%rax adcq $0,%rdx addq %rdi,%r11 adcq $0,%rdx movq %rdx,%rbp mulq 16(%rcx) addq %rax,%r12 movq %r10,%rax adcq $0,%rdx addq %rbp,%r12 adcq $0,%rdx movq %rdx,%rbp mulq 24(%rcx) addq %rax,%r13 movq 24(%rbx),%rax adcq $0,%rdx addq %rbp,%r13 adcq $0,%rdx addq %rdx,%r14 adcq $0,%r15 adcq $0,%r9 movq %r11,%rdi imulq %r8,%r11 movq %rax,%r10 mulq 0(%rsi) addq %rax,%r12 movq %r10,%rax adcq $0,%rdx movq %rdx,%rbp mulq 8(%rsi) addq %rax,%r13 movq %r10,%rax adcq $0,%rdx addq %rbp,%r13 adcq $0,%rdx movq %rdx,%rbp mulq 16(%rsi) addq %rax,%r14 movq %r10,%rax adcq $0,%rdx addq %rbp,%r14 adcq $0,%rdx movq %rdx,%rbp mulq 24(%rsi) addq %rax,%r15 movq %r11,%rax adcq $0,%rdx addq %rbp,%r15 adcq %rdx,%r9 xorq %r10,%r10 mulq 0(%rcx) addq %rax,%rdi movq %r11,%rax adcq %rdx,%rdi mulq 8(%rcx) addq %rax,%r12 movq %r11,%rax adcq $0,%rdx addq %rdi,%r12 adcq $0,%rdx movq %rdx,%rbp mulq 16(%rcx) addq %rax,%r13 movq %r11,%rax adcq $0,%rdx addq %rbp,%r13 adcq $0,%rdx movq %rdx,%rbp mulq 24(%rcx) addq %rax,%r14 movq %r12,%rax adcq $0,%rdx addq %rbp,%r14 adcq $0,%rdx addq %rdx,%r15 adcq $0,%r9 adcq $0,%r10 imulq %r8,%rax movq 8(%rsp),%rsi movq %rax,%r11 mulq 0(%rcx) addq %rax,%r12 movq %r11,%rax adcq %rdx,%r12 mulq 8(%rcx) addq %rax,%r13 movq %r11,%rax adcq $0,%rdx addq %r12,%r13 adcq $0,%rdx movq %rdx,%rbp mulq 16(%rcx) addq %rax,%r14 movq %r11,%rax adcq $0,%rdx addq %rbp,%r14 adcq $0,%rdx movq %rdx,%rbp mulq 24(%rcx) movq %r14,%rbx addq %rbp,%r15 adcq $0,%rdx addq %rax,%r15 movq %r13,%rax adcq $0,%rdx addq %rdx,%r9 adcq $0,%r10 movq %r15,%r12 subq 0(%rcx),%r13 sbbq 8(%rcx),%r14 sbbq 16(%rcx),%r15 movq %r9,%rbp sbbq 24(%rcx),%r9 sbbq $0,%r10 cmovcq %rax,%r13 cmovcq %rbx,%r14 cmovcq %r12,%r15 movq %r13,0(%rsi) cmovcq %rbp,%r9 movq %r14,8(%rsi) movq %r15,16(%rsi) movq %r9,24(%rsi) #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .size __mulq_mont_sparse_256,.-__mulq_mont_sparse_256 .globl from_mont_256 .hidden from_mont_256 .type from_mont_256,@function .align 32 from_mont_256: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa #ifdef __BLST_PORTABLE__ testl $1,__blst_platform_cap(%rip) jnz 
from_mont_256$1 #endif pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 pushq %rbx .cfi_adjust_cfa_offset 8 .cfi_offset %rbx,-24 pushq %r12 .cfi_adjust_cfa_offset 8 .cfi_offset %r12,-32 pushq %r13 .cfi_adjust_cfa_offset 8 .cfi_offset %r13,-40 pushq %r14 .cfi_adjust_cfa_offset 8 .cfi_offset %r14,-48 pushq %r15 .cfi_adjust_cfa_offset 8 .cfi_offset %r15,-56 subq $8,%rsp .cfi_adjust_cfa_offset 8 movq %rdx,%rbx call __mulq_by_1_mont_256 movq %r14,%r10 movq %r15,%r11 movq %r9,%r12 subq 0(%rbx),%r13 sbbq 8(%rbx),%r14 sbbq 16(%rbx),%r15 sbbq 24(%rbx),%r9 cmovncq %r13,%rax cmovncq %r14,%r10 cmovncq %r15,%r11 movq %rax,0(%rdi) cmovncq %r9,%r12 movq %r10,8(%rdi) movq %r11,16(%rdi) movq %r12,24(%rdi) movq 8(%rsp),%r15 .cfi_restore %r15 movq 16(%rsp),%r14 .cfi_restore %r14 movq 24(%rsp),%r13 .cfi_restore %r13 movq 32(%rsp),%r12 .cfi_restore %r12 movq 40(%rsp),%rbx .cfi_restore %rbx movq 48(%rsp),%rbp .cfi_restore %rbp leaq 56(%rsp),%rsp .cfi_adjust_cfa_offset -56 #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .size from_mont_256,.-from_mont_256 .globl redc_mont_256 .hidden redc_mont_256 .type redc_mont_256,@function .align 32 redc_mont_256: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa #ifdef __BLST_PORTABLE__ testl $1,__blst_platform_cap(%rip) jnz redc_mont_256$1 #endif pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 pushq %rbx .cfi_adjust_cfa_offset 8 .cfi_offset %rbx,-24 pushq %r12 .cfi_adjust_cfa_offset 8 .cfi_offset %r12,-32 pushq %r13 .cfi_adjust_cfa_offset 8 .cfi_offset %r13,-40 pushq %r14 .cfi_adjust_cfa_offset 8 .cfi_offset %r14,-48 pushq %r15 .cfi_adjust_cfa_offset 8 .cfi_offset %r15,-56 subq $8,%rsp .cfi_adjust_cfa_offset 8 movq %rdx,%rbx call __mulq_by_1_mont_256 addq 32(%rsi),%r13 adcq 40(%rsi),%r14 movq %r13,%rax adcq 48(%rsi),%r15 movq %r14,%r10 adcq 56(%rsi),%r9 sbbq %rsi,%rsi movq %r15,%r11 subq 0(%rbx),%r13 sbbq 8(%rbx),%r14 sbbq 16(%rbx),%r15 movq %r9,%r12 sbbq 24(%rbx),%r9 sbbq $0,%rsi cmovncq %r13,%rax cmovncq %r14,%r10 cmovncq %r15,%r11 movq %rax,0(%rdi) cmovncq %r9,%r12 movq %r10,8(%rdi) movq %r11,16(%rdi) movq %r12,24(%rdi) movq 8(%rsp),%r15 .cfi_restore %r15 movq 16(%rsp),%r14 .cfi_restore %r14 movq 24(%rsp),%r13 .cfi_restore %r13 movq 32(%rsp),%r12 .cfi_restore %r12 movq 40(%rsp),%rbx .cfi_restore %rbx movq 48(%rsp),%rbp .cfi_restore %rbp leaq 56(%rsp),%rsp .cfi_adjust_cfa_offset -56 #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .size redc_mont_256,.-redc_mont_256 .type __mulq_by_1_mont_256,@function .align 32 __mulq_by_1_mont_256: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa movq 0(%rsi),%rax movq 8(%rsi),%r10 movq 16(%rsi),%r11 movq 24(%rsi),%r12 movq %rax,%r13 imulq %rcx,%rax movq %rax,%r9 mulq 0(%rbx) addq %rax,%r13 movq %r9,%rax adcq %rdx,%r13 mulq 8(%rbx) addq %rax,%r10 movq %r9,%rax adcq $0,%rdx addq %r13,%r10 adcq $0,%rdx movq %rdx,%r13 mulq 16(%rbx) movq %r10,%r14 imulq %rcx,%r10 addq %rax,%r11 movq %r9,%rax adcq $0,%rdx addq %r13,%r11 adcq $0,%rdx movq %rdx,%r13 mulq 24(%rbx) addq %rax,%r12 movq %r10,%rax adcq $0,%rdx addq %r13,%r12 adcq $0,%rdx movq %rdx,%r13 mulq 0(%rbx) addq %rax,%r14 movq %r10,%rax adcq %rdx,%r14 mulq 8(%rbx) addq %rax,%r11 movq %r10,%rax adcq $0,%rdx addq %r14,%r11 adcq $0,%rdx movq %rdx,%r14 mulq 16(%rbx) movq %r11,%r15 imulq %rcx,%r11 addq %rax,%r12 movq %r10,%rax adcq $0,%rdx addq %r14,%r12 adcq $0,%rdx movq %rdx,%r14 mulq 24(%rbx) addq %rax,%r13 movq %r11,%rax adcq $0,%rdx addq %r14,%r13 adcq $0,%rdx movq %rdx,%r14 mulq 
0(%rbx) addq %rax,%r15 movq %r11,%rax adcq %rdx,%r15 mulq 8(%rbx) addq %rax,%r12 movq %r11,%rax adcq $0,%rdx addq %r15,%r12 adcq $0,%rdx movq %rdx,%r15 mulq 16(%rbx) movq %r12,%r9 imulq %rcx,%r12 addq %rax,%r13 movq %r11,%rax adcq $0,%rdx addq %r15,%r13 adcq $0,%rdx movq %rdx,%r15 mulq 24(%rbx) addq %rax,%r14 movq %r12,%rax adcq $0,%rdx addq %r15,%r14 adcq $0,%rdx movq %rdx,%r15 mulq 0(%rbx) addq %rax,%r9 movq %r12,%rax adcq %rdx,%r9 mulq 8(%rbx) addq %rax,%r13 movq %r12,%rax adcq $0,%rdx addq %r9,%r13 adcq $0,%rdx movq %rdx,%r9 mulq 16(%rbx) addq %rax,%r14 movq %r12,%rax adcq $0,%rdx addq %r9,%r14 adcq $0,%rdx movq %rdx,%r9 mulq 24(%rbx) addq %rax,%r15 movq %r13,%rax adcq $0,%rdx addq %r9,%r15 adcq $0,%rdx movq %rdx,%r9 #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .size __mulq_by_1_mont_256,.-__mulq_by_1_mont_256 .section .note.GNU-stack,"",@progbits #ifndef __SGX_LVI_HARDENING__ .section .note.gnu.property,"a",@note .long 4,2f-1f,5 .byte 0x47,0x4E,0x55,0 1: .long 0xc0000002,4,3 .align 8 2: #endif ================================================ FILE: build/elf/mulq_mont_384-x86_64.s ================================================ .comm __blst_platform_cap,4 .text .type __subq_mod_384x384,@function .align 32 __subq_mod_384x384: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa movq 0(%rsi),%r8 movq 8(%rsi),%r9 movq 16(%rsi),%r10 movq 24(%rsi),%r11 movq 32(%rsi),%r12 movq 40(%rsi),%r13 movq 48(%rsi),%r14 subq 0(%rdx),%r8 movq 56(%rsi),%r15 sbbq 8(%rdx),%r9 movq 64(%rsi),%rax sbbq 16(%rdx),%r10 movq 72(%rsi),%rbx sbbq 24(%rdx),%r11 movq 80(%rsi),%rbp sbbq 32(%rdx),%r12 movq 88(%rsi),%rsi sbbq 40(%rdx),%r13 movq %r8,0(%rdi) sbbq 48(%rdx),%r14 movq 0(%rcx),%r8 movq %r9,8(%rdi) sbbq 56(%rdx),%r15 movq 8(%rcx),%r9 movq %r10,16(%rdi) sbbq 64(%rdx),%rax movq 16(%rcx),%r10 movq %r11,24(%rdi) sbbq 72(%rdx),%rbx movq 24(%rcx),%r11 movq %r12,32(%rdi) sbbq 80(%rdx),%rbp movq 32(%rcx),%r12 movq %r13,40(%rdi) sbbq 88(%rdx),%rsi movq 40(%rcx),%r13 sbbq %rdx,%rdx andq %rdx,%r8 andq %rdx,%r9 andq %rdx,%r10 andq %rdx,%r11 andq %rdx,%r12 andq %rdx,%r13 addq %r8,%r14 adcq %r9,%r15 movq %r14,48(%rdi) adcq %r10,%rax movq %r15,56(%rdi) adcq %r11,%rbx movq %rax,64(%rdi) adcq %r12,%rbp movq %rbx,72(%rdi) adcq %r13,%rsi movq %rbp,80(%rdi) movq %rsi,88(%rdi) #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .size __subq_mod_384x384,.-__subq_mod_384x384 .type __addq_mod_384,@function .align 32 __addq_mod_384: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa movq 0(%rsi),%r8 movq 8(%rsi),%r9 movq 16(%rsi),%r10 movq 24(%rsi),%r11 movq 32(%rsi),%r12 movq 40(%rsi),%r13 addq 0(%rdx),%r8 adcq 8(%rdx),%r9 adcq 16(%rdx),%r10 movq %r8,%r14 adcq 24(%rdx),%r11 movq %r9,%r15 adcq 32(%rdx),%r12 movq %r10,%rax adcq 40(%rdx),%r13 movq %r11,%rbx sbbq %rdx,%rdx subq 0(%rcx),%r8 sbbq 8(%rcx),%r9 movq %r12,%rbp sbbq 16(%rcx),%r10 sbbq 24(%rcx),%r11 sbbq 32(%rcx),%r12 movq %r13,%rsi sbbq 40(%rcx),%r13 sbbq $0,%rdx cmovcq %r14,%r8 cmovcq %r15,%r9 cmovcq %rax,%r10 movq %r8,0(%rdi) cmovcq %rbx,%r11 movq %r9,8(%rdi) cmovcq %rbp,%r12 movq %r10,16(%rdi) cmovcq %rsi,%r13 movq %r11,24(%rdi) movq %r12,32(%rdi) movq %r13,40(%rdi) #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .size __addq_mod_384,.-__addq_mod_384 .type __subq_mod_384,@function .align 32 __subq_mod_384: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa movq 0(%rsi),%r8 movq 8(%rsi),%r9 movq 16(%rsi),%r10 movq 24(%rsi),%r11 movq 32(%rsi),%r12 
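/*
 * Note on the reduction pattern used by __subq_mod_384x384 above and by
 * __subq_mod_384 here: the final borrow of the limb-wise subtraction is
 * broadcast into an all-ones mask (sbbq %rdx,%rdx), and the modulus,
 * masked by it, is added back (to the top limbs in the 768-bit case).
 * In C-like terms, with hypothetical 6-limb helpers for illustration:
 *
 *   borrow = sub_6(ret, a, b);      // ret = a - b, borrow is 0 or 1
 *   mask   = 0 - borrow;            // 0 or all-ones
 *   add_6(ret, ret, p & mask);      // ret += p only if a < b
 *
 * No data-dependent branch is taken, so the correction is constant-time.
 * The __subq_mod_384_a_is_loaded entry point just below serves callers,
 * such as sqr_382x, that already hold the first operand in registers.
 */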
movq 40(%rsi),%r13 __subq_mod_384_a_is_loaded: subq 0(%rdx),%r8 movq 0(%rcx),%r14 sbbq 8(%rdx),%r9 movq 8(%rcx),%r15 sbbq 16(%rdx),%r10 movq 16(%rcx),%rax sbbq 24(%rdx),%r11 movq 24(%rcx),%rbx sbbq 32(%rdx),%r12 movq 32(%rcx),%rbp sbbq 40(%rdx),%r13 movq 40(%rcx),%rsi sbbq %rdx,%rdx andq %rdx,%r14 andq %rdx,%r15 andq %rdx,%rax andq %rdx,%rbx andq %rdx,%rbp andq %rdx,%rsi addq %r14,%r8 adcq %r15,%r9 movq %r8,0(%rdi) adcq %rax,%r10 movq %r9,8(%rdi) adcq %rbx,%r11 movq %r10,16(%rdi) adcq %rbp,%r12 movq %r11,24(%rdi) adcq %rsi,%r13 movq %r12,32(%rdi) movq %r13,40(%rdi) #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .size __subq_mod_384,.-__subq_mod_384 .globl mul_mont_384x .hidden mul_mont_384x .type mul_mont_384x,@function .align 32 mul_mont_384x: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa #ifdef __BLST_PORTABLE__ testl $1,__blst_platform_cap(%rip) jnz mul_mont_384x$1 #endif pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 pushq %rbx .cfi_adjust_cfa_offset 8 .cfi_offset %rbx,-24 pushq %r12 .cfi_adjust_cfa_offset 8 .cfi_offset %r12,-32 pushq %r13 .cfi_adjust_cfa_offset 8 .cfi_offset %r13,-40 pushq %r14 .cfi_adjust_cfa_offset 8 .cfi_offset %r14,-48 pushq %r15 .cfi_adjust_cfa_offset 8 .cfi_offset %r15,-56 subq $328,%rsp .cfi_adjust_cfa_offset 328 movq %rdx,%rbx movq %rdi,32(%rsp) movq %rsi,24(%rsp) movq %rdx,16(%rsp) movq %rcx,8(%rsp) movq %r8,0(%rsp) leaq 40(%rsp),%rdi call __mulq_384 leaq 48(%rbx),%rbx leaq 48(%rsi),%rsi leaq 40+96(%rsp),%rdi call __mulq_384 movq 8(%rsp),%rcx leaq -48(%rsi),%rdx leaq 40+192+48(%rsp),%rdi call __addq_mod_384 movq 16(%rsp),%rsi leaq 48(%rsi),%rdx leaq -48(%rdi),%rdi call __addq_mod_384 leaq (%rdi),%rbx leaq 48(%rdi),%rsi call __mulq_384 leaq (%rdi),%rsi leaq 40(%rsp),%rdx movq 8(%rsp),%rcx call __subq_mod_384x384 leaq (%rdi),%rsi leaq -96(%rdi),%rdx call __subq_mod_384x384 leaq 40(%rsp),%rsi leaq 40+96(%rsp),%rdx leaq 40(%rsp),%rdi call __subq_mod_384x384 movq %rcx,%rbx leaq 40(%rsp),%rsi movq 0(%rsp),%rcx movq 32(%rsp),%rdi call __mulq_by_1_mont_384 call __redq_tail_mont_384 leaq 40+192(%rsp),%rsi movq 0(%rsp),%rcx leaq 48(%rdi),%rdi call __mulq_by_1_mont_384 call __redq_tail_mont_384 leaq 328(%rsp),%r8 movq 0(%r8),%r15 .cfi_restore %r15 movq 8(%r8),%r14 .cfi_restore %r14 movq 16(%r8),%r13 .cfi_restore %r13 movq 24(%r8),%r12 .cfi_restore %r12 movq 32(%r8),%rbx .cfi_restore %rbx movq 40(%r8),%rbp .cfi_restore %rbp leaq 48(%r8),%rsp .cfi_adjust_cfa_offset -328-8*6 #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .size mul_mont_384x,.-mul_mont_384x .globl sqr_mont_384x .hidden sqr_mont_384x .type sqr_mont_384x,@function .align 32 sqr_mont_384x: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa #ifdef __BLST_PORTABLE__ testl $1,__blst_platform_cap(%rip) jnz sqr_mont_384x$1 #endif pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 pushq %rbx .cfi_adjust_cfa_offset 8 .cfi_offset %rbx,-24 pushq %r12 .cfi_adjust_cfa_offset 8 .cfi_offset %r12,-32 pushq %r13 .cfi_adjust_cfa_offset 8 .cfi_offset %r13,-40 pushq %r14 .cfi_adjust_cfa_offset 8 .cfi_offset %r14,-48 pushq %r15 .cfi_adjust_cfa_offset 8 .cfi_offset %r15,-56 subq $136,%rsp .cfi_adjust_cfa_offset 136 movq %rcx,0(%rsp) movq %rdx,%rcx movq %rdi,8(%rsp) movq %rsi,16(%rsp) leaq 48(%rsi),%rdx leaq 32(%rsp),%rdi call __addq_mod_384 movq 16(%rsp),%rsi leaq 48(%rsi),%rdx leaq 32+48(%rsp),%rdi call __subq_mod_384 movq 16(%rsp),%rsi leaq 48(%rsi),%rbx movq 48(%rsi),%rax movq 0(%rsi),%r14 movq 8(%rsi),%r15 movq 
16(%rsi),%r12 movq 24(%rsi),%r13 call __mulq_mont_384 addq %r14,%r14 adcq %r15,%r15 adcq %r8,%r8 movq %r14,%r12 adcq %r9,%r9 movq %r15,%r13 adcq %r10,%r10 movq %r8,%rax adcq %r11,%r11 movq %r9,%rbx sbbq %rdx,%rdx subq 0(%rcx),%r14 sbbq 8(%rcx),%r15 movq %r10,%rbp sbbq 16(%rcx),%r8 sbbq 24(%rcx),%r9 sbbq 32(%rcx),%r10 movq %r11,%rsi sbbq 40(%rcx),%r11 sbbq $0,%rdx cmovcq %r12,%r14 cmovcq %r13,%r15 cmovcq %rax,%r8 movq %r14,48(%rdi) cmovcq %rbx,%r9 movq %r15,56(%rdi) cmovcq %rbp,%r10 movq %r8,64(%rdi) cmovcq %rsi,%r11 movq %r9,72(%rdi) movq %r10,80(%rdi) movq %r11,88(%rdi) leaq 32(%rsp),%rsi leaq 32+48(%rsp),%rbx movq 32+48(%rsp),%rax movq 32+0(%rsp),%r14 movq 32+8(%rsp),%r15 movq 32+16(%rsp),%r12 movq 32+24(%rsp),%r13 call __mulq_mont_384 leaq 136(%rsp),%r8 movq 0(%r8),%r15 .cfi_restore %r15 movq 8(%r8),%r14 .cfi_restore %r14 movq 16(%r8),%r13 .cfi_restore %r13 movq 24(%r8),%r12 .cfi_restore %r12 movq 32(%r8),%rbx .cfi_restore %rbx movq 40(%r8),%rbp .cfi_restore %rbp leaq 48(%r8),%rsp .cfi_adjust_cfa_offset -136-8*6 #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .size sqr_mont_384x,.-sqr_mont_384x .globl mul_382x .hidden mul_382x .type mul_382x,@function .align 32 mul_382x: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa #ifdef __BLST_PORTABLE__ testl $1,__blst_platform_cap(%rip) jnz mul_382x$1 #endif pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 pushq %rbx .cfi_adjust_cfa_offset 8 .cfi_offset %rbx,-24 pushq %r12 .cfi_adjust_cfa_offset 8 .cfi_offset %r12,-32 pushq %r13 .cfi_adjust_cfa_offset 8 .cfi_offset %r13,-40 pushq %r14 .cfi_adjust_cfa_offset 8 .cfi_offset %r14,-48 pushq %r15 .cfi_adjust_cfa_offset 8 .cfi_offset %r15,-56 subq $136,%rsp .cfi_adjust_cfa_offset 136 leaq 96(%rdi),%rdi movq %rsi,0(%rsp) movq %rdx,8(%rsp) movq %rdi,16(%rsp) movq %rcx,24(%rsp) movq 0(%rsi),%r8 movq 8(%rsi),%r9 movq 16(%rsi),%r10 movq 24(%rsi),%r11 movq 32(%rsi),%r12 movq 40(%rsi),%r13 addq 48(%rsi),%r8 adcq 56(%rsi),%r9 adcq 64(%rsi),%r10 adcq 72(%rsi),%r11 adcq 80(%rsi),%r12 adcq 88(%rsi),%r13 movq %r8,32+0(%rsp) movq %r9,32+8(%rsp) movq %r10,32+16(%rsp) movq %r11,32+24(%rsp) movq %r12,32+32(%rsp) movq %r13,32+40(%rsp) movq 0(%rdx),%r8 movq 8(%rdx),%r9 movq 16(%rdx),%r10 movq 24(%rdx),%r11 movq 32(%rdx),%r12 movq 40(%rdx),%r13 addq 48(%rdx),%r8 adcq 56(%rdx),%r9 adcq 64(%rdx),%r10 adcq 72(%rdx),%r11 adcq 80(%rdx),%r12 adcq 88(%rdx),%r13 movq %r8,32+48(%rsp) movq %r9,32+56(%rsp) movq %r10,32+64(%rsp) movq %r11,32+72(%rsp) movq %r12,32+80(%rsp) movq %r13,32+88(%rsp) leaq 32+0(%rsp),%rsi leaq 32+48(%rsp),%rbx call __mulq_384 movq 0(%rsp),%rsi movq 8(%rsp),%rbx leaq -96(%rdi),%rdi call __mulq_384 leaq 48(%rsi),%rsi leaq 48(%rbx),%rbx leaq 32(%rsp),%rdi call __mulq_384 movq 16(%rsp),%rsi leaq 32(%rsp),%rdx movq 24(%rsp),%rcx movq %rsi,%rdi call __subq_mod_384x384 leaq 0(%rdi),%rsi leaq -96(%rdi),%rdx call __subq_mod_384x384 leaq -96(%rdi),%rsi leaq 32(%rsp),%rdx leaq -96(%rdi),%rdi call __subq_mod_384x384 leaq 136(%rsp),%r8 movq 0(%r8),%r15 .cfi_restore %r15 movq 8(%r8),%r14 .cfi_restore %r14 movq 16(%r8),%r13 .cfi_restore %r13 movq 24(%r8),%r12 .cfi_restore %r12 movq 32(%r8),%rbx .cfi_restore %rbx movq 40(%r8),%rbp .cfi_restore %rbp leaq 48(%r8),%rsp .cfi_adjust_cfa_offset -136-8*6 #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .size mul_382x,.-mul_382x .globl sqr_382x .hidden sqr_382x .type sqr_382x,@function .align 32 sqr_382x: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa #ifdef __BLST_PORTABLE__ 
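/*
 * In portable builds this entry, like the others in the file, probes
 * __blst_platform_cap and branches to the sqr_382x$1 counterpart (the
 * mulx-based code path in mulx_mont_384-x86_64.s) when bit 0 is set;
 * presumably that bit advertises ADX support.
 *
 * The routine itself squares an Fp2 element a = a0 + a1*i (i^2 = -1)
 * into double-width, 768-bit halves, leaving the final reduction to a
 * later step. A complex squaring thus costs two 384-bit multiplications:
 *
 *   ret->re = (a0 + a1) * (a0 - a1)   // = a0^2 - a1^2
 *   ret->im = 2 * (a0 * a1)
 *
 * This matches the t0/t1 trick annotated in the armv8 sqr_mont_382x
 * earlier in this dump.
 */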
testl $1,__blst_platform_cap(%rip) jnz sqr_382x$1 #endif pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 pushq %rbx .cfi_adjust_cfa_offset 8 .cfi_offset %rbx,-24 pushq %r12 .cfi_adjust_cfa_offset 8 .cfi_offset %r12,-32 pushq %r13 .cfi_adjust_cfa_offset 8 .cfi_offset %r13,-40 pushq %r14 .cfi_adjust_cfa_offset 8 .cfi_offset %r14,-48 pushq %r15 .cfi_adjust_cfa_offset 8 .cfi_offset %r15,-56 pushq %rsi .cfi_adjust_cfa_offset 8 movq %rdx,%rcx movq 0(%rsi),%r14 movq 8(%rsi),%r15 movq 16(%rsi),%rax movq 24(%rsi),%rbx movq 32(%rsi),%rbp movq 40(%rsi),%rdx movq %r14,%r8 addq 48(%rsi),%r14 movq %r15,%r9 adcq 56(%rsi),%r15 movq %rax,%r10 adcq 64(%rsi),%rax movq %rbx,%r11 adcq 72(%rsi),%rbx movq %rbp,%r12 adcq 80(%rsi),%rbp movq %rdx,%r13 adcq 88(%rsi),%rdx movq %r14,0(%rdi) movq %r15,8(%rdi) movq %rax,16(%rdi) movq %rbx,24(%rdi) movq %rbp,32(%rdi) movq %rdx,40(%rdi) leaq 48(%rsi),%rdx leaq 48(%rdi),%rdi call __subq_mod_384_a_is_loaded leaq (%rdi),%rsi leaq -48(%rdi),%rbx leaq -48(%rdi),%rdi call __mulq_384 movq (%rsp),%rsi leaq 48(%rsi),%rbx leaq 96(%rdi),%rdi call __mulq_384 movq 0(%rdi),%r8 movq 8(%rdi),%r9 movq 16(%rdi),%r10 movq 24(%rdi),%r11 movq 32(%rdi),%r12 movq 40(%rdi),%r13 movq 48(%rdi),%r14 movq 56(%rdi),%r15 movq 64(%rdi),%rax movq 72(%rdi),%rbx movq 80(%rdi),%rbp addq %r8,%r8 movq 88(%rdi),%rdx adcq %r9,%r9 movq %r8,0(%rdi) adcq %r10,%r10 movq %r9,8(%rdi) adcq %r11,%r11 movq %r10,16(%rdi) adcq %r12,%r12 movq %r11,24(%rdi) adcq %r13,%r13 movq %r12,32(%rdi) adcq %r14,%r14 movq %r13,40(%rdi) adcq %r15,%r15 movq %r14,48(%rdi) adcq %rax,%rax movq %r15,56(%rdi) adcq %rbx,%rbx movq %rax,64(%rdi) adcq %rbp,%rbp movq %rbx,72(%rdi) adcq %rdx,%rdx movq %rbp,80(%rdi) movq %rdx,88(%rdi) movq 8(%rsp),%r15 .cfi_restore %r15 movq 16(%rsp),%r14 .cfi_restore %r14 movq 24(%rsp),%r13 .cfi_restore %r13 movq 32(%rsp),%r12 .cfi_restore %r12 movq 40(%rsp),%rbx .cfi_restore %rbx movq 48(%rsp),%rbp .cfi_restore %rbp leaq 56(%rsp),%rsp .cfi_adjust_cfa_offset -8*7 #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .size sqr_382x,.-sqr_382x .globl mul_384 .hidden mul_384 .type mul_384,@function .align 32 mul_384: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa #ifdef __BLST_PORTABLE__ testl $1,__blst_platform_cap(%rip) jnz mul_384$1 #endif pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 pushq %rbx .cfi_adjust_cfa_offset 8 .cfi_offset %rbx,-24 pushq %r12 .cfi_adjust_cfa_offset 8 .cfi_offset %r12,-32 movq %rdx,%rbx call __mulq_384 movq 0(%rsp),%r12 .cfi_restore %r12 movq 8(%rsp),%rbx .cfi_restore %rbx movq 16(%rsp),%rbp .cfi_restore %rbp leaq 24(%rsp),%rsp .cfi_adjust_cfa_offset -24 #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .size mul_384,.-mul_384 .type __mulq_384,@function .align 32 __mulq_384: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa movq 0(%rbx),%rax movq %rax,%rbp mulq 0(%rsi) movq %rax,0(%rdi) movq %rbp,%rax movq %rdx,%rcx mulq 8(%rsi) addq %rax,%rcx movq %rbp,%rax adcq $0,%rdx movq %rdx,%r8 mulq 16(%rsi) addq %rax,%r8 movq %rbp,%rax adcq $0,%rdx movq %rdx,%r9 mulq 24(%rsi) addq %rax,%r9 movq %rbp,%rax adcq $0,%rdx movq %rdx,%r10 mulq 32(%rsi) addq %rax,%r10 movq %rbp,%rax adcq $0,%rdx movq %rdx,%r11 mulq 40(%rsi) addq %rax,%r11 movq 8(%rbx),%rax adcq $0,%rdx movq %rdx,%r12 movq %rax,%rbp mulq 0(%rsi) addq %rax,%rcx movq %rbp,%rax adcq $0,%rdx movq %rcx,8(%rdi) movq %rdx,%rcx mulq 8(%rsi) addq %rax,%r8 movq %rbp,%rax adcq $0,%rdx addq %r8,%rcx adcq $0,%rdx movq %rdx,%r8 mulq 16(%rsi) addq 
%rax,%r9 movq %rbp,%rax adcq $0,%rdx addq %r9,%r8 adcq $0,%rdx movq %rdx,%r9 mulq 24(%rsi) addq %rax,%r10 movq %rbp,%rax adcq $0,%rdx addq %r10,%r9 adcq $0,%rdx movq %rdx,%r10 mulq 32(%rsi) addq %rax,%r11 movq %rbp,%rax adcq $0,%rdx addq %r11,%r10 adcq $0,%rdx movq %rdx,%r11 mulq 40(%rsi) addq %rax,%r12 movq 16(%rbx),%rax adcq $0,%rdx addq %r12,%r11 adcq $0,%rdx movq %rdx,%r12 movq %rax,%rbp mulq 0(%rsi) addq %rax,%rcx movq %rbp,%rax adcq $0,%rdx movq %rcx,16(%rdi) movq %rdx,%rcx mulq 8(%rsi) addq %rax,%r8 movq %rbp,%rax adcq $0,%rdx addq %r8,%rcx adcq $0,%rdx movq %rdx,%r8 mulq 16(%rsi) addq %rax,%r9 movq %rbp,%rax adcq $0,%rdx addq %r9,%r8 adcq $0,%rdx movq %rdx,%r9 mulq 24(%rsi) addq %rax,%r10 movq %rbp,%rax adcq $0,%rdx addq %r10,%r9 adcq $0,%rdx movq %rdx,%r10 mulq 32(%rsi) addq %rax,%r11 movq %rbp,%rax adcq $0,%rdx addq %r11,%r10 adcq $0,%rdx movq %rdx,%r11 mulq 40(%rsi) addq %rax,%r12 movq 24(%rbx),%rax adcq $0,%rdx addq %r12,%r11 adcq $0,%rdx movq %rdx,%r12 movq %rax,%rbp mulq 0(%rsi) addq %rax,%rcx movq %rbp,%rax adcq $0,%rdx movq %rcx,24(%rdi) movq %rdx,%rcx mulq 8(%rsi) addq %rax,%r8 movq %rbp,%rax adcq $0,%rdx addq %r8,%rcx adcq $0,%rdx movq %rdx,%r8 mulq 16(%rsi) addq %rax,%r9 movq %rbp,%rax adcq $0,%rdx addq %r9,%r8 adcq $0,%rdx movq %rdx,%r9 mulq 24(%rsi) addq %rax,%r10 movq %rbp,%rax adcq $0,%rdx addq %r10,%r9 adcq $0,%rdx movq %rdx,%r10 mulq 32(%rsi) addq %rax,%r11 movq %rbp,%rax adcq $0,%rdx addq %r11,%r10 adcq $0,%rdx movq %rdx,%r11 mulq 40(%rsi) addq %rax,%r12 movq 32(%rbx),%rax adcq $0,%rdx addq %r12,%r11 adcq $0,%rdx movq %rdx,%r12 movq %rax,%rbp mulq 0(%rsi) addq %rax,%rcx movq %rbp,%rax adcq $0,%rdx movq %rcx,32(%rdi) movq %rdx,%rcx mulq 8(%rsi) addq %rax,%r8 movq %rbp,%rax adcq $0,%rdx addq %r8,%rcx adcq $0,%rdx movq %rdx,%r8 mulq 16(%rsi) addq %rax,%r9 movq %rbp,%rax adcq $0,%rdx addq %r9,%r8 adcq $0,%rdx movq %rdx,%r9 mulq 24(%rsi) addq %rax,%r10 movq %rbp,%rax adcq $0,%rdx addq %r10,%r9 adcq $0,%rdx movq %rdx,%r10 mulq 32(%rsi) addq %rax,%r11 movq %rbp,%rax adcq $0,%rdx addq %r11,%r10 adcq $0,%rdx movq %rdx,%r11 mulq 40(%rsi) addq %rax,%r12 movq 40(%rbx),%rax adcq $0,%rdx addq %r12,%r11 adcq $0,%rdx movq %rdx,%r12 movq %rax,%rbp mulq 0(%rsi) addq %rax,%rcx movq %rbp,%rax adcq $0,%rdx movq %rcx,40(%rdi) movq %rdx,%rcx mulq 8(%rsi) addq %rax,%r8 movq %rbp,%rax adcq $0,%rdx addq %r8,%rcx adcq $0,%rdx movq %rdx,%r8 mulq 16(%rsi) addq %rax,%r9 movq %rbp,%rax adcq $0,%rdx addq %r9,%r8 adcq $0,%rdx movq %rdx,%r9 mulq 24(%rsi) addq %rax,%r10 movq %rbp,%rax adcq $0,%rdx addq %r10,%r9 adcq $0,%rdx movq %rdx,%r10 mulq 32(%rsi) addq %rax,%r11 movq %rbp,%rax adcq $0,%rdx addq %r11,%r10 adcq $0,%rdx movq %rdx,%r11 mulq 40(%rsi) addq %rax,%r12 movq %rax,%rax adcq $0,%rdx addq %r12,%r11 adcq $0,%rdx movq %rdx,%r12 movq %rcx,48(%rdi) movq %r8,56(%rdi) movq %r9,64(%rdi) movq %r10,72(%rdi) movq %r11,80(%rdi) movq %r12,88(%rdi) #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .size __mulq_384,.-__mulq_384 .globl sqr_384 .hidden sqr_384 .type sqr_384,@function .align 32 sqr_384: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa #ifdef __BLST_PORTABLE__ testl $1,__blst_platform_cap(%rip) jnz sqr_384$1 #endif pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 pushq %rbx .cfi_adjust_cfa_offset 8 .cfi_offset %rbx,-24 pushq %r12 .cfi_adjust_cfa_offset 8 .cfi_offset %r12,-32 pushq %r13 .cfi_adjust_cfa_offset 8 .cfi_offset %r13,-40 pushq %r14 .cfi_adjust_cfa_offset 8 .cfi_offset %r14,-48 pushq %r15 .cfi_adjust_cfa_offset 8 .cfi_offset 
%r15,-56 subq $8,%rsp .cfi_adjust_cfa_offset 8 call __sqrq_384 movq 8(%rsp),%r15 .cfi_restore %r15 movq 16(%rsp),%r14 .cfi_restore %r14 movq 24(%rsp),%r13 .cfi_restore %r13 movq 32(%rsp),%r12 .cfi_restore %r12 movq 40(%rsp),%rbx .cfi_restore %rbx movq 48(%rsp),%rbp .cfi_restore %rbp leaq 56(%rsp),%rsp .cfi_adjust_cfa_offset -56 #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .size sqr_384,.-sqr_384 .type __sqrq_384,@function .align 32 __sqrq_384: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa movq 0(%rsi),%rax movq 8(%rsi),%r15 movq 16(%rsi),%rcx movq 24(%rsi),%rbx movq %rax,%r14 mulq %r15 movq %rax,%r9 movq %r14,%rax movq 32(%rsi),%rbp movq %rdx,%r10 mulq %rcx addq %rax,%r10 movq %r14,%rax adcq $0,%rdx movq 40(%rsi),%rsi movq %rdx,%r11 mulq %rbx addq %rax,%r11 movq %r14,%rax adcq $0,%rdx movq %rdx,%r12 mulq %rbp addq %rax,%r12 movq %r14,%rax adcq $0,%rdx movq %rdx,%r13 mulq %rsi addq %rax,%r13 movq %r14,%rax adcq $0,%rdx movq %rdx,%r14 mulq %rax xorq %r8,%r8 movq %rax,0(%rdi) movq %r15,%rax addq %r9,%r9 adcq $0,%r8 addq %rdx,%r9 adcq $0,%r8 movq %r9,8(%rdi) mulq %rcx addq %rax,%r11 movq %r15,%rax adcq $0,%rdx movq %rdx,%r9 mulq %rbx addq %rax,%r12 movq %r15,%rax adcq $0,%rdx addq %r9,%r12 adcq $0,%rdx movq %rdx,%r9 mulq %rbp addq %rax,%r13 movq %r15,%rax adcq $0,%rdx addq %r9,%r13 adcq $0,%rdx movq %rdx,%r9 mulq %rsi addq %rax,%r14 movq %r15,%rax adcq $0,%rdx addq %r9,%r14 adcq $0,%rdx movq %rdx,%r15 mulq %rax xorq %r9,%r9 addq %rax,%r8 movq %rcx,%rax addq %r10,%r10 adcq %r11,%r11 adcq $0,%r9 addq %r8,%r10 adcq %rdx,%r11 adcq $0,%r9 movq %r10,16(%rdi) mulq %rbx addq %rax,%r13 movq %rcx,%rax adcq $0,%rdx movq %r11,24(%rdi) movq %rdx,%r8 mulq %rbp addq %rax,%r14 movq %rcx,%rax adcq $0,%rdx addq %r8,%r14 adcq $0,%rdx movq %rdx,%r8 mulq %rsi addq %rax,%r15 movq %rcx,%rax adcq $0,%rdx addq %r8,%r15 adcq $0,%rdx movq %rdx,%rcx mulq %rax xorq %r11,%r11 addq %rax,%r9 movq %rbx,%rax addq %r12,%r12 adcq %r13,%r13 adcq $0,%r11 addq %r9,%r12 adcq %rdx,%r13 adcq $0,%r11 movq %r12,32(%rdi) mulq %rbp addq %rax,%r15 movq %rbx,%rax adcq $0,%rdx movq %r13,40(%rdi) movq %rdx,%r8 mulq %rsi addq %rax,%rcx movq %rbx,%rax adcq $0,%rdx addq %r8,%rcx adcq $0,%rdx movq %rdx,%rbx mulq %rax xorq %r12,%r12 addq %rax,%r11 movq %rbp,%rax addq %r14,%r14 adcq %r15,%r15 adcq $0,%r12 addq %r11,%r14 adcq %rdx,%r15 movq %r14,48(%rdi) adcq $0,%r12 movq %r15,56(%rdi) mulq %rsi addq %rax,%rbx movq %rbp,%rax adcq $0,%rdx movq %rdx,%rbp mulq %rax xorq %r13,%r13 addq %rax,%r12 movq %rsi,%rax addq %rcx,%rcx adcq %rbx,%rbx adcq $0,%r13 addq %r12,%rcx adcq %rdx,%rbx movq %rcx,64(%rdi) adcq $0,%r13 movq %rbx,72(%rdi) mulq %rax addq %r13,%rax addq %rbp,%rbp adcq $0,%rdx addq %rbp,%rax adcq $0,%rdx movq %rax,80(%rdi) movq %rdx,88(%rdi) #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .size __sqrq_384,.-__sqrq_384 .globl sqr_mont_384 .hidden sqr_mont_384 .type sqr_mont_384,@function .align 32 sqr_mont_384: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa #ifdef __BLST_PORTABLE__ testl $1,__blst_platform_cap(%rip) jnz sqr_mont_384$1 #endif pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 pushq %rbx .cfi_adjust_cfa_offset 8 .cfi_offset %rbx,-24 pushq %r12 .cfi_adjust_cfa_offset 8 .cfi_offset %r12,-32 pushq %r13 .cfi_adjust_cfa_offset 8 .cfi_offset %r13,-40 pushq %r14 .cfi_adjust_cfa_offset 8 .cfi_offset %r14,-48 pushq %r15 .cfi_adjust_cfa_offset 8 .cfi_offset %r15,-56 subq $120,%rsp .cfi_adjust_cfa_offset 8*15 movq %rcx,96(%rsp) movq 
%rdx,104(%rsp) movq %rdi,112(%rsp) movq %rsp,%rdi call __sqrq_384 leaq 0(%rsp),%rsi movq 96(%rsp),%rcx movq 104(%rsp),%rbx movq 112(%rsp),%rdi call __mulq_by_1_mont_384 call __redq_tail_mont_384 leaq 120(%rsp),%r8 movq 120(%rsp),%r15 .cfi_restore %r15 movq 8(%r8),%r14 .cfi_restore %r14 movq 16(%r8),%r13 .cfi_restore %r13 movq 24(%r8),%r12 .cfi_restore %r12 movq 32(%r8),%rbx .cfi_restore %rbx movq 40(%r8),%rbp .cfi_restore %rbp leaq 48(%r8),%rsp .cfi_adjust_cfa_offset -8*21 #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .size sqr_mont_384,.-sqr_mont_384 .globl redc_mont_384 .hidden redc_mont_384 .type redc_mont_384,@function .align 32 redc_mont_384: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa #ifdef __BLST_PORTABLE__ testl $1,__blst_platform_cap(%rip) jnz redc_mont_384$1 #endif pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 pushq %rbx .cfi_adjust_cfa_offset 8 .cfi_offset %rbx,-24 pushq %r12 .cfi_adjust_cfa_offset 8 .cfi_offset %r12,-32 pushq %r13 .cfi_adjust_cfa_offset 8 .cfi_offset %r13,-40 pushq %r14 .cfi_adjust_cfa_offset 8 .cfi_offset %r14,-48 pushq %r15 .cfi_adjust_cfa_offset 8 .cfi_offset %r15,-56 subq $8,%rsp .cfi_adjust_cfa_offset 8 movq %rdx,%rbx call __mulq_by_1_mont_384 call __redq_tail_mont_384 movq 8(%rsp),%r15 .cfi_restore %r15 movq 16(%rsp),%r14 .cfi_restore %r14 movq 24(%rsp),%r13 .cfi_restore %r13 movq 32(%rsp),%r12 .cfi_restore %r12 movq 40(%rsp),%rbx .cfi_restore %rbx movq 48(%rsp),%rbp .cfi_restore %rbp leaq 56(%rsp),%rsp .cfi_adjust_cfa_offset -56 #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .size redc_mont_384,.-redc_mont_384 .globl from_mont_384 .hidden from_mont_384 .type from_mont_384,@function .align 32 from_mont_384: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa #ifdef __BLST_PORTABLE__ testl $1,__blst_platform_cap(%rip) jnz from_mont_384$1 #endif pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 pushq %rbx .cfi_adjust_cfa_offset 8 .cfi_offset %rbx,-24 pushq %r12 .cfi_adjust_cfa_offset 8 .cfi_offset %r12,-32 pushq %r13 .cfi_adjust_cfa_offset 8 .cfi_offset %r13,-40 pushq %r14 .cfi_adjust_cfa_offset 8 .cfi_offset %r14,-48 pushq %r15 .cfi_adjust_cfa_offset 8 .cfi_offset %r15,-56 subq $8,%rsp .cfi_adjust_cfa_offset 8 movq %rdx,%rbx call __mulq_by_1_mont_384 movq %r15,%rcx movq %r8,%rdx movq %r9,%rbp subq 0(%rbx),%r14 sbbq 8(%rbx),%r15 movq %r10,%r13 sbbq 16(%rbx),%r8 sbbq 24(%rbx),%r9 sbbq 32(%rbx),%r10 movq %r11,%rsi sbbq 40(%rbx),%r11 cmovcq %rax,%r14 cmovcq %rcx,%r15 cmovcq %rdx,%r8 movq %r14,0(%rdi) cmovcq %rbp,%r9 movq %r15,8(%rdi) cmovcq %r13,%r10 movq %r8,16(%rdi) cmovcq %rsi,%r11 movq %r9,24(%rdi) movq %r10,32(%rdi) movq %r11,40(%rdi) movq 8(%rsp),%r15 .cfi_restore %r15 movq 16(%rsp),%r14 .cfi_restore %r14 movq 24(%rsp),%r13 .cfi_restore %r13 movq 32(%rsp),%r12 .cfi_restore %r12 movq 40(%rsp),%rbx .cfi_restore %rbx movq 48(%rsp),%rbp .cfi_restore %rbp leaq 56(%rsp),%rsp .cfi_adjust_cfa_offset -56 #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .size from_mont_384,.-from_mont_384 .type __mulq_by_1_mont_384,@function .align 32 __mulq_by_1_mont_384: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa movq 0(%rsi),%rax movq 8(%rsi),%r9 movq 16(%rsi),%r10 movq 24(%rsi),%r11 movq 32(%rsi),%r12 movq 40(%rsi),%r13 movq %rax,%r14 imulq %rcx,%rax movq %rax,%r8 mulq 0(%rbx) addq %rax,%r14 movq %r8,%rax adcq %rdx,%r14 mulq 8(%rbx) addq %rax,%r9 movq %r8,%rax adcq $0,%rdx addq %r14,%r9 adcq $0,%rdx movq 
%rdx,%r14 mulq 16(%rbx) addq %rax,%r10 movq %r8,%rax adcq $0,%rdx addq %r14,%r10 adcq $0,%rdx movq %rdx,%r14 mulq 24(%rbx) addq %rax,%r11 movq %r8,%rax adcq $0,%rdx movq %r9,%r15 imulq %rcx,%r9 addq %r14,%r11 adcq $0,%rdx movq %rdx,%r14 mulq 32(%rbx) addq %rax,%r12 movq %r8,%rax adcq $0,%rdx addq %r14,%r12 adcq $0,%rdx movq %rdx,%r14 mulq 40(%rbx) addq %rax,%r13 movq %r9,%rax adcq $0,%rdx addq %r14,%r13 adcq $0,%rdx movq %rdx,%r14 mulq 0(%rbx) addq %rax,%r15 movq %r9,%rax adcq %rdx,%r15 mulq 8(%rbx) addq %rax,%r10 movq %r9,%rax adcq $0,%rdx addq %r15,%r10 adcq $0,%rdx movq %rdx,%r15 mulq 16(%rbx) addq %rax,%r11 movq %r9,%rax adcq $0,%rdx addq %r15,%r11 adcq $0,%rdx movq %rdx,%r15 mulq 24(%rbx) addq %rax,%r12 movq %r9,%rax adcq $0,%rdx movq %r10,%r8 imulq %rcx,%r10 addq %r15,%r12 adcq $0,%rdx movq %rdx,%r15 mulq 32(%rbx) addq %rax,%r13 movq %r9,%rax adcq $0,%rdx addq %r15,%r13 adcq $0,%rdx movq %rdx,%r15 mulq 40(%rbx) addq %rax,%r14 movq %r10,%rax adcq $0,%rdx addq %r15,%r14 adcq $0,%rdx movq %rdx,%r15 mulq 0(%rbx) addq %rax,%r8 movq %r10,%rax adcq %rdx,%r8 mulq 8(%rbx) addq %rax,%r11 movq %r10,%rax adcq $0,%rdx addq %r8,%r11 adcq $0,%rdx movq %rdx,%r8 mulq 16(%rbx) addq %rax,%r12 movq %r10,%rax adcq $0,%rdx addq %r8,%r12 adcq $0,%rdx movq %rdx,%r8 mulq 24(%rbx) addq %rax,%r13 movq %r10,%rax adcq $0,%rdx movq %r11,%r9 imulq %rcx,%r11 addq %r8,%r13 adcq $0,%rdx movq %rdx,%r8 mulq 32(%rbx) addq %rax,%r14 movq %r10,%rax adcq $0,%rdx addq %r8,%r14 adcq $0,%rdx movq %rdx,%r8 mulq 40(%rbx) addq %rax,%r15 movq %r11,%rax adcq $0,%rdx addq %r8,%r15 adcq $0,%rdx movq %rdx,%r8 mulq 0(%rbx) addq %rax,%r9 movq %r11,%rax adcq %rdx,%r9 mulq 8(%rbx) addq %rax,%r12 movq %r11,%rax adcq $0,%rdx addq %r9,%r12 adcq $0,%rdx movq %rdx,%r9 mulq 16(%rbx) addq %rax,%r13 movq %r11,%rax adcq $0,%rdx addq %r9,%r13 adcq $0,%rdx movq %rdx,%r9 mulq 24(%rbx) addq %rax,%r14 movq %r11,%rax adcq $0,%rdx movq %r12,%r10 imulq %rcx,%r12 addq %r9,%r14 adcq $0,%rdx movq %rdx,%r9 mulq 32(%rbx) addq %rax,%r15 movq %r11,%rax adcq $0,%rdx addq %r9,%r15 adcq $0,%rdx movq %rdx,%r9 mulq 40(%rbx) addq %rax,%r8 movq %r12,%rax adcq $0,%rdx addq %r9,%r8 adcq $0,%rdx movq %rdx,%r9 mulq 0(%rbx) addq %rax,%r10 movq %r12,%rax adcq %rdx,%r10 mulq 8(%rbx) addq %rax,%r13 movq %r12,%rax adcq $0,%rdx addq %r10,%r13 adcq $0,%rdx movq %rdx,%r10 mulq 16(%rbx) addq %rax,%r14 movq %r12,%rax adcq $0,%rdx addq %r10,%r14 adcq $0,%rdx movq %rdx,%r10 mulq 24(%rbx) addq %rax,%r15 movq %r12,%rax adcq $0,%rdx movq %r13,%r11 imulq %rcx,%r13 addq %r10,%r15 adcq $0,%rdx movq %rdx,%r10 mulq 32(%rbx) addq %rax,%r8 movq %r12,%rax adcq $0,%rdx addq %r10,%r8 adcq $0,%rdx movq %rdx,%r10 mulq 40(%rbx) addq %rax,%r9 movq %r13,%rax adcq $0,%rdx addq %r10,%r9 adcq $0,%rdx movq %rdx,%r10 mulq 0(%rbx) addq %rax,%r11 movq %r13,%rax adcq %rdx,%r11 mulq 8(%rbx) addq %rax,%r14 movq %r13,%rax adcq $0,%rdx addq %r11,%r14 adcq $0,%rdx movq %rdx,%r11 mulq 16(%rbx) addq %rax,%r15 movq %r13,%rax adcq $0,%rdx addq %r11,%r15 adcq $0,%rdx movq %rdx,%r11 mulq 24(%rbx) addq %rax,%r8 movq %r13,%rax adcq $0,%rdx addq %r11,%r8 adcq $0,%rdx movq %rdx,%r11 mulq 32(%rbx) addq %rax,%r9 movq %r13,%rax adcq $0,%rdx addq %r11,%r9 adcq $0,%rdx movq %rdx,%r11 mulq 40(%rbx) addq %rax,%r10 movq %r14,%rax adcq $0,%rdx addq %r11,%r10 adcq $0,%rdx movq %rdx,%r11 #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .size __mulq_by_1_mont_384,.-__mulq_by_1_mont_384 .type __redq_tail_mont_384,@function .align 32 __redq_tail_mont_384: .cfi_startproc .byte 
0xf3,0x0f,0x1e,0xfa addq 48(%rsi),%r14 movq %r14,%rax adcq 56(%rsi),%r15 adcq 64(%rsi),%r8 adcq 72(%rsi),%r9 movq %r15,%rcx adcq 80(%rsi),%r10 adcq 88(%rsi),%r11 sbbq %r12,%r12 movq %r8,%rdx movq %r9,%rbp subq 0(%rbx),%r14 sbbq 8(%rbx),%r15 movq %r10,%r13 sbbq 16(%rbx),%r8 sbbq 24(%rbx),%r9 sbbq 32(%rbx),%r10 movq %r11,%rsi sbbq 40(%rbx),%r11 sbbq $0,%r12 cmovcq %rax,%r14 cmovcq %rcx,%r15 cmovcq %rdx,%r8 movq %r14,0(%rdi) cmovcq %rbp,%r9 movq %r15,8(%rdi) cmovcq %r13,%r10 movq %r8,16(%rdi) cmovcq %rsi,%r11 movq %r9,24(%rdi) movq %r10,32(%rdi) movq %r11,40(%rdi) #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .size __redq_tail_mont_384,.-__redq_tail_mont_384 .globl sgn0_pty_mont_384 .hidden sgn0_pty_mont_384 .type sgn0_pty_mont_384,@function .align 32 sgn0_pty_mont_384: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa #ifdef __BLST_PORTABLE__ testl $1,__blst_platform_cap(%rip) jnz sgn0_pty_mont_384$1 #endif pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 pushq %rbx .cfi_adjust_cfa_offset 8 .cfi_offset %rbx,-24 pushq %r12 .cfi_adjust_cfa_offset 8 .cfi_offset %r12,-32 pushq %r13 .cfi_adjust_cfa_offset 8 .cfi_offset %r13,-40 pushq %r14 .cfi_adjust_cfa_offset 8 .cfi_offset %r14,-48 pushq %r15 .cfi_adjust_cfa_offset 8 .cfi_offset %r15,-56 subq $8,%rsp .cfi_adjust_cfa_offset 8 movq %rsi,%rbx leaq 0(%rdi),%rsi movq %rdx,%rcx call __mulq_by_1_mont_384 xorq %rax,%rax movq %r14,%r13 addq %r14,%r14 adcq %r15,%r15 adcq %r8,%r8 adcq %r9,%r9 adcq %r10,%r10 adcq %r11,%r11 adcq $0,%rax subq 0(%rbx),%r14 sbbq 8(%rbx),%r15 sbbq 16(%rbx),%r8 sbbq 24(%rbx),%r9 sbbq 32(%rbx),%r10 sbbq 40(%rbx),%r11 sbbq $0,%rax notq %rax andq $1,%r13 andq $2,%rax orq %r13,%rax movq 8(%rsp),%r15 .cfi_restore %r15 movq 16(%rsp),%r14 .cfi_restore %r14 movq 24(%rsp),%r13 .cfi_restore %r13 movq 32(%rsp),%r12 .cfi_restore %r12 movq 40(%rsp),%rbx .cfi_restore %rbx movq 48(%rsp),%rbp .cfi_restore %rbp leaq 56(%rsp),%rsp .cfi_adjust_cfa_offset -56 #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .size sgn0_pty_mont_384,.-sgn0_pty_mont_384 .globl sgn0_pty_mont_384x .hidden sgn0_pty_mont_384x .type sgn0_pty_mont_384x,@function .align 32 sgn0_pty_mont_384x: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa #ifdef __BLST_PORTABLE__ testl $1,__blst_platform_cap(%rip) jnz sgn0_pty_mont_384x$1 #endif pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 pushq %rbx .cfi_adjust_cfa_offset 8 .cfi_offset %rbx,-24 pushq %r12 .cfi_adjust_cfa_offset 8 .cfi_offset %r12,-32 pushq %r13 .cfi_adjust_cfa_offset 8 .cfi_offset %r13,-40 pushq %r14 .cfi_adjust_cfa_offset 8 .cfi_offset %r14,-48 pushq %r15 .cfi_adjust_cfa_offset 8 .cfi_offset %r15,-56 subq $8,%rsp .cfi_adjust_cfa_offset 8 movq %rsi,%rbx leaq 48(%rdi),%rsi movq %rdx,%rcx call __mulq_by_1_mont_384 movq %r14,%r12 orq %r15,%r14 orq %r8,%r14 orq %r9,%r14 orq %r10,%r14 orq %r11,%r14 leaq 0(%rdi),%rsi xorq %rdi,%rdi movq %r12,%r13 addq %r12,%r12 adcq %r15,%r15 adcq %r8,%r8 adcq %r9,%r9 adcq %r10,%r10 adcq %r11,%r11 adcq $0,%rdi subq 0(%rbx),%r12 sbbq 8(%rbx),%r15 sbbq 16(%rbx),%r8 sbbq 24(%rbx),%r9 sbbq 32(%rbx),%r10 sbbq 40(%rbx),%r11 sbbq $0,%rdi movq %r14,0(%rsp) notq %rdi andq $1,%r13 andq $2,%rdi orq %r13,%rdi call __mulq_by_1_mont_384 movq %r14,%r12 orq %r15,%r14 orq %r8,%r14 orq %r9,%r14 orq %r10,%r14 orq %r11,%r14 xorq %rax,%rax movq %r12,%r13 addq %r12,%r12 adcq %r15,%r15 adcq %r8,%r8 adcq %r9,%r9 adcq %r10,%r10 adcq %r11,%r11 adcq $0,%rax subq 0(%rbx),%r12 sbbq 8(%rbx),%r15 sbbq 
16(%rbx),%r8 sbbq 24(%rbx),%r9 sbbq 32(%rbx),%r10 sbbq 40(%rbx),%r11 sbbq $0,%rax movq 0(%rsp),%r12 notq %rax testq %r14,%r14 cmovzq %rdi,%r13 testq %r12,%r12 cmovnzq %rdi,%rax andq $1,%r13 andq $2,%rax orq %r13,%rax movq 8(%rsp),%r15 .cfi_restore %r15 movq 16(%rsp),%r14 .cfi_restore %r14 movq 24(%rsp),%r13 .cfi_restore %r13 movq 32(%rsp),%r12 .cfi_restore %r12 movq 40(%rsp),%rbx .cfi_restore %rbx movq 48(%rsp),%rbp .cfi_restore %rbp leaq 56(%rsp),%rsp .cfi_adjust_cfa_offset -56 #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .size sgn0_pty_mont_384x,.-sgn0_pty_mont_384x .globl mul_mont_384 .hidden mul_mont_384 .type mul_mont_384,@function .align 32 mul_mont_384: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa #ifdef __BLST_PORTABLE__ testl $1,__blst_platform_cap(%rip) jnz mul_mont_384$1 #endif pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 pushq %rbx .cfi_adjust_cfa_offset 8 .cfi_offset %rbx,-24 pushq %r12 .cfi_adjust_cfa_offset 8 .cfi_offset %r12,-32 pushq %r13 .cfi_adjust_cfa_offset 8 .cfi_offset %r13,-40 pushq %r14 .cfi_adjust_cfa_offset 8 .cfi_offset %r14,-48 pushq %r15 .cfi_adjust_cfa_offset 8 .cfi_offset %r15,-56 subq $24,%rsp .cfi_adjust_cfa_offset 8*3 movq 0(%rdx),%rax movq 0(%rsi),%r14 movq 8(%rsi),%r15 movq 16(%rsi),%r12 movq 24(%rsi),%r13 movq %rdx,%rbx movq %r8,0(%rsp) movq %rdi,8(%rsp) call __mulq_mont_384 movq 24(%rsp),%r15 .cfi_restore %r15 movq 32(%rsp),%r14 .cfi_restore %r14 movq 40(%rsp),%r13 .cfi_restore %r13 movq 48(%rsp),%r12 .cfi_restore %r12 movq 56(%rsp),%rbx .cfi_restore %rbx movq 64(%rsp),%rbp .cfi_restore %rbp leaq 72(%rsp),%rsp .cfi_adjust_cfa_offset -72 #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .size mul_mont_384,.-mul_mont_384 .type __mulq_mont_384,@function .align 32 __mulq_mont_384: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa movq %rax,%rdi mulq %r14 movq %rax,%r8 movq %rdi,%rax movq %rdx,%r9 mulq %r15 addq %rax,%r9 movq %rdi,%rax adcq $0,%rdx movq %rdx,%r10 mulq %r12 addq %rax,%r10 movq %rdi,%rax adcq $0,%rdx movq %rdx,%r11 movq %r8,%rbp imulq 8(%rsp),%r8 mulq %r13 addq %rax,%r11 movq %rdi,%rax adcq $0,%rdx movq %rdx,%r12 mulq 32(%rsi) addq %rax,%r12 movq %rdi,%rax adcq $0,%rdx movq %rdx,%r13 mulq 40(%rsi) addq %rax,%r13 movq %r8,%rax adcq $0,%rdx xorq %r15,%r15 movq %rdx,%r14 mulq 0(%rcx) addq %rax,%rbp movq %r8,%rax adcq %rdx,%rbp mulq 8(%rcx) addq %rax,%r9 movq %r8,%rax adcq $0,%rdx addq %rbp,%r9 adcq $0,%rdx movq %rdx,%rbp mulq 16(%rcx) addq %rax,%r10 movq %r8,%rax adcq $0,%rdx addq %rbp,%r10 adcq $0,%rdx movq %rdx,%rbp mulq 24(%rcx) addq %rbp,%r11 adcq $0,%rdx addq %rax,%r11 movq %r8,%rax adcq $0,%rdx movq %rdx,%rbp mulq 32(%rcx) addq %rax,%r12 movq %r8,%rax adcq $0,%rdx addq %rbp,%r12 adcq $0,%rdx movq %rdx,%rbp mulq 40(%rcx) addq %rax,%r13 movq 8(%rbx),%rax adcq $0,%rdx addq %rbp,%r13 adcq %rdx,%r14 adcq $0,%r15 movq %rax,%rdi mulq 0(%rsi) addq %rax,%r9 movq %rdi,%rax adcq $0,%rdx movq %rdx,%r8 mulq 8(%rsi) addq %rax,%r10 movq %rdi,%rax adcq $0,%rdx addq %r8,%r10 adcq $0,%rdx movq %rdx,%r8 mulq 16(%rsi) addq %rax,%r11 movq %rdi,%rax adcq $0,%rdx addq %r8,%r11 adcq $0,%rdx movq %rdx,%r8 movq %r9,%rbp imulq 8(%rsp),%r9 mulq 24(%rsi) addq %rax,%r12 movq %rdi,%rax adcq $0,%rdx addq %r8,%r12 adcq $0,%rdx movq %rdx,%r8 mulq 32(%rsi) addq %rax,%r13 movq %rdi,%rax adcq $0,%rdx addq %r8,%r13 adcq $0,%rdx movq %rdx,%r8 mulq 40(%rsi) addq %r8,%r14 adcq $0,%rdx xorq %r8,%r8 addq %rax,%r14 movq %r9,%rax adcq %rdx,%r15 adcq $0,%r8 mulq 0(%rcx) addq 
%rax,%rbp movq %r9,%rax adcq %rdx,%rbp mulq 8(%rcx) addq %rax,%r10 movq %r9,%rax adcq $0,%rdx addq %rbp,%r10 adcq $0,%rdx movq %rdx,%rbp mulq 16(%rcx) addq %rax,%r11 movq %r9,%rax adcq $0,%rdx addq %rbp,%r11 adcq $0,%rdx movq %rdx,%rbp mulq 24(%rcx) addq %rbp,%r12 adcq $0,%rdx addq %rax,%r12 movq %r9,%rax adcq $0,%rdx movq %rdx,%rbp mulq 32(%rcx) addq %rax,%r13 movq %r9,%rax adcq $0,%rdx addq %rbp,%r13 adcq $0,%rdx movq %rdx,%rbp mulq 40(%rcx) addq %rax,%r14 movq 16(%rbx),%rax adcq $0,%rdx addq %rbp,%r14 adcq %rdx,%r15 adcq $0,%r8 movq %rax,%rdi mulq 0(%rsi) addq %rax,%r10 movq %rdi,%rax adcq $0,%rdx movq %rdx,%r9 mulq 8(%rsi) addq %rax,%r11 movq %rdi,%rax adcq $0,%rdx addq %r9,%r11 adcq $0,%rdx movq %rdx,%r9 mulq 16(%rsi) addq %rax,%r12 movq %rdi,%rax adcq $0,%rdx addq %r9,%r12 adcq $0,%rdx movq %rdx,%r9 movq %r10,%rbp imulq 8(%rsp),%r10 mulq 24(%rsi) addq %rax,%r13 movq %rdi,%rax adcq $0,%rdx addq %r9,%r13 adcq $0,%rdx movq %rdx,%r9 mulq 32(%rsi) addq %rax,%r14 movq %rdi,%rax adcq $0,%rdx addq %r9,%r14 adcq $0,%rdx movq %rdx,%r9 mulq 40(%rsi) addq %r9,%r15 adcq $0,%rdx xorq %r9,%r9 addq %rax,%r15 movq %r10,%rax adcq %rdx,%r8 adcq $0,%r9 mulq 0(%rcx) addq %rax,%rbp movq %r10,%rax adcq %rdx,%rbp mulq 8(%rcx) addq %rax,%r11 movq %r10,%rax adcq $0,%rdx addq %rbp,%r11 adcq $0,%rdx movq %rdx,%rbp mulq 16(%rcx) addq %rax,%r12 movq %r10,%rax adcq $0,%rdx addq %rbp,%r12 adcq $0,%rdx movq %rdx,%rbp mulq 24(%rcx) addq %rbp,%r13 adcq $0,%rdx addq %rax,%r13 movq %r10,%rax adcq $0,%rdx movq %rdx,%rbp mulq 32(%rcx) addq %rax,%r14 movq %r10,%rax adcq $0,%rdx addq %rbp,%r14 adcq $0,%rdx movq %rdx,%rbp mulq 40(%rcx) addq %rax,%r15 movq 24(%rbx),%rax adcq $0,%rdx addq %rbp,%r15 adcq %rdx,%r8 adcq $0,%r9 movq %rax,%rdi mulq 0(%rsi) addq %rax,%r11 movq %rdi,%rax adcq $0,%rdx movq %rdx,%r10 mulq 8(%rsi) addq %rax,%r12 movq %rdi,%rax adcq $0,%rdx addq %r10,%r12 adcq $0,%rdx movq %rdx,%r10 mulq 16(%rsi) addq %rax,%r13 movq %rdi,%rax adcq $0,%rdx addq %r10,%r13 adcq $0,%rdx movq %rdx,%r10 movq %r11,%rbp imulq 8(%rsp),%r11 mulq 24(%rsi) addq %rax,%r14 movq %rdi,%rax adcq $0,%rdx addq %r10,%r14 adcq $0,%rdx movq %rdx,%r10 mulq 32(%rsi) addq %rax,%r15 movq %rdi,%rax adcq $0,%rdx addq %r10,%r15 adcq $0,%rdx movq %rdx,%r10 mulq 40(%rsi) addq %r10,%r8 adcq $0,%rdx xorq %r10,%r10 addq %rax,%r8 movq %r11,%rax adcq %rdx,%r9 adcq $0,%r10 mulq 0(%rcx) addq %rax,%rbp movq %r11,%rax adcq %rdx,%rbp mulq 8(%rcx) addq %rax,%r12 movq %r11,%rax adcq $0,%rdx addq %rbp,%r12 adcq $0,%rdx movq %rdx,%rbp mulq 16(%rcx) addq %rax,%r13 movq %r11,%rax adcq $0,%rdx addq %rbp,%r13 adcq $0,%rdx movq %rdx,%rbp mulq 24(%rcx) addq %rbp,%r14 adcq $0,%rdx addq %rax,%r14 movq %r11,%rax adcq $0,%rdx movq %rdx,%rbp mulq 32(%rcx) addq %rax,%r15 movq %r11,%rax adcq $0,%rdx addq %rbp,%r15 adcq $0,%rdx movq %rdx,%rbp mulq 40(%rcx) addq %rax,%r8 movq 32(%rbx),%rax adcq $0,%rdx addq %rbp,%r8 adcq %rdx,%r9 adcq $0,%r10 movq %rax,%rdi mulq 0(%rsi) addq %rax,%r12 movq %rdi,%rax adcq $0,%rdx movq %rdx,%r11 mulq 8(%rsi) addq %rax,%r13 movq %rdi,%rax adcq $0,%rdx addq %r11,%r13 adcq $0,%rdx movq %rdx,%r11 mulq 16(%rsi) addq %rax,%r14 movq %rdi,%rax adcq $0,%rdx addq %r11,%r14 adcq $0,%rdx movq %rdx,%r11 movq %r12,%rbp imulq 8(%rsp),%r12 mulq 24(%rsi) addq %rax,%r15 movq %rdi,%rax adcq $0,%rdx addq %r11,%r15 adcq $0,%rdx movq %rdx,%r11 mulq 32(%rsi) addq %rax,%r8 movq %rdi,%rax adcq $0,%rdx addq %r11,%r8 adcq $0,%rdx movq %rdx,%r11 mulq 40(%rsi) addq %r11,%r9 adcq $0,%rdx xorq %r11,%r11 addq %rax,%r9 movq %r12,%rax adcq %rdx,%r10 adcq $0,%r11 mulq 0(%rcx) addq 
%rax,%rbp movq %r12,%rax adcq %rdx,%rbp mulq 8(%rcx) addq %rax,%r13 movq %r12,%rax adcq $0,%rdx addq %rbp,%r13 adcq $0,%rdx movq %rdx,%rbp mulq 16(%rcx) addq %rax,%r14 movq %r12,%rax adcq $0,%rdx addq %rbp,%r14 adcq $0,%rdx movq %rdx,%rbp mulq 24(%rcx) addq %rbp,%r15 adcq $0,%rdx addq %rax,%r15 movq %r12,%rax adcq $0,%rdx movq %rdx,%rbp mulq 32(%rcx) addq %rax,%r8 movq %r12,%rax adcq $0,%rdx addq %rbp,%r8 adcq $0,%rdx movq %rdx,%rbp mulq 40(%rcx) addq %rax,%r9 movq 40(%rbx),%rax adcq $0,%rdx addq %rbp,%r9 adcq %rdx,%r10 adcq $0,%r11 movq %rax,%rdi mulq 0(%rsi) addq %rax,%r13 movq %rdi,%rax adcq $0,%rdx movq %rdx,%r12 mulq 8(%rsi) addq %rax,%r14 movq %rdi,%rax adcq $0,%rdx addq %r12,%r14 adcq $0,%rdx movq %rdx,%r12 mulq 16(%rsi) addq %rax,%r15 movq %rdi,%rax adcq $0,%rdx addq %r12,%r15 adcq $0,%rdx movq %rdx,%r12 movq %r13,%rbp imulq 8(%rsp),%r13 mulq 24(%rsi) addq %rax,%r8 movq %rdi,%rax adcq $0,%rdx addq %r12,%r8 adcq $0,%rdx movq %rdx,%r12 mulq 32(%rsi) addq %rax,%r9 movq %rdi,%rax adcq $0,%rdx addq %r12,%r9 adcq $0,%rdx movq %rdx,%r12 mulq 40(%rsi) addq %r12,%r10 adcq $0,%rdx xorq %r12,%r12 addq %rax,%r10 movq %r13,%rax adcq %rdx,%r11 adcq $0,%r12 mulq 0(%rcx) addq %rax,%rbp movq %r13,%rax adcq %rdx,%rbp mulq 8(%rcx) addq %rax,%r14 movq %r13,%rax adcq $0,%rdx addq %rbp,%r14 adcq $0,%rdx movq %rdx,%rbp mulq 16(%rcx) addq %rax,%r15 movq %r13,%rax adcq $0,%rdx addq %rbp,%r15 adcq $0,%rdx movq %rdx,%rbp mulq 24(%rcx) addq %rbp,%r8 adcq $0,%rdx addq %rax,%r8 movq %r13,%rax adcq $0,%rdx movq %rdx,%rbp mulq 32(%rcx) addq %rax,%r9 movq %r13,%rax adcq $0,%rdx addq %rbp,%r9 adcq $0,%rdx movq %rdx,%rbp mulq 40(%rcx) addq %rax,%r10 movq %r14,%rax adcq $0,%rdx addq %rbp,%r10 adcq %rdx,%r11 adcq $0,%r12 movq 16(%rsp),%rdi subq 0(%rcx),%r14 movq %r15,%rdx sbbq 8(%rcx),%r15 movq %r8,%rbx sbbq 16(%rcx),%r8 movq %r9,%rsi sbbq 24(%rcx),%r9 movq %r10,%rbp sbbq 32(%rcx),%r10 movq %r11,%r13 sbbq 40(%rcx),%r11 sbbq $0,%r12 cmovcq %rax,%r14 cmovcq %rdx,%r15 cmovcq %rbx,%r8 movq %r14,0(%rdi) cmovcq %rsi,%r9 movq %r15,8(%rdi) cmovcq %rbp,%r10 movq %r8,16(%rdi) cmovcq %r13,%r11 movq %r9,24(%rdi) movq %r10,32(%rdi) movq %r11,40(%rdi) #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .size __mulq_mont_384,.-__mulq_mont_384 .globl sqr_n_mul_mont_384 .hidden sqr_n_mul_mont_384 .type sqr_n_mul_mont_384,@function .align 32 sqr_n_mul_mont_384: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa #ifdef __BLST_PORTABLE__ testl $1,__blst_platform_cap(%rip) jnz sqr_n_mul_mont_384$1 #endif pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 pushq %rbx .cfi_adjust_cfa_offset 8 .cfi_offset %rbx,-24 pushq %r12 .cfi_adjust_cfa_offset 8 .cfi_offset %r12,-32 pushq %r13 .cfi_adjust_cfa_offset 8 .cfi_offset %r13,-40 pushq %r14 .cfi_adjust_cfa_offset 8 .cfi_offset %r14,-48 pushq %r15 .cfi_adjust_cfa_offset 8 .cfi_offset %r15,-56 subq $136,%rsp .cfi_adjust_cfa_offset 8*17 movq %r8,0(%rsp) movq %rdi,8(%rsp) movq %rcx,16(%rsp) leaq 32(%rsp),%rdi movq %r9,24(%rsp) movq (%r9),%xmm2 .Loop_sqr_384: movd %edx,%xmm1 call __sqrq_384 leaq 0(%rdi),%rsi movq 0(%rsp),%rcx movq 16(%rsp),%rbx call __mulq_by_1_mont_384 call __redq_tail_mont_384 movd %xmm1,%edx leaq 0(%rdi),%rsi decl %edx jnz .Loop_sqr_384 .byte 102,72,15,126,208 movq %rbx,%rcx movq 24(%rsp),%rbx movq %r8,%r12 movq %r9,%r13 call __mulq_mont_384 leaq 136(%rsp),%r8 movq 136(%rsp),%r15 .cfi_restore %r15 movq 8(%r8),%r14 .cfi_restore %r14 movq 16(%r8),%r13 .cfi_restore %r13 movq 24(%r8),%r12 .cfi_restore %r12 movq 32(%r8),%rbx .cfi_restore 
%rbx movq 40(%r8),%rbp .cfi_restore %rbp leaq 48(%r8),%rsp .cfi_adjust_cfa_offset -8*23 #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .size sqr_n_mul_mont_384,.-sqr_n_mul_mont_384 .globl sqr_n_mul_mont_383 .hidden sqr_n_mul_mont_383 .type sqr_n_mul_mont_383,@function .align 32 sqr_n_mul_mont_383: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa #ifdef __BLST_PORTABLE__ testl $1,__blst_platform_cap(%rip) jnz sqr_n_mul_mont_383$1 #endif pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 pushq %rbx .cfi_adjust_cfa_offset 8 .cfi_offset %rbx,-24 pushq %r12 .cfi_adjust_cfa_offset 8 .cfi_offset %r12,-32 pushq %r13 .cfi_adjust_cfa_offset 8 .cfi_offset %r13,-40 pushq %r14 .cfi_adjust_cfa_offset 8 .cfi_offset %r14,-48 pushq %r15 .cfi_adjust_cfa_offset 8 .cfi_offset %r15,-56 subq $136,%rsp .cfi_adjust_cfa_offset 8*17 movq %r8,0(%rsp) movq %rdi,8(%rsp) movq %rcx,16(%rsp) leaq 32(%rsp),%rdi movq %r9,24(%rsp) movq (%r9),%xmm2 .Loop_sqr_383: movd %edx,%xmm1 call __sqrq_384 leaq 0(%rdi),%rsi movq 0(%rsp),%rcx movq 16(%rsp),%rbx call __mulq_by_1_mont_384 movd %xmm1,%edx addq 48(%rsi),%r14 adcq 56(%rsi),%r15 adcq 64(%rsi),%r8 adcq 72(%rsi),%r9 adcq 80(%rsi),%r10 adcq 88(%rsi),%r11 leaq 0(%rdi),%rsi movq %r14,0(%rdi) movq %r15,8(%rdi) movq %r8,16(%rdi) movq %r9,24(%rdi) movq %r10,32(%rdi) movq %r11,40(%rdi) decl %edx jnz .Loop_sqr_383 .byte 102,72,15,126,208 movq %rbx,%rcx movq 24(%rsp),%rbx movq %r8,%r12 movq %r9,%r13 call __mulq_mont_384 leaq 136(%rsp),%r8 movq 136(%rsp),%r15 .cfi_restore %r15 movq 8(%r8),%r14 .cfi_restore %r14 movq 16(%r8),%r13 .cfi_restore %r13 movq 24(%r8),%r12 .cfi_restore %r12 movq 32(%r8),%rbx .cfi_restore %rbx movq 40(%r8),%rbp .cfi_restore %rbp leaq 48(%r8),%rsp .cfi_adjust_cfa_offset -8*23 #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .size sqr_n_mul_mont_383,.-sqr_n_mul_mont_383 .type __mulq_mont_383_nonred,@function .align 32 __mulq_mont_383_nonred: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa movq %rax,%rbp mulq %r14 movq %rax,%r8 movq %rbp,%rax movq %rdx,%r9 mulq %r15 addq %rax,%r9 movq %rbp,%rax adcq $0,%rdx movq %rdx,%r10 mulq %r12 addq %rax,%r10 movq %rbp,%rax adcq $0,%rdx movq %rdx,%r11 movq %r8,%r15 imulq 8(%rsp),%r8 mulq %r13 addq %rax,%r11 movq %rbp,%rax adcq $0,%rdx movq %rdx,%r12 mulq 32(%rsi) addq %rax,%r12 movq %rbp,%rax adcq $0,%rdx movq %rdx,%r13 mulq 40(%rsi) addq %rax,%r13 movq %r8,%rax adcq $0,%rdx movq %rdx,%r14 mulq 0(%rcx) addq %rax,%r15 movq %r8,%rax adcq %rdx,%r15 mulq 8(%rcx) addq %rax,%r9 movq %r8,%rax adcq $0,%rdx addq %r15,%r9 adcq $0,%rdx movq %rdx,%r15 mulq 16(%rcx) addq %rax,%r10 movq %r8,%rax adcq $0,%rdx addq %r15,%r10 adcq $0,%rdx movq %rdx,%r15 mulq 24(%rcx) addq %r15,%r11 adcq $0,%rdx addq %rax,%r11 movq %r8,%rax adcq $0,%rdx movq %rdx,%r15 mulq 32(%rcx) addq %rax,%r12 movq %r8,%rax adcq $0,%rdx addq %r15,%r12 adcq $0,%rdx movq %rdx,%r15 mulq 40(%rcx) addq %rax,%r13 movq 8(%rbx),%rax adcq $0,%rdx addq %r15,%r13 adcq %rdx,%r14 movq %rax,%rbp mulq 0(%rsi) addq %rax,%r9 movq %rbp,%rax adcq $0,%rdx movq %rdx,%r15 mulq 8(%rsi) addq %rax,%r10 movq %rbp,%rax adcq $0,%rdx addq %r15,%r10 adcq $0,%rdx movq %rdx,%r15 mulq 16(%rsi) addq %rax,%r11 movq %rbp,%rax adcq $0,%rdx addq %r15,%r11 adcq $0,%rdx movq %rdx,%r15 movq %r9,%r8 imulq 8(%rsp),%r9 mulq 24(%rsi) addq %rax,%r12 movq %rbp,%rax adcq $0,%rdx addq %r15,%r12 adcq $0,%rdx movq %rdx,%r15 mulq 32(%rsi) addq %rax,%r13 movq %rbp,%rax adcq $0,%rdx addq %r15,%r13 adcq $0,%rdx movq %rdx,%r15 mulq 
40(%rsi) addq %r15,%r14 adcq $0,%rdx addq %rax,%r14 movq %r9,%rax adcq $0,%rdx movq %rdx,%r15 mulq 0(%rcx) addq %rax,%r8 movq %r9,%rax adcq %rdx,%r8 mulq 8(%rcx) addq %rax,%r10 movq %r9,%rax adcq $0,%rdx addq %r8,%r10 adcq $0,%rdx movq %rdx,%r8 mulq 16(%rcx) addq %rax,%r11 movq %r9,%rax adcq $0,%rdx addq %r8,%r11 adcq $0,%rdx movq %rdx,%r8 mulq 24(%rcx) addq %r8,%r12 adcq $0,%rdx addq %rax,%r12 movq %r9,%rax adcq $0,%rdx movq %rdx,%r8 mulq 32(%rcx) addq %rax,%r13 movq %r9,%rax adcq $0,%rdx addq %r8,%r13 adcq $0,%rdx movq %rdx,%r8 mulq 40(%rcx) addq %rax,%r14 movq 16(%rbx),%rax adcq $0,%rdx addq %r8,%r14 adcq %rdx,%r15 movq %rax,%rbp mulq 0(%rsi) addq %rax,%r10 movq %rbp,%rax adcq $0,%rdx movq %rdx,%r8 mulq 8(%rsi) addq %rax,%r11 movq %rbp,%rax adcq $0,%rdx addq %r8,%r11 adcq $0,%rdx movq %rdx,%r8 mulq 16(%rsi) addq %rax,%r12 movq %rbp,%rax adcq $0,%rdx addq %r8,%r12 adcq $0,%rdx movq %rdx,%r8 movq %r10,%r9 imulq 8(%rsp),%r10 mulq 24(%rsi) addq %rax,%r13 movq %rbp,%rax adcq $0,%rdx addq %r8,%r13 adcq $0,%rdx movq %rdx,%r8 mulq 32(%rsi) addq %rax,%r14 movq %rbp,%rax adcq $0,%rdx addq %r8,%r14 adcq $0,%rdx movq %rdx,%r8 mulq 40(%rsi) addq %r8,%r15 adcq $0,%rdx addq %rax,%r15 movq %r10,%rax adcq $0,%rdx movq %rdx,%r8 mulq 0(%rcx) addq %rax,%r9 movq %r10,%rax adcq %rdx,%r9 mulq 8(%rcx) addq %rax,%r11 movq %r10,%rax adcq $0,%rdx addq %r9,%r11 adcq $0,%rdx movq %rdx,%r9 mulq 16(%rcx) addq %rax,%r12 movq %r10,%rax adcq $0,%rdx addq %r9,%r12 adcq $0,%rdx movq %rdx,%r9 mulq 24(%rcx) addq %r9,%r13 adcq $0,%rdx addq %rax,%r13 movq %r10,%rax adcq $0,%rdx movq %rdx,%r9 mulq 32(%rcx) addq %rax,%r14 movq %r10,%rax adcq $0,%rdx addq %r9,%r14 adcq $0,%rdx movq %rdx,%r9 mulq 40(%rcx) addq %rax,%r15 movq 24(%rbx),%rax adcq $0,%rdx addq %r9,%r15 adcq %rdx,%r8 movq %rax,%rbp mulq 0(%rsi) addq %rax,%r11 movq %rbp,%rax adcq $0,%rdx movq %rdx,%r9 mulq 8(%rsi) addq %rax,%r12 movq %rbp,%rax adcq $0,%rdx addq %r9,%r12 adcq $0,%rdx movq %rdx,%r9 mulq 16(%rsi) addq %rax,%r13 movq %rbp,%rax adcq $0,%rdx addq %r9,%r13 adcq $0,%rdx movq %rdx,%r9 movq %r11,%r10 imulq 8(%rsp),%r11 mulq 24(%rsi) addq %rax,%r14 movq %rbp,%rax adcq $0,%rdx addq %r9,%r14 adcq $0,%rdx movq %rdx,%r9 mulq 32(%rsi) addq %rax,%r15 movq %rbp,%rax adcq $0,%rdx addq %r9,%r15 adcq $0,%rdx movq %rdx,%r9 mulq 40(%rsi) addq %r9,%r8 adcq $0,%rdx addq %rax,%r8 movq %r11,%rax adcq $0,%rdx movq %rdx,%r9 mulq 0(%rcx) addq %rax,%r10 movq %r11,%rax adcq %rdx,%r10 mulq 8(%rcx) addq %rax,%r12 movq %r11,%rax adcq $0,%rdx addq %r10,%r12 adcq $0,%rdx movq %rdx,%r10 mulq 16(%rcx) addq %rax,%r13 movq %r11,%rax adcq $0,%rdx addq %r10,%r13 adcq $0,%rdx movq %rdx,%r10 mulq 24(%rcx) addq %r10,%r14 adcq $0,%rdx addq %rax,%r14 movq %r11,%rax adcq $0,%rdx movq %rdx,%r10 mulq 32(%rcx) addq %rax,%r15 movq %r11,%rax adcq $0,%rdx addq %r10,%r15 adcq $0,%rdx movq %rdx,%r10 mulq 40(%rcx) addq %rax,%r8 movq 32(%rbx),%rax adcq $0,%rdx addq %r10,%r8 adcq %rdx,%r9 movq %rax,%rbp mulq 0(%rsi) addq %rax,%r12 movq %rbp,%rax adcq $0,%rdx movq %rdx,%r10 mulq 8(%rsi) addq %rax,%r13 movq %rbp,%rax adcq $0,%rdx addq %r10,%r13 adcq $0,%rdx movq %rdx,%r10 mulq 16(%rsi) addq %rax,%r14 movq %rbp,%rax adcq $0,%rdx addq %r10,%r14 adcq $0,%rdx movq %rdx,%r10 movq %r12,%r11 imulq 8(%rsp),%r12 mulq 24(%rsi) addq %rax,%r15 movq %rbp,%rax adcq $0,%rdx addq %r10,%r15 adcq $0,%rdx movq %rdx,%r10 mulq 32(%rsi) addq %rax,%r8 movq %rbp,%rax adcq $0,%rdx addq %r10,%r8 adcq $0,%rdx movq %rdx,%r10 mulq 40(%rsi) addq %r10,%r9 adcq $0,%rdx addq %rax,%r9 movq %r12,%rax adcq $0,%rdx movq %rdx,%r10 mulq 0(%rcx) addq 
%rax,%r11 movq %r12,%rax adcq %rdx,%r11 mulq 8(%rcx) addq %rax,%r13 movq %r12,%rax adcq $0,%rdx addq %r11,%r13 adcq $0,%rdx movq %rdx,%r11 mulq 16(%rcx) addq %rax,%r14 movq %r12,%rax adcq $0,%rdx addq %r11,%r14 adcq $0,%rdx movq %rdx,%r11 mulq 24(%rcx) addq %r11,%r15 adcq $0,%rdx addq %rax,%r15 movq %r12,%rax adcq $0,%rdx movq %rdx,%r11 mulq 32(%rcx) addq %rax,%r8 movq %r12,%rax adcq $0,%rdx addq %r11,%r8 adcq $0,%rdx movq %rdx,%r11 mulq 40(%rcx) addq %rax,%r9 movq 40(%rbx),%rax adcq $0,%rdx addq %r11,%r9 adcq %rdx,%r10 movq %rax,%rbp mulq 0(%rsi) addq %rax,%r13 movq %rbp,%rax adcq $0,%rdx movq %rdx,%r11 mulq 8(%rsi) addq %rax,%r14 movq %rbp,%rax adcq $0,%rdx addq %r11,%r14 adcq $0,%rdx movq %rdx,%r11 mulq 16(%rsi) addq %rax,%r15 movq %rbp,%rax adcq $0,%rdx addq %r11,%r15 adcq $0,%rdx movq %rdx,%r11 movq %r13,%r12 imulq 8(%rsp),%r13 mulq 24(%rsi) addq %rax,%r8 movq %rbp,%rax adcq $0,%rdx addq %r11,%r8 adcq $0,%rdx movq %rdx,%r11 mulq 32(%rsi) addq %rax,%r9 movq %rbp,%rax adcq $0,%rdx addq %r11,%r9 adcq $0,%rdx movq %rdx,%r11 mulq 40(%rsi) addq %r11,%r10 adcq $0,%rdx addq %rax,%r10 movq %r13,%rax adcq $0,%rdx movq %rdx,%r11 mulq 0(%rcx) addq %rax,%r12 movq %r13,%rax adcq %rdx,%r12 mulq 8(%rcx) addq %rax,%r14 movq %r13,%rax adcq $0,%rdx addq %r12,%r14 adcq $0,%rdx movq %rdx,%r12 mulq 16(%rcx) addq %rax,%r15 movq %r13,%rax adcq $0,%rdx addq %r12,%r15 adcq $0,%rdx movq %rdx,%r12 mulq 24(%rcx) addq %r12,%r8 adcq $0,%rdx addq %rax,%r8 movq %r13,%rax adcq $0,%rdx movq %rdx,%r12 mulq 32(%rcx) addq %rax,%r9 movq %r13,%rax adcq $0,%rdx addq %r12,%r9 adcq $0,%rdx movq %rdx,%r12 mulq 40(%rcx) addq %rax,%r10 movq %r14,%rax adcq $0,%rdx addq %r12,%r10 adcq %rdx,%r11 #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .size __mulq_mont_383_nonred,.-__mulq_mont_383_nonred .globl sqr_mont_382x .hidden sqr_mont_382x .type sqr_mont_382x,@function .align 32 sqr_mont_382x: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa #ifdef __BLST_PORTABLE__ testl $1,__blst_platform_cap(%rip) jnz sqr_mont_382x$1 #endif pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 pushq %rbx .cfi_adjust_cfa_offset 8 .cfi_offset %rbx,-24 pushq %r12 .cfi_adjust_cfa_offset 8 .cfi_offset %r12,-32 pushq %r13 .cfi_adjust_cfa_offset 8 .cfi_offset %r13,-40 pushq %r14 .cfi_adjust_cfa_offset 8 .cfi_offset %r14,-48 pushq %r15 .cfi_adjust_cfa_offset 8 .cfi_offset %r15,-56 subq $136,%rsp .cfi_adjust_cfa_offset 136 movq %rcx,0(%rsp) movq %rdx,%rcx movq %rsi,16(%rsp) movq %rdi,24(%rsp) movq 0(%rsi),%r8 movq 8(%rsi),%r9 movq 16(%rsi),%r10 movq 24(%rsi),%r11 movq 32(%rsi),%r12 movq 40(%rsi),%r13 movq %r8,%r14 addq 48(%rsi),%r8 movq %r9,%r15 adcq 56(%rsi),%r9 movq %r10,%rax adcq 64(%rsi),%r10 movq %r11,%rdx adcq 72(%rsi),%r11 movq %r12,%rbx adcq 80(%rsi),%r12 movq %r13,%rbp adcq 88(%rsi),%r13 subq 48(%rsi),%r14 sbbq 56(%rsi),%r15 sbbq 64(%rsi),%rax sbbq 72(%rsi),%rdx sbbq 80(%rsi),%rbx sbbq 88(%rsi),%rbp sbbq %rdi,%rdi movq %r8,32+0(%rsp) movq %r9,32+8(%rsp) movq %r10,32+16(%rsp) movq %r11,32+24(%rsp) movq %r12,32+32(%rsp) movq %r13,32+40(%rsp) movq %r14,32+48(%rsp) movq %r15,32+56(%rsp) movq %rax,32+64(%rsp) movq %rdx,32+72(%rsp) movq %rbx,32+80(%rsp) movq %rbp,32+88(%rsp) movq %rdi,32+96(%rsp) leaq 48(%rsi),%rbx movq 48(%rsi),%rax movq 0(%rsi),%r14 movq 8(%rsi),%r15 movq 16(%rsi),%r12 movq 24(%rsi),%r13 movq 24(%rsp),%rdi call __mulq_mont_383_nonred addq %r14,%r14 adcq %r15,%r15 adcq %r8,%r8 adcq %r9,%r9 adcq %r10,%r10 adcq %r11,%r11 movq %r14,48(%rdi) movq %r15,56(%rdi) movq %r8,64(%rdi) movq 
%r9,72(%rdi) movq %r10,80(%rdi) movq %r11,88(%rdi) leaq 32(%rsp),%rsi leaq 32+48(%rsp),%rbx movq 32+48(%rsp),%rax movq 32+0(%rsp),%r14 movq 32+8(%rsp),%r15 movq 32+16(%rsp),%r12 movq 32+24(%rsp),%r13 call __mulq_mont_383_nonred movq 32+96(%rsp),%rsi movq 32+0(%rsp),%r12 movq 32+8(%rsp),%r13 andq %rsi,%r12 movq 32+16(%rsp),%rax andq %rsi,%r13 movq 32+24(%rsp),%rbx andq %rsi,%rax movq 32+32(%rsp),%rbp andq %rsi,%rbx andq %rsi,%rbp andq 32+40(%rsp),%rsi subq %r12,%r14 movq 0(%rcx),%r12 sbbq %r13,%r15 movq 8(%rcx),%r13 sbbq %rax,%r8 movq 16(%rcx),%rax sbbq %rbx,%r9 movq 24(%rcx),%rbx sbbq %rbp,%r10 movq 32(%rcx),%rbp sbbq %rsi,%r11 sbbq %rsi,%rsi andq %rsi,%r12 andq %rsi,%r13 andq %rsi,%rax andq %rsi,%rbx andq %rsi,%rbp andq 40(%rcx),%rsi addq %r12,%r14 adcq %r13,%r15 adcq %rax,%r8 adcq %rbx,%r9 adcq %rbp,%r10 adcq %rsi,%r11 movq %r14,0(%rdi) movq %r15,8(%rdi) movq %r8,16(%rdi) movq %r9,24(%rdi) movq %r10,32(%rdi) movq %r11,40(%rdi) leaq 136(%rsp),%r8 movq 0(%r8),%r15 .cfi_restore %r15 movq 8(%r8),%r14 .cfi_restore %r14 movq 16(%r8),%r13 .cfi_restore %r13 movq 24(%r8),%r12 .cfi_restore %r12 movq 32(%r8),%rbx .cfi_restore %rbx movq 40(%r8),%rbp .cfi_restore %rbp leaq 48(%r8),%rsp .cfi_adjust_cfa_offset -136-8*6 #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .size sqr_mont_382x,.-sqr_mont_382x .section .note.GNU-stack,"",@progbits #ifndef __SGX_LVI_HARDENING__ .section .note.gnu.property,"a",@note .long 4,2f-1f,5 .byte 0x47,0x4E,0x55,0 1: .long 0xc0000002,4,3 .align 8 2: #endif ================================================ FILE: build/elf/mulx_mont_256-x86_64.s ================================================ .text .globl mulx_mont_sparse_256 .hidden mulx_mont_sparse_256 .type mulx_mont_sparse_256,@function .align 32 mulx_mont_sparse_256: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa mul_mont_sparse_256$1: pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 pushq %rbx .cfi_adjust_cfa_offset 8 .cfi_offset %rbx,-24 pushq %r12 .cfi_adjust_cfa_offset 8 .cfi_offset %r12,-32 pushq %r13 .cfi_adjust_cfa_offset 8 .cfi_offset %r13,-40 pushq %r14 .cfi_adjust_cfa_offset 8 .cfi_offset %r14,-48 pushq %r15 .cfi_adjust_cfa_offset 8 .cfi_offset %r15,-56 subq $8,%rsp .cfi_adjust_cfa_offset 8 movq %rdx,%rbx #ifdef __SGX_LVI_HARDENING__ lfence #endif movq 0(%rdx),%rdx movq 0(%rsi),%r14 movq 8(%rsi),%r15 movq 16(%rsi),%rbp movq 24(%rsi),%r9 leaq -128(%rsi),%rsi leaq -128(%rcx),%rcx mulxq %r14,%rax,%r11 call __mulx_mont_sparse_256 movq 8(%rsp),%r15 .cfi_restore %r15 movq 16(%rsp),%r14 .cfi_restore %r14 movq 24(%rsp),%r13 .cfi_restore %r13 movq 32(%rsp),%r12 .cfi_restore %r12 movq 40(%rsp),%rbx .cfi_restore %rbx movq 48(%rsp),%rbp .cfi_restore %rbp leaq 56(%rsp),%rsp .cfi_adjust_cfa_offset -56 #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .size mulx_mont_sparse_256,.-mulx_mont_sparse_256 .globl sqrx_mont_sparse_256 .hidden sqrx_mont_sparse_256 .type sqrx_mont_sparse_256,@function .align 32 sqrx_mont_sparse_256: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa sqr_mont_sparse_256$1: pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 pushq %rbx .cfi_adjust_cfa_offset 8 .cfi_offset %rbx,-24 pushq %r12 .cfi_adjust_cfa_offset 8 .cfi_offset %r12,-32 pushq %r13 .cfi_adjust_cfa_offset 8 .cfi_offset %r13,-40 pushq %r14 .cfi_adjust_cfa_offset 8 .cfi_offset %r14,-48 pushq %r15 .cfi_adjust_cfa_offset 8 .cfi_offset %r15,-56 subq $8,%rsp .cfi_adjust_cfa_offset 8 movq %rsi,%rbx movq %rcx,%r8 movq %rdx,%rcx #ifdef 
__SGX_LVI_HARDENING__ lfence #endif movq 0(%rsi),%rdx movq 8(%rsi),%r15 movq 16(%rsi),%rbp movq 24(%rsi),%r9 leaq -128(%rbx),%rsi leaq -128(%rcx),%rcx mulxq %rdx,%rax,%r11 call __mulx_mont_sparse_256 movq 8(%rsp),%r15 .cfi_restore %r15 movq 16(%rsp),%r14 .cfi_restore %r14 movq 24(%rsp),%r13 .cfi_restore %r13 movq 32(%rsp),%r12 .cfi_restore %r12 movq 40(%rsp),%rbx .cfi_restore %rbx movq 48(%rsp),%rbp .cfi_restore %rbp leaq 56(%rsp),%rsp .cfi_adjust_cfa_offset -56 #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .size sqrx_mont_sparse_256,.-sqrx_mont_sparse_256 .type __mulx_mont_sparse_256,@function .align 32 __mulx_mont_sparse_256: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa mulxq %r15,%r15,%r12 mulxq %rbp,%rbp,%r13 addq %r15,%r11 mulxq %r9,%r9,%r14 movq 8(%rbx),%rdx adcq %rbp,%r12 adcq %r9,%r13 adcq $0,%r14 movq %rax,%r10 imulq %r8,%rax xorq %r15,%r15 mulxq 0+128(%rsi),%rbp,%r9 adoxq %rbp,%r11 adcxq %r9,%r12 mulxq 8+128(%rsi),%rbp,%r9 adoxq %rbp,%r12 adcxq %r9,%r13 mulxq 16+128(%rsi),%rbp,%r9 adoxq %rbp,%r13 adcxq %r9,%r14 mulxq 24+128(%rsi),%rbp,%r9 movq %rax,%rdx adoxq %rbp,%r14 adcxq %r15,%r9 adoxq %r9,%r15 mulxq 0+128(%rcx),%rbp,%rax adcxq %rbp,%r10 adoxq %r11,%rax mulxq 8+128(%rcx),%rbp,%r9 adcxq %rbp,%rax adoxq %r9,%r12 mulxq 16+128(%rcx),%rbp,%r9 adcxq %rbp,%r12 adoxq %r9,%r13 mulxq 24+128(%rcx),%rbp,%r9 movq 16(%rbx),%rdx adcxq %rbp,%r13 adoxq %r9,%r14 adcxq %r10,%r14 adoxq %r10,%r15 adcxq %r10,%r15 adoxq %r10,%r10 adcq $0,%r10 movq %rax,%r11 imulq %r8,%rax xorq %rbp,%rbp mulxq 0+128(%rsi),%rbp,%r9 adoxq %rbp,%r12 adcxq %r9,%r13 mulxq 8+128(%rsi),%rbp,%r9 adoxq %rbp,%r13 adcxq %r9,%r14 mulxq 16+128(%rsi),%rbp,%r9 adoxq %rbp,%r14 adcxq %r9,%r15 mulxq 24+128(%rsi),%rbp,%r9 movq %rax,%rdx adoxq %rbp,%r15 adcxq %r10,%r9 adoxq %r9,%r10 mulxq 0+128(%rcx),%rbp,%rax adcxq %rbp,%r11 adoxq %r12,%rax mulxq 8+128(%rcx),%rbp,%r9 adcxq %rbp,%rax adoxq %r9,%r13 mulxq 16+128(%rcx),%rbp,%r9 adcxq %rbp,%r13 adoxq %r9,%r14 mulxq 24+128(%rcx),%rbp,%r9 movq 24(%rbx),%rdx adcxq %rbp,%r14 adoxq %r9,%r15 adcxq %r11,%r15 adoxq %r11,%r10 adcxq %r11,%r10 adoxq %r11,%r11 adcq $0,%r11 movq %rax,%r12 imulq %r8,%rax xorq %rbp,%rbp mulxq 0+128(%rsi),%rbp,%r9 adoxq %rbp,%r13 adcxq %r9,%r14 mulxq 8+128(%rsi),%rbp,%r9 adoxq %rbp,%r14 adcxq %r9,%r15 mulxq 16+128(%rsi),%rbp,%r9 adoxq %rbp,%r15 adcxq %r9,%r10 mulxq 24+128(%rsi),%rbp,%r9 movq %rax,%rdx adoxq %rbp,%r10 adcxq %r11,%r9 adoxq %r9,%r11 mulxq 0+128(%rcx),%rbp,%rax adcxq %rbp,%r12 adoxq %r13,%rax mulxq 8+128(%rcx),%rbp,%r9 adcxq %rbp,%rax adoxq %r9,%r14 mulxq 16+128(%rcx),%rbp,%r9 adcxq %rbp,%r14 adoxq %r9,%r15 mulxq 24+128(%rcx),%rbp,%r9 movq %rax,%rdx adcxq %rbp,%r15 adoxq %r9,%r10 adcxq %r12,%r10 adoxq %r12,%r11 adcxq %r12,%r11 adoxq %r12,%r12 adcq $0,%r12 imulq %r8,%rdx xorq %rbp,%rbp mulxq 0+128(%rcx),%r13,%r9 adcxq %rax,%r13 adoxq %r9,%r14 mulxq 8+128(%rcx),%rbp,%r9 adcxq %rbp,%r14 adoxq %r9,%r15 mulxq 16+128(%rcx),%rbp,%r9 adcxq %rbp,%r15 adoxq %r9,%r10 mulxq 24+128(%rcx),%rbp,%r9 movq %r14,%rdx leaq 128(%rcx),%rcx adcxq %rbp,%r10 adoxq %r9,%r11 movq %r15,%rax adcxq %r13,%r11 adoxq %r13,%r12 adcq $0,%r12 movq %r10,%rbp subq 0(%rcx),%r14 sbbq 8(%rcx),%r15 sbbq 16(%rcx),%r10 movq %r11,%r9 sbbq 24(%rcx),%r11 sbbq $0,%r12 cmovcq %rdx,%r14 cmovcq %rax,%r15 cmovcq %rbp,%r10 movq %r14,0(%rdi) cmovcq %r9,%r11 movq %r15,8(%rdi) movq %r10,16(%rdi) movq %r11,24(%rdi) #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .size 
__mulx_mont_sparse_256,.-__mulx_mont_sparse_256 .globl fromx_mont_256 .hidden fromx_mont_256 .type fromx_mont_256,@function .align 32 fromx_mont_256: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa from_mont_256$1: pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 pushq %rbx .cfi_adjust_cfa_offset 8 .cfi_offset %rbx,-24 pushq %r12 .cfi_adjust_cfa_offset 8 .cfi_offset %r12,-32 pushq %r13 .cfi_adjust_cfa_offset 8 .cfi_offset %r13,-40 pushq %r14 .cfi_adjust_cfa_offset 8 .cfi_offset %r14,-48 pushq %r15 .cfi_adjust_cfa_offset 8 .cfi_offset %r15,-56 subq $8,%rsp .cfi_adjust_cfa_offset 8 movq %rdx,%rbx call __mulx_by_1_mont_256 movq %r15,%rdx movq %r10,%r12 movq %r11,%r13 subq 0(%rbx),%r14 sbbq 8(%rbx),%r15 sbbq 16(%rbx),%r10 sbbq 24(%rbx),%r11 cmovncq %r14,%rax cmovncq %r15,%rdx cmovncq %r10,%r12 movq %rax,0(%rdi) cmovncq %r11,%r13 movq %rdx,8(%rdi) movq %r12,16(%rdi) movq %r13,24(%rdi) movq 8(%rsp),%r15 .cfi_restore %r15 movq 16(%rsp),%r14 .cfi_restore %r14 movq 24(%rsp),%r13 .cfi_restore %r13 movq 32(%rsp),%r12 .cfi_restore %r12 movq 40(%rsp),%rbx .cfi_restore %rbx movq 48(%rsp),%rbp .cfi_restore %rbp leaq 56(%rsp),%rsp .cfi_adjust_cfa_offset -56 #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .size fromx_mont_256,.-fromx_mont_256 .globl redcx_mont_256 .hidden redcx_mont_256 .type redcx_mont_256,@function .align 32 redcx_mont_256: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa redc_mont_256$1: pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 pushq %rbx .cfi_adjust_cfa_offset 8 .cfi_offset %rbx,-24 pushq %r12 .cfi_adjust_cfa_offset 8 .cfi_offset %r12,-32 pushq %r13 .cfi_adjust_cfa_offset 8 .cfi_offset %r13,-40 pushq %r14 .cfi_adjust_cfa_offset 8 .cfi_offset %r14,-48 pushq %r15 .cfi_adjust_cfa_offset 8 .cfi_offset %r15,-56 subq $8,%rsp .cfi_adjust_cfa_offset 8 movq %rdx,%rbx call __mulx_by_1_mont_256 addq 32(%rsi),%r14 adcq 40(%rsi),%r15 movq %r14,%rax adcq 48(%rsi),%r10 movq %r15,%rdx adcq 56(%rsi),%r11 sbbq %rsi,%rsi movq %r10,%r12 subq 0(%rbx),%r14 sbbq 8(%rbx),%r15 sbbq 16(%rbx),%r10 movq %r11,%r13 sbbq 24(%rbx),%r11 sbbq $0,%rsi cmovncq %r14,%rax cmovncq %r15,%rdx cmovncq %r10,%r12 movq %rax,0(%rdi) cmovncq %r11,%r13 movq %rdx,8(%rdi) movq %r12,16(%rdi) movq %r13,24(%rdi) movq 8(%rsp),%r15 .cfi_restore %r15 movq 16(%rsp),%r14 .cfi_restore %r14 movq 24(%rsp),%r13 .cfi_restore %r13 movq 32(%rsp),%r12 .cfi_restore %r12 movq 40(%rsp),%rbx .cfi_restore %rbx movq 48(%rsp),%rbp .cfi_restore %rbp leaq 56(%rsp),%rsp .cfi_adjust_cfa_offset -56 #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .size redcx_mont_256,.-redcx_mont_256 .type __mulx_by_1_mont_256,@function .align 32 __mulx_by_1_mont_256: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa #ifdef __SGX_LVI_HARDENING__ lfence #endif movq 0(%rsi),%rax movq 8(%rsi),%r11 movq 16(%rsi),%r12 movq 24(%rsi),%r13 movq %rax,%r14 imulq %rcx,%rax movq %rax,%r10 mulq 0(%rbx) addq %rax,%r14 movq %r10,%rax adcq %rdx,%r14 mulq 8(%rbx) addq %rax,%r11 movq %r10,%rax adcq $0,%rdx addq %r14,%r11 adcq $0,%rdx movq %rdx,%r14 mulq 16(%rbx) movq %r11,%r15 imulq %rcx,%r11 addq %rax,%r12 movq %r10,%rax adcq $0,%rdx addq %r14,%r12 adcq $0,%rdx movq %rdx,%r14 mulq 24(%rbx) addq %rax,%r13 movq %r11,%rax adcq $0,%rdx addq %r14,%r13 adcq $0,%rdx movq %rdx,%r14 mulq 0(%rbx) addq %rax,%r15 movq %r11,%rax adcq %rdx,%r15 mulq 8(%rbx) addq %rax,%r12 movq %r11,%rax adcq $0,%rdx addq %r15,%r12 adcq $0,%rdx movq %rdx,%r15 mulq 16(%rbx) movq %r12,%r10 imulq %rcx,%r12 addq %rax,%r13 
movq %r11,%rax adcq $0,%rdx addq %r15,%r13 adcq $0,%rdx movq %rdx,%r15 mulq 24(%rbx) addq %rax,%r14 movq %r12,%rax adcq $0,%rdx addq %r15,%r14 adcq $0,%rdx movq %rdx,%r15 mulq 0(%rbx) addq %rax,%r10 movq %r12,%rax adcq %rdx,%r10 mulq 8(%rbx) addq %rax,%r13 movq %r12,%rax adcq $0,%rdx addq %r10,%r13 adcq $0,%rdx movq %rdx,%r10 mulq 16(%rbx) movq %r13,%r11 imulq %rcx,%r13 addq %rax,%r14 movq %r12,%rax adcq $0,%rdx addq %r10,%r14 adcq $0,%rdx movq %rdx,%r10 mulq 24(%rbx) addq %rax,%r15 movq %r13,%rax adcq $0,%rdx addq %r10,%r15 adcq $0,%rdx movq %rdx,%r10 mulq 0(%rbx) addq %rax,%r11 movq %r13,%rax adcq %rdx,%r11 mulq 8(%rbx) addq %rax,%r14 movq %r13,%rax adcq $0,%rdx addq %r11,%r14 adcq $0,%rdx movq %rdx,%r11 mulq 16(%rbx) addq %rax,%r15 movq %r13,%rax adcq $0,%rdx addq %r11,%r15 adcq $0,%rdx movq %rdx,%r11 mulq 24(%rbx) addq %rax,%r10 movq %r14,%rax adcq $0,%rdx addq %r11,%r10 adcq $0,%rdx movq %rdx,%r11 #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .size __mulx_by_1_mont_256,.-__mulx_by_1_mont_256 .section .note.GNU-stack,"",@progbits #ifndef __SGX_LVI_HARDENING__ .section .note.gnu.property,"a",@note .long 4,2f-1f,5 .byte 0x47,0x4E,0x55,0 1: .long 0xc0000002,4,3 .align 8 2: #endif ================================================ FILE: build/elf/mulx_mont_384-x86_64.s ================================================ .text .type __subx_mod_384x384,@function .align 32 __subx_mod_384x384: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa movq 0(%rsi),%r8 movq 8(%rsi),%r9 movq 16(%rsi),%r10 movq 24(%rsi),%r11 movq 32(%rsi),%r12 movq 40(%rsi),%r13 movq 48(%rsi),%r14 subq 0(%rdx),%r8 movq 56(%rsi),%r15 sbbq 8(%rdx),%r9 movq 64(%rsi),%rax sbbq 16(%rdx),%r10 movq 72(%rsi),%rbx sbbq 24(%rdx),%r11 movq 80(%rsi),%rbp sbbq 32(%rdx),%r12 movq 88(%rsi),%rsi sbbq 40(%rdx),%r13 movq %r8,0(%rdi) sbbq 48(%rdx),%r14 movq 0(%rcx),%r8 movq %r9,8(%rdi) sbbq 56(%rdx),%r15 movq 8(%rcx),%r9 movq %r10,16(%rdi) sbbq 64(%rdx),%rax movq 16(%rcx),%r10 movq %r11,24(%rdi) sbbq 72(%rdx),%rbx movq 24(%rcx),%r11 movq %r12,32(%rdi) sbbq 80(%rdx),%rbp movq 32(%rcx),%r12 movq %r13,40(%rdi) sbbq 88(%rdx),%rsi movq 40(%rcx),%r13 sbbq %rdx,%rdx andq %rdx,%r8 andq %rdx,%r9 andq %rdx,%r10 andq %rdx,%r11 andq %rdx,%r12 andq %rdx,%r13 addq %r8,%r14 adcq %r9,%r15 movq %r14,48(%rdi) adcq %r10,%rax movq %r15,56(%rdi) adcq %r11,%rbx movq %rax,64(%rdi) adcq %r12,%rbp movq %rbx,72(%rdi) adcq %r13,%rsi movq %rbp,80(%rdi) movq %rsi,88(%rdi) #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .size __subx_mod_384x384,.-__subx_mod_384x384 .type __addx_mod_384,@function .align 32 __addx_mod_384: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa #ifdef __SGX_LVI_HARDENING__ lfence #endif movq 0(%rsi),%r8 movq 8(%rsi),%r9 movq 16(%rsi),%r10 movq 24(%rsi),%r11 movq 32(%rsi),%r12 movq 40(%rsi),%r13 addq 0(%rdx),%r8 adcq 8(%rdx),%r9 adcq 16(%rdx),%r10 movq %r8,%r14 adcq 24(%rdx),%r11 movq %r9,%r15 adcq 32(%rdx),%r12 movq %r10,%rax adcq 40(%rdx),%r13 movq %r11,%rbx sbbq %rdx,%rdx subq 0(%rcx),%r8 sbbq 8(%rcx),%r9 movq %r12,%rbp sbbq 16(%rcx),%r10 sbbq 24(%rcx),%r11 sbbq 32(%rcx),%r12 movq %r13,%rsi sbbq 40(%rcx),%r13 sbbq $0,%rdx cmovcq %r14,%r8 cmovcq %r15,%r9 cmovcq %rax,%r10 movq %r8,0(%rdi) cmovcq %rbx,%r11 movq %r9,8(%rdi) cmovcq %rbp,%r12 movq %r10,16(%rdi) cmovcq %rsi,%r13 movq %r11,24(%rdi) movq %r12,32(%rdi) movq %r13,40(%rdi) #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .size 
__addx_mod_384,.-__addx_mod_384 .type __subx_mod_384,@function .align 32 __subx_mod_384: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa #ifdef __SGX_LVI_HARDENING__ lfence #endif movq 0(%rsi),%r8 movq 8(%rsi),%r9 movq 16(%rsi),%r10 movq 24(%rsi),%r11 movq 32(%rsi),%r12 movq 40(%rsi),%r13 __subx_mod_384_a_is_loaded: subq 0(%rdx),%r8 movq 0(%rcx),%r14 sbbq 8(%rdx),%r9 movq 8(%rcx),%r15 sbbq 16(%rdx),%r10 movq 16(%rcx),%rax sbbq 24(%rdx),%r11 movq 24(%rcx),%rbx sbbq 32(%rdx),%r12 movq 32(%rcx),%rbp sbbq 40(%rdx),%r13 movq 40(%rcx),%rsi sbbq %rdx,%rdx andq %rdx,%r14 andq %rdx,%r15 andq %rdx,%rax andq %rdx,%rbx andq %rdx,%rbp andq %rdx,%rsi addq %r14,%r8 adcq %r15,%r9 movq %r8,0(%rdi) adcq %rax,%r10 movq %r9,8(%rdi) adcq %rbx,%r11 movq %r10,16(%rdi) adcq %rbp,%r12 movq %r11,24(%rdi) adcq %rsi,%r13 movq %r12,32(%rdi) movq %r13,40(%rdi) #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .size __subx_mod_384,.-__subx_mod_384 .globl mulx_mont_384x .hidden mulx_mont_384x .type mulx_mont_384x,@function .align 32 mulx_mont_384x: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa mul_mont_384x$1: pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 pushq %rbx .cfi_adjust_cfa_offset 8 .cfi_offset %rbx,-24 pushq %r12 .cfi_adjust_cfa_offset 8 .cfi_offset %r12,-32 pushq %r13 .cfi_adjust_cfa_offset 8 .cfi_offset %r13,-40 pushq %r14 .cfi_adjust_cfa_offset 8 .cfi_offset %r14,-48 pushq %r15 .cfi_adjust_cfa_offset 8 .cfi_offset %r15,-56 subq $328,%rsp .cfi_adjust_cfa_offset 328 movq %rdx,%rbx movq %rdi,32(%rsp) movq %rsi,24(%rsp) movq %rdx,16(%rsp) movq %rcx,8(%rsp) movq %r8,0(%rsp) leaq 40(%rsp),%rdi #ifdef __SGX_LVI_HARDENING__ lfence #endif call __mulx_384 leaq 48(%rbx),%rbx leaq 128+48(%rsi),%rsi leaq 96(%rdi),%rdi call __mulx_384 movq 8(%rsp),%rcx leaq (%rbx),%rsi leaq -48(%rbx),%rdx leaq 40+192+48(%rsp),%rdi #ifdef __SGX_LVI_HARDENING__ lfence #endif call __addx_mod_384 movq 24(%rsp),%rsi leaq 48(%rsi),%rdx leaq -48(%rdi),%rdi #ifdef __SGX_LVI_HARDENING__ lfence #endif call __addx_mod_384 leaq (%rdi),%rbx leaq 48(%rdi),%rsi call __mulx_384 leaq (%rdi),%rsi leaq 40(%rsp),%rdx movq 8(%rsp),%rcx #ifdef __SGX_LVI_HARDENING__ lfence #endif call __subx_mod_384x384 leaq (%rdi),%rsi leaq -96(%rdi),%rdx call __subx_mod_384x384 leaq 40(%rsp),%rsi leaq 40+96(%rsp),%rdx leaq 40(%rsp),%rdi call __subx_mod_384x384 leaq (%rcx),%rbx leaq 40(%rsp),%rsi movq 0(%rsp),%rcx movq 32(%rsp),%rdi call __mulx_by_1_mont_384 call __redx_tail_mont_384 leaq 40+192(%rsp),%rsi movq 0(%rsp),%rcx leaq 48(%rdi),%rdi call __mulx_by_1_mont_384 call __redx_tail_mont_384 leaq 328(%rsp),%r8 movq 0(%r8),%r15 .cfi_restore %r15 movq 8(%r8),%r14 .cfi_restore %r14 movq 16(%r8),%r13 .cfi_restore %r13 movq 24(%r8),%r12 .cfi_restore %r12 movq 32(%r8),%rbx .cfi_restore %rbx movq 40(%r8),%rbp .cfi_restore %rbp leaq 48(%r8),%rsp .cfi_adjust_cfa_offset -328-8*6 #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .size mulx_mont_384x,.-mulx_mont_384x .globl sqrx_mont_384x .hidden sqrx_mont_384x .type sqrx_mont_384x,@function .align 32 sqrx_mont_384x: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa sqr_mont_384x$1: pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 pushq %rbx .cfi_adjust_cfa_offset 8 .cfi_offset %rbx,-24 pushq %r12 .cfi_adjust_cfa_offset 8 .cfi_offset %r12,-32 pushq %r13 .cfi_adjust_cfa_offset 8 .cfi_offset %r13,-40 pushq %r14 .cfi_adjust_cfa_offset 8 .cfi_offset %r14,-48 pushq %r15 .cfi_adjust_cfa_offset 8 .cfi_offset %r15,-56 subq $136,%rsp 
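/* Frame layout for this Fp2 squaring: n0 at 0(%rsp), result and source
 * pointers at 16/24(%rsp), then two 48-byte scratch vectors at 32(%rsp)
 * holding a0+a1 and a0-a1. The square is computed as (a0+a1)*(a0-a1)
 * for the real half and 2*a0*a1 for the imaginary half, i.e. two
 * modular multiplications instead of three. */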
.cfi_adjust_cfa_offset 136 movq %rcx,0(%rsp) movq %rdx,%rcx movq %rdi,16(%rsp) movq %rsi,24(%rsp) leaq 48(%rsi),%rdx leaq 32(%rsp),%rdi call __addx_mod_384 movq 24(%rsp),%rsi leaq 48(%rsi),%rdx leaq 32+48(%rsp),%rdi call __subx_mod_384 movq 24(%rsp),%rsi leaq 48(%rsi),%rbx #ifdef __SGX_LVI_HARDENING__ lfence #endif movq 48(%rsi),%rdx movq 0(%rsi),%r14 movq 8(%rsi),%r15 movq 16(%rsi),%rax movq 24(%rsi),%r12 movq 32(%rsi),%rdi movq 40(%rsi),%rbp leaq -128(%rsi),%rsi leaq -128(%rcx),%rcx mulxq %r14,%r8,%r9 call __mulx_mont_384 addq %rdx,%rdx adcq %r15,%r15 adcq %rax,%rax movq %rdx,%r8 adcq %r12,%r12 movq %r15,%r9 adcq %rdi,%rdi movq %rax,%r10 adcq %rbp,%rbp movq %r12,%r11 sbbq %rsi,%rsi subq 0(%rcx),%rdx sbbq 8(%rcx),%r15 movq %rdi,%r13 sbbq 16(%rcx),%rax sbbq 24(%rcx),%r12 sbbq 32(%rcx),%rdi movq %rbp,%r14 sbbq 40(%rcx),%rbp sbbq $0,%rsi cmovcq %r8,%rdx cmovcq %r9,%r15 cmovcq %r10,%rax movq %rdx,48(%rbx) cmovcq %r11,%r12 movq %r15,56(%rbx) cmovcq %r13,%rdi movq %rax,64(%rbx) cmovcq %r14,%rbp movq %r12,72(%rbx) movq %rdi,80(%rbx) movq %rbp,88(%rbx) leaq 32(%rsp),%rsi leaq 32+48(%rsp),%rbx movq 32+48(%rsp),%rdx movq 32+0(%rsp),%r14 movq 32+8(%rsp),%r15 movq 32+16(%rsp),%rax movq 32+24(%rsp),%r12 movq 32+32(%rsp),%rdi movq 32+40(%rsp),%rbp leaq -128(%rsi),%rsi leaq -128(%rcx),%rcx mulxq %r14,%r8,%r9 call __mulx_mont_384 leaq 136(%rsp),%r8 movq 0(%r8),%r15 .cfi_restore %r15 movq 8(%r8),%r14 .cfi_restore %r14 movq 16(%r8),%r13 .cfi_restore %r13 movq 24(%r8),%r12 .cfi_restore %r12 movq 32(%r8),%rbx .cfi_restore %rbx movq 40(%r8),%rbp .cfi_restore %rbp leaq 48(%r8),%rsp .cfi_adjust_cfa_offset -136-8*6 #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .size sqrx_mont_384x,.-sqrx_mont_384x .globl mulx_382x .hidden mulx_382x .type mulx_382x,@function .align 32 mulx_382x: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa mul_382x$1: pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 pushq %rbx .cfi_adjust_cfa_offset 8 .cfi_offset %rbx,-24 pushq %r12 .cfi_adjust_cfa_offset 8 .cfi_offset %r12,-32 pushq %r13 .cfi_adjust_cfa_offset 8 .cfi_offset %r13,-40 pushq %r14 .cfi_adjust_cfa_offset 8 .cfi_offset %r14,-48 pushq %r15 .cfi_adjust_cfa_offset 8 .cfi_offset %r15,-56 subq $136,%rsp .cfi_adjust_cfa_offset 136 leaq 96(%rdi),%rdi movq %rsi,0(%rsp) movq %rdx,8(%rsp) movq %rdi,16(%rsp) movq %rcx,24(%rsp) #ifdef __SGX_LVI_HARDENING__ lfence #endif movq 0(%rsi),%r8 movq 8(%rsi),%r9 movq 16(%rsi),%r10 movq 24(%rsi),%r11 movq 32(%rsi),%r12 movq 40(%rsi),%r13 addq 48(%rsi),%r8 adcq 56(%rsi),%r9 adcq 64(%rsi),%r10 adcq 72(%rsi),%r11 adcq 80(%rsi),%r12 adcq 88(%rsi),%r13 movq %r8,32+0(%rsp) movq %r9,32+8(%rsp) movq %r10,32+16(%rsp) movq %r11,32+24(%rsp) movq %r12,32+32(%rsp) movq %r13,32+40(%rsp) movq 0(%rdx),%r8 movq 8(%rdx),%r9 movq 16(%rdx),%r10 movq 24(%rdx),%r11 movq 32(%rdx),%r12 movq 40(%rdx),%r13 addq 48(%rdx),%r8 adcq 56(%rdx),%r9 adcq 64(%rdx),%r10 adcq 72(%rdx),%r11 adcq 80(%rdx),%r12 adcq 88(%rdx),%r13 movq %r8,32+48(%rsp) movq %r9,32+56(%rsp) movq %r10,32+64(%rsp) movq %r11,32+72(%rsp) movq %r12,32+80(%rsp) movq %r13,32+88(%rsp) leaq 32+0(%rsp),%rsi leaq 32+48(%rsp),%rbx call __mulx_384 movq 0(%rsp),%rsi movq 8(%rsp),%rbx leaq -96(%rdi),%rdi #ifdef __SGX_LVI_HARDENING__ lfence #endif call __mulx_384 leaq 48+128(%rsi),%rsi leaq 48(%rbx),%rbx leaq 32(%rsp),%rdi call __mulx_384 movq 16(%rsp),%rsi leaq 32(%rsp),%rdx movq 24(%rsp),%rcx movq %rsi,%rdi #ifdef __SGX_LVI_HARDENING__ lfence #endif call __subx_mod_384x384 leaq 0(%rdi),%rsi leaq -96(%rdi),%rdx call 
__subx_mod_384x384 leaq -96(%rdi),%rsi leaq 32(%rsp),%rdx leaq -96(%rdi),%rdi call __subx_mod_384x384 leaq 136(%rsp),%r8 movq 0(%r8),%r15 .cfi_restore %r15 movq 8(%r8),%r14 .cfi_restore %r14 movq 16(%r8),%r13 .cfi_restore %r13 movq 24(%r8),%r12 .cfi_restore %r12 movq 32(%r8),%rbx .cfi_restore %rbx movq 40(%r8),%rbp .cfi_restore %rbp leaq 48(%r8),%rsp .cfi_adjust_cfa_offset -136-8*6 #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .size mulx_382x,.-mulx_382x .globl sqrx_382x .hidden sqrx_382x .type sqrx_382x,@function .align 32 sqrx_382x: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa sqr_382x$1: pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 pushq %rbx .cfi_adjust_cfa_offset 8 .cfi_offset %rbx,-24 pushq %r12 .cfi_adjust_cfa_offset 8 .cfi_offset %r12,-32 pushq %r13 .cfi_adjust_cfa_offset 8 .cfi_offset %r13,-40 pushq %r14 .cfi_adjust_cfa_offset 8 .cfi_offset %r14,-48 pushq %r15 .cfi_adjust_cfa_offset 8 .cfi_offset %r15,-56 pushq %rsi .cfi_adjust_cfa_offset 8 movq %rdx,%rcx #ifdef __SGX_LVI_HARDENING__ lfence #endif movq 0(%rsi),%r14 movq 8(%rsi),%r15 movq 16(%rsi),%rax movq 24(%rsi),%rbx movq 32(%rsi),%rbp movq 40(%rsi),%rdx movq %r14,%r8 addq 48(%rsi),%r14 movq %r15,%r9 adcq 56(%rsi),%r15 movq %rax,%r10 adcq 64(%rsi),%rax movq %rbx,%r11 adcq 72(%rsi),%rbx movq %rbp,%r12 adcq 80(%rsi),%rbp movq %rdx,%r13 adcq 88(%rsi),%rdx movq %r14,0(%rdi) movq %r15,8(%rdi) movq %rax,16(%rdi) movq %rbx,24(%rdi) movq %rbp,32(%rdi) movq %rdx,40(%rdi) leaq 48(%rsi),%rdx leaq 48(%rdi),%rdi call __subx_mod_384_a_is_loaded leaq (%rdi),%rsi leaq -48(%rdi),%rbx leaq -48(%rdi),%rdi call __mulx_384 movq (%rsp),%rsi leaq 48(%rsi),%rbx leaq 96(%rdi),%rdi #ifdef __SGX_LVI_HARDENING__ lfence #endif call __mulx_384 movq 0(%rdi),%r8 movq 8(%rdi),%r9 movq 16(%rdi),%r10 movq 24(%rdi),%r11 movq 32(%rdi),%r12 movq 40(%rdi),%r13 movq 48(%rdi),%r14 movq 56(%rdi),%r15 movq 64(%rdi),%rax movq 72(%rdi),%rbx movq 80(%rdi),%rbp addq %r8,%r8 movq 88(%rdi),%rdx adcq %r9,%r9 movq %r8,0(%rdi) adcq %r10,%r10 movq %r9,8(%rdi) adcq %r11,%r11 movq %r10,16(%rdi) adcq %r12,%r12 movq %r11,24(%rdi) adcq %r13,%r13 movq %r12,32(%rdi) adcq %r14,%r14 movq %r13,40(%rdi) adcq %r15,%r15 movq %r14,48(%rdi) adcq %rax,%rax movq %r15,56(%rdi) adcq %rbx,%rbx movq %rax,64(%rdi) adcq %rbp,%rbp movq %rbx,72(%rdi) adcq %rdx,%rdx movq %rbp,80(%rdi) movq %rdx,88(%rdi) movq 8(%rsp),%r15 .cfi_restore %r15 movq 16(%rsp),%r14 .cfi_restore %r14 movq 24(%rsp),%r13 .cfi_restore %r13 movq 32(%rsp),%r12 .cfi_restore %r12 movq 40(%rsp),%rbx .cfi_restore %rbx movq 48(%rsp),%rbp .cfi_restore %rbp leaq 56(%rsp),%rsp .cfi_adjust_cfa_offset -8*7 #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .size sqrx_382x,.-sqrx_382x .globl mulx_384 .hidden mulx_384 .type mulx_384,@function .align 32 mulx_384: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa mul_384$1: pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 pushq %rbx .cfi_adjust_cfa_offset 8 .cfi_offset %rbx,-24 pushq %r12 .cfi_adjust_cfa_offset 8 .cfi_offset %r12,-32 pushq %r13 .cfi_adjust_cfa_offset 8 .cfi_offset %r13,-40 pushq %r14 .cfi_adjust_cfa_offset 8 .cfi_offset %r14,-48 pushq %r15 .cfi_adjust_cfa_offset 8 .cfi_offset %r15,-56 movq %rdx,%rbx #ifdef __SGX_LVI_HARDENING__ lfence #endif call __mulx_384 movq 0(%rsp),%r15 .cfi_restore %r15 movq 8(%rsp),%r14 .cfi_restore %r14 movq 16(%rsp),%r13 .cfi_restore %r13 movq 24(%rsp),%r12 .cfi_restore %r12 movq 32(%rsp),%rbx .cfi_restore %rbx movq 40(%rsp),%rbp .cfi_restore 
%rbp leaq 48(%rsp),%rsp .cfi_adjust_cfa_offset -48 #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .size mulx_384,.-mulx_384 .type __mulx_384,@function .align 32 __mulx_384: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa movq 0(%rbx),%rdx movq 0(%rsi),%r14 movq 8(%rsi),%r15 movq 16(%rsi),%r10 movq 24(%rsi),%r11 movq 32(%rsi),%r12 movq 40(%rsi),%r13 leaq -128(%rsi),%rsi mulxq %r14,%r9,%rcx xorq %rbp,%rbp mulxq %r15,%r8,%rax adcxq %rcx,%r8 movq %r9,0(%rdi) mulxq %r10,%r9,%rcx adcxq %rax,%r9 mulxq %r11,%r10,%rax adcxq %rcx,%r10 mulxq %r12,%r11,%rcx adcxq %rax,%r11 mulxq %r13,%r12,%r13 movq 8(%rbx),%rdx adcxq %rcx,%r12 adcxq %rbp,%r13 mulxq %r14,%rax,%rcx adcxq %r8,%rax adoxq %rcx,%r9 movq %rax,8(%rdi) mulxq %r15,%r8,%rcx adcxq %r9,%r8 adoxq %rcx,%r10 mulxq 128+16(%rsi),%r9,%rax adcxq %r10,%r9 adoxq %rax,%r11 mulxq 128+24(%rsi),%r10,%rcx adcxq %r11,%r10 adoxq %rcx,%r12 mulxq 128+32(%rsi),%r11,%rax adcxq %r12,%r11 adoxq %r13,%rax mulxq 128+40(%rsi),%r12,%r13 movq 16(%rbx),%rdx adcxq %rax,%r12 adoxq %rbp,%r13 adcxq %rbp,%r13 mulxq %r14,%rax,%rcx adcxq %r8,%rax adoxq %rcx,%r9 movq %rax,16(%rdi) mulxq %r15,%r8,%rcx adcxq %r9,%r8 adoxq %rcx,%r10 mulxq 128+16(%rsi),%r9,%rax adcxq %r10,%r9 adoxq %rax,%r11 mulxq 128+24(%rsi),%r10,%rcx adcxq %r11,%r10 adoxq %rcx,%r12 mulxq 128+32(%rsi),%r11,%rax adcxq %r12,%r11 adoxq %r13,%rax mulxq 128+40(%rsi),%r12,%r13 movq 24(%rbx),%rdx adcxq %rax,%r12 adoxq %rbp,%r13 adcxq %rbp,%r13 mulxq %r14,%rax,%rcx adcxq %r8,%rax adoxq %rcx,%r9 movq %rax,24(%rdi) mulxq %r15,%r8,%rcx adcxq %r9,%r8 adoxq %rcx,%r10 mulxq 128+16(%rsi),%r9,%rax adcxq %r10,%r9 adoxq %rax,%r11 mulxq 128+24(%rsi),%r10,%rcx adcxq %r11,%r10 adoxq %rcx,%r12 mulxq 128+32(%rsi),%r11,%rax adcxq %r12,%r11 adoxq %r13,%rax mulxq 128+40(%rsi),%r12,%r13 movq 32(%rbx),%rdx adcxq %rax,%r12 adoxq %rbp,%r13 adcxq %rbp,%r13 mulxq %r14,%rax,%rcx adcxq %r8,%rax adoxq %rcx,%r9 movq %rax,32(%rdi) mulxq %r15,%r8,%rcx adcxq %r9,%r8 adoxq %rcx,%r10 mulxq 128+16(%rsi),%r9,%rax adcxq %r10,%r9 adoxq %rax,%r11 mulxq 128+24(%rsi),%r10,%rcx adcxq %r11,%r10 adoxq %rcx,%r12 mulxq 128+32(%rsi),%r11,%rax adcxq %r12,%r11 adoxq %r13,%rax mulxq 128+40(%rsi),%r12,%r13 movq 40(%rbx),%rdx adcxq %rax,%r12 adoxq %rbp,%r13 adcxq %rbp,%r13 mulxq %r14,%rax,%rcx adcxq %r8,%rax adoxq %rcx,%r9 movq %rax,40(%rdi) mulxq %r15,%r8,%rcx adcxq %r9,%r8 adoxq %rcx,%r10 mulxq 128+16(%rsi),%r9,%rax adcxq %r10,%r9 adoxq %rax,%r11 mulxq 128+24(%rsi),%r10,%rcx adcxq %r11,%r10 adoxq %rcx,%r12 mulxq 128+32(%rsi),%r11,%rax adcxq %r12,%r11 adoxq %r13,%rax mulxq 128+40(%rsi),%r12,%r13 movq %rax,%rdx adcxq %rax,%r12 adoxq %rbp,%r13 adcxq %rbp,%r13 movq %r8,48(%rdi) movq %r9,56(%rdi) movq %r10,64(%rdi) movq %r11,72(%rdi) movq %r12,80(%rdi) movq %r13,88(%rdi) #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .size __mulx_384,.-__mulx_384 .globl sqrx_384 .hidden sqrx_384 .type sqrx_384,@function .align 32 sqrx_384: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa sqr_384$1: pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 pushq %rbx .cfi_adjust_cfa_offset 8 .cfi_offset %rbx,-24 pushq %r12 .cfi_adjust_cfa_offset 8 .cfi_offset %r12,-32 pushq %r13 .cfi_adjust_cfa_offset 8 .cfi_offset %r13,-40 pushq %r14 .cfi_adjust_cfa_offset 8 .cfi_offset %r14,-48 pushq %r15 .cfi_adjust_cfa_offset 8 .cfi_offset %r15,-56 pushq %rdi .cfi_adjust_cfa_offset 8 #ifdef __SGX_LVI_HARDENING__ lfence #endif call __sqrx_384 movq 8(%rsp),%r15 .cfi_restore %r15 movq 16(%rsp),%r14 .cfi_restore %r14 
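/* The recurring byte sequences in these files are deliberate encodings:
 * 0xf3,0x0f,0x1e,0xfa at every entry point is endbr64, and 0xf3,0xc3 is
 * the two-byte rep-ret. Under __SGX_LVI_HARDENING__ each return is
 * instead expanded to pop/lfence/indirect-jmp (with a trailing ud2);
 * the fence blocks load-value injection through the return address. */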
movq 24(%rsp),%r13 .cfi_restore %r13 movq 32(%rsp),%r12 .cfi_restore %r12 movq 40(%rsp),%rbx .cfi_restore %rbx movq 48(%rsp),%rbp .cfi_restore %rbp leaq 56(%rsp),%rsp .cfi_adjust_cfa_offset -56 #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .size sqrx_384,.-sqrx_384 .type __sqrx_384,@function .align 32 __sqrx_384: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa movq 0(%rsi),%rdx movq 8(%rsi),%r14 movq 16(%rsi),%r15 movq 24(%rsi),%rcx movq 32(%rsi),%rbx mulxq %r14,%r8,%rdi movq 40(%rsi),%rbp mulxq %r15,%r9,%rax addq %rdi,%r9 mulxq %rcx,%r10,%rdi adcq %rax,%r10 mulxq %rbx,%r11,%rax adcq %rdi,%r11 mulxq %rbp,%r12,%r13 movq %r14,%rdx adcq %rax,%r12 adcq $0,%r13 xorq %r14,%r14 mulxq %r15,%rdi,%rax adcxq %rdi,%r10 adoxq %rax,%r11 mulxq %rcx,%rdi,%rax adcxq %rdi,%r11 adoxq %rax,%r12 mulxq %rbx,%rdi,%rax adcxq %rdi,%r12 adoxq %rax,%r13 mulxq %rbp,%rdi,%rax movq %r15,%rdx adcxq %rdi,%r13 adoxq %r14,%rax adcxq %rax,%r14 xorq %r15,%r15 mulxq %rcx,%rdi,%rax adcxq %rdi,%r12 adoxq %rax,%r13 mulxq %rbx,%rdi,%rax adcxq %rdi,%r13 adoxq %rax,%r14 mulxq %rbp,%rdi,%rax movq %rcx,%rdx adcxq %rdi,%r14 adoxq %r15,%rax adcxq %rax,%r15 xorq %rcx,%rcx mulxq %rbx,%rdi,%rax adcxq %rdi,%r14 adoxq %rax,%r15 mulxq %rbp,%rdi,%rax movq %rbx,%rdx adcxq %rdi,%r15 adoxq %rcx,%rax adcxq %rax,%rcx mulxq %rbp,%rdi,%rbx movq 0(%rsi),%rdx addq %rdi,%rcx movq 8(%rsp),%rdi adcq $0,%rbx xorq %rbp,%rbp adcxq %r8,%r8 adcxq %r9,%r9 adcxq %r10,%r10 adcxq %r11,%r11 adcxq %r12,%r12 mulxq %rdx,%rdx,%rax movq %rdx,0(%rdi) movq 8(%rsi),%rdx adoxq %rax,%r8 movq %r8,8(%rdi) mulxq %rdx,%r8,%rax movq 16(%rsi),%rdx adoxq %r8,%r9 adoxq %rax,%r10 movq %r9,16(%rdi) movq %r10,24(%rdi) mulxq %rdx,%r8,%r9 movq 24(%rsi),%rdx adoxq %r8,%r11 adoxq %r9,%r12 adcxq %r13,%r13 adcxq %r14,%r14 movq %r11,32(%rdi) movq %r12,40(%rdi) mulxq %rdx,%r8,%r9 movq 32(%rsi),%rdx adoxq %r8,%r13 adoxq %r9,%r14 adcxq %r15,%r15 adcxq %rcx,%rcx movq %r13,48(%rdi) movq %r14,56(%rdi) mulxq %rdx,%r8,%r9 movq 40(%rsi),%rdx adoxq %r8,%r15 adoxq %r9,%rcx adcxq %rbx,%rbx adcxq %rbp,%rbp movq %r15,64(%rdi) movq %rcx,72(%rdi) mulxq %rdx,%r8,%r9 adoxq %r8,%rbx adoxq %r9,%rbp movq %rbx,80(%rdi) movq %rbp,88(%rdi) #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .size __sqrx_384,.-__sqrx_384 .globl redcx_mont_384 .hidden redcx_mont_384 .type redcx_mont_384,@function .align 32 redcx_mont_384: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa redc_mont_384$1: pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 pushq %rbx .cfi_adjust_cfa_offset 8 .cfi_offset %rbx,-24 pushq %r12 .cfi_adjust_cfa_offset 8 .cfi_offset %r12,-32 pushq %r13 .cfi_adjust_cfa_offset 8 .cfi_offset %r13,-40 pushq %r14 .cfi_adjust_cfa_offset 8 .cfi_offset %r14,-48 pushq %r15 .cfi_adjust_cfa_offset 8 .cfi_offset %r15,-56 subq $8,%rsp .cfi_adjust_cfa_offset 8 movq %rdx,%rbx #ifdef __SGX_LVI_HARDENING__ lfence #endif call __mulx_by_1_mont_384 call __redx_tail_mont_384 movq 8(%rsp),%r15 .cfi_restore %r15 movq 16(%rsp),%r14 .cfi_restore %r14 movq 24(%rsp),%r13 .cfi_restore %r13 movq 32(%rsp),%r12 .cfi_restore %r12 movq 40(%rsp),%rbx .cfi_restore %rbx movq 48(%rsp),%rbp .cfi_restore %rbp leaq 56(%rsp),%rsp .cfi_adjust_cfa_offset -56 #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .size redcx_mont_384,.-redcx_mont_384 .globl fromx_mont_384 .hidden fromx_mont_384 .type fromx_mont_384,@function .align 32 fromx_mont_384: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa from_mont_384$1: 
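/* fromx_mont_384(ret, a, p, n0): leave Montgomery form, i.e. compute
 * a * R^-1 mod p with R = 2^384. One pass of __mulx_by_1_mont_384 below
 * performs the six reduction rounds; the cmovc ladder that follows does
 * the final subtraction of p branchlessly. The from_mont_384$1 label is
 * the entry the run-time dispatcher in the portable mulq build jumps to
 * once ADX/BMI2 support is detected. */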
pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 pushq %rbx .cfi_adjust_cfa_offset 8 .cfi_offset %rbx,-24 pushq %r12 .cfi_adjust_cfa_offset 8 .cfi_offset %r12,-32 pushq %r13 .cfi_adjust_cfa_offset 8 .cfi_offset %r13,-40 pushq %r14 .cfi_adjust_cfa_offset 8 .cfi_offset %r14,-48 pushq %r15 .cfi_adjust_cfa_offset 8 .cfi_offset %r15,-56 subq $8,%rsp .cfi_adjust_cfa_offset 8 movq %rdx,%rbx #ifdef __SGX_LVI_HARDENING__ lfence #endif call __mulx_by_1_mont_384 movq %r14,%rax movq %r15,%rcx movq %r8,%rdx movq %r9,%rbp subq 0(%rbx),%r14 sbbq 8(%rbx),%r15 movq %r10,%r13 sbbq 16(%rbx),%r8 sbbq 24(%rbx),%r9 sbbq 32(%rbx),%r10 movq %r11,%rsi sbbq 40(%rbx),%r11 cmovcq %rax,%r14 cmovcq %rcx,%r15 cmovcq %rdx,%r8 movq %r14,0(%rdi) cmovcq %rbp,%r9 movq %r15,8(%rdi) cmovcq %r13,%r10 movq %r8,16(%rdi) cmovcq %rsi,%r11 movq %r9,24(%rdi) movq %r10,32(%rdi) movq %r11,40(%rdi) movq 8(%rsp),%r15 .cfi_restore %r15 movq 16(%rsp),%r14 .cfi_restore %r14 movq 24(%rsp),%r13 .cfi_restore %r13 movq 32(%rsp),%r12 .cfi_restore %r12 movq 40(%rsp),%rbx .cfi_restore %rbx movq 48(%rsp),%rbp .cfi_restore %rbp leaq 56(%rsp),%rsp .cfi_adjust_cfa_offset -56 #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .size fromx_mont_384,.-fromx_mont_384 .type __mulx_by_1_mont_384,@function .align 32 __mulx_by_1_mont_384: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa movq 0(%rsi),%r8 movq %rcx,%rdx movq 8(%rsi),%r9 movq 16(%rsi),%r10 movq 24(%rsi),%r11 movq 32(%rsi),%r12 movq 40(%rsi),%r13 imulq %r8,%rdx xorq %r14,%r14 mulxq 0(%rbx),%rax,%rbp adcxq %rax,%r8 adoxq %rbp,%r9 mulxq 8(%rbx),%rax,%rbp adcxq %rax,%r9 adoxq %rbp,%r10 mulxq 16(%rbx),%rax,%rbp adcxq %rax,%r10 adoxq %rbp,%r11 mulxq 24(%rbx),%rax,%rbp adcxq %rax,%r11 adoxq %rbp,%r12 mulxq 32(%rbx),%rax,%rbp adcxq %rax,%r12 adoxq %rbp,%r13 mulxq 40(%rbx),%rax,%rbp movq %rcx,%rdx adcxq %rax,%r13 adoxq %r14,%rbp adcxq %rbp,%r14 imulq %r9,%rdx xorq %r15,%r15 mulxq 0(%rbx),%rax,%rbp adcxq %rax,%r9 adoxq %rbp,%r10 mulxq 8(%rbx),%rax,%rbp adcxq %rax,%r10 adoxq %rbp,%r11 mulxq 16(%rbx),%rax,%rbp adcxq %rax,%r11 adoxq %rbp,%r12 mulxq 24(%rbx),%rax,%rbp adcxq %rax,%r12 adoxq %rbp,%r13 mulxq 32(%rbx),%rax,%rbp adcxq %rax,%r13 adoxq %rbp,%r14 mulxq 40(%rbx),%rax,%rbp movq %rcx,%rdx adcxq %rax,%r14 adoxq %r15,%rbp adcxq %rbp,%r15 imulq %r10,%rdx xorq %r8,%r8 mulxq 0(%rbx),%rax,%rbp adcxq %rax,%r10 adoxq %rbp,%r11 mulxq 8(%rbx),%rax,%rbp adcxq %rax,%r11 adoxq %rbp,%r12 mulxq 16(%rbx),%rax,%rbp adcxq %rax,%r12 adoxq %rbp,%r13 mulxq 24(%rbx),%rax,%rbp adcxq %rax,%r13 adoxq %rbp,%r14 mulxq 32(%rbx),%rax,%rbp adcxq %rax,%r14 adoxq %rbp,%r15 mulxq 40(%rbx),%rax,%rbp movq %rcx,%rdx adcxq %rax,%r15 adoxq %r8,%rbp adcxq %rbp,%r8 imulq %r11,%rdx xorq %r9,%r9 mulxq 0(%rbx),%rax,%rbp adcxq %rax,%r11 adoxq %rbp,%r12 mulxq 8(%rbx),%rax,%rbp adcxq %rax,%r12 adoxq %rbp,%r13 mulxq 16(%rbx),%rax,%rbp adcxq %rax,%r13 adoxq %rbp,%r14 mulxq 24(%rbx),%rax,%rbp adcxq %rax,%r14 adoxq %rbp,%r15 mulxq 32(%rbx),%rax,%rbp adcxq %rax,%r15 adoxq %rbp,%r8 mulxq 40(%rbx),%rax,%rbp movq %rcx,%rdx adcxq %rax,%r8 adoxq %r9,%rbp adcxq %rbp,%r9 imulq %r12,%rdx xorq %r10,%r10 mulxq 0(%rbx),%rax,%rbp adcxq %rax,%r12 adoxq %rbp,%r13 mulxq 8(%rbx),%rax,%rbp adcxq %rax,%r13 adoxq %rbp,%r14 mulxq 16(%rbx),%rax,%rbp adcxq %rax,%r14 adoxq %rbp,%r15 mulxq 24(%rbx),%rax,%rbp adcxq %rax,%r15 adoxq %rbp,%r8 mulxq 32(%rbx),%rax,%rbp adcxq %rax,%r8 adoxq %rbp,%r9 mulxq 40(%rbx),%rax,%rbp movq %rcx,%rdx adcxq %rax,%r9 adoxq %r10,%rbp adcxq %rbp,%r10 imulq %r13,%rdx xorq %r11,%r11 mulxq 
0(%rbx),%rax,%rbp adcxq %rax,%r13 adoxq %rbp,%r14 mulxq 8(%rbx),%rax,%rbp adcxq %rax,%r14 adoxq %rbp,%r15 mulxq 16(%rbx),%rax,%rbp adcxq %rax,%r15 adoxq %rbp,%r8 mulxq 24(%rbx),%rax,%rbp adcxq %rax,%r8 adoxq %rbp,%r9 mulxq 32(%rbx),%rax,%rbp adcxq %rax,%r9 adoxq %rbp,%r10 mulxq 40(%rbx),%rax,%rbp movq %rcx,%rdx adcxq %rax,%r10 adoxq %r11,%rbp adcxq %rbp,%r11 #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .size __mulx_by_1_mont_384,.-__mulx_by_1_mont_384 .type __redx_tail_mont_384,@function .align 32 __redx_tail_mont_384: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa addq 48(%rsi),%r14 movq %r14,%rax adcq 56(%rsi),%r15 adcq 64(%rsi),%r8 adcq 72(%rsi),%r9 movq %r15,%rcx adcq 80(%rsi),%r10 adcq 88(%rsi),%r11 sbbq %r12,%r12 movq %r8,%rdx movq %r9,%rbp subq 0(%rbx),%r14 sbbq 8(%rbx),%r15 movq %r10,%r13 sbbq 16(%rbx),%r8 sbbq 24(%rbx),%r9 sbbq 32(%rbx),%r10 movq %r11,%rsi sbbq 40(%rbx),%r11 sbbq $0,%r12 cmovcq %rax,%r14 cmovcq %rcx,%r15 cmovcq %rdx,%r8 movq %r14,0(%rdi) cmovcq %rbp,%r9 movq %r15,8(%rdi) cmovcq %r13,%r10 movq %r8,16(%rdi) cmovcq %rsi,%r11 movq %r9,24(%rdi) movq %r10,32(%rdi) movq %r11,40(%rdi) #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .size __redx_tail_mont_384,.-__redx_tail_mont_384 .globl sgn0x_pty_mont_384 .hidden sgn0x_pty_mont_384 .type sgn0x_pty_mont_384,@function .align 32 sgn0x_pty_mont_384: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa sgn0_pty_mont_384$1: pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 pushq %rbx .cfi_adjust_cfa_offset 8 .cfi_offset %rbx,-24 pushq %r12 .cfi_adjust_cfa_offset 8 .cfi_offset %r12,-32 pushq %r13 .cfi_adjust_cfa_offset 8 .cfi_offset %r13,-40 pushq %r14 .cfi_adjust_cfa_offset 8 .cfi_offset %r14,-48 pushq %r15 .cfi_adjust_cfa_offset 8 .cfi_offset %r15,-56 subq $8,%rsp .cfi_adjust_cfa_offset 8 movq %rsi,%rbx leaq 0(%rdi),%rsi movq %rdx,%rcx #ifdef __SGX_LVI_HARDENING__ lfence #endif call __mulx_by_1_mont_384 xorq %rax,%rax movq %r14,%r13 addq %r14,%r14 adcq %r15,%r15 adcq %r8,%r8 adcq %r9,%r9 adcq %r10,%r10 adcq %r11,%r11 adcq $0,%rax subq 0(%rbx),%r14 sbbq 8(%rbx),%r15 sbbq 16(%rbx),%r8 sbbq 24(%rbx),%r9 sbbq 32(%rbx),%r10 sbbq 40(%rbx),%r11 sbbq $0,%rax notq %rax andq $1,%r13 andq $2,%rax orq %r13,%rax movq 8(%rsp),%r15 .cfi_restore %r15 movq 16(%rsp),%r14 .cfi_restore %r14 movq 24(%rsp),%r13 .cfi_restore %r13 movq 32(%rsp),%r12 .cfi_restore %r12 movq 40(%rsp),%rbx .cfi_restore %rbx movq 48(%rsp),%rbp .cfi_restore %rbp leaq 56(%rsp),%rsp .cfi_adjust_cfa_offset -56 #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .size sgn0x_pty_mont_384,.-sgn0x_pty_mont_384 .globl sgn0x_pty_mont_384x .hidden sgn0x_pty_mont_384x .type sgn0x_pty_mont_384x,@function .align 32 sgn0x_pty_mont_384x: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa sgn0_pty_mont_384x$1: pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 pushq %rbx .cfi_adjust_cfa_offset 8 .cfi_offset %rbx,-24 pushq %r12 .cfi_adjust_cfa_offset 8 .cfi_offset %r12,-32 pushq %r13 .cfi_adjust_cfa_offset 8 .cfi_offset %r13,-40 pushq %r14 .cfi_adjust_cfa_offset 8 .cfi_offset %r14,-48 pushq %r15 .cfi_adjust_cfa_offset 8 .cfi_offset %r15,-56 subq $8,%rsp .cfi_adjust_cfa_offset 8 movq %rsi,%rbx leaq 48(%rdi),%rsi movq %rdx,%rcx #ifdef __SGX_LVI_HARDENING__ lfence #endif call __mulx_by_1_mont_384 movq %r14,%r12 orq %r15,%r14 orq %r8,%r14 orq %r9,%r14 orq %r10,%r14 orq %r11,%r14 leaq 0(%rdi),%rsi xorq %rdi,%rdi movq %r12,%r13 addq %r12,%r12 adcq 
%r15,%r15 adcq %r8,%r8 adcq %r9,%r9 adcq %r10,%r10 adcq %r11,%r11 adcq $0,%rdi subq 0(%rbx),%r12 sbbq 8(%rbx),%r15 sbbq 16(%rbx),%r8 sbbq 24(%rbx),%r9 sbbq 32(%rbx),%r10 sbbq 40(%rbx),%r11 sbbq $0,%rdi movq %r14,0(%rsp) notq %rdi andq $1,%r13 andq $2,%rdi orq %r13,%rdi call __mulx_by_1_mont_384 movq %r14,%r12 orq %r15,%r14 orq %r8,%r14 orq %r9,%r14 orq %r10,%r14 orq %r11,%r14 xorq %rax,%rax movq %r12,%r13 addq %r12,%r12 adcq %r15,%r15 adcq %r8,%r8 adcq %r9,%r9 adcq %r10,%r10 adcq %r11,%r11 adcq $0,%rax subq 0(%rbx),%r12 sbbq 8(%rbx),%r15 sbbq 16(%rbx),%r8 sbbq 24(%rbx),%r9 sbbq 32(%rbx),%r10 sbbq 40(%rbx),%r11 sbbq $0,%rax movq 0(%rsp),%r12 notq %rax testq %r14,%r14 cmovzq %rdi,%r13 testq %r12,%r12 cmovnzq %rdi,%rax andq $1,%r13 andq $2,%rax orq %r13,%rax movq 8(%rsp),%r15 .cfi_restore %r15 movq 16(%rsp),%r14 .cfi_restore %r14 movq 24(%rsp),%r13 .cfi_restore %r13 movq 32(%rsp),%r12 .cfi_restore %r12 movq 40(%rsp),%rbx .cfi_restore %rbx movq 48(%rsp),%rbp .cfi_restore %rbp leaq 56(%rsp),%rsp .cfi_adjust_cfa_offset -56 #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .size sgn0x_pty_mont_384x,.-sgn0x_pty_mont_384x .globl mulx_mont_384 .hidden mulx_mont_384 .type mulx_mont_384,@function .align 32 mulx_mont_384: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa mul_mont_384$1: pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 pushq %rbx .cfi_adjust_cfa_offset 8 .cfi_offset %rbx,-24 pushq %r12 .cfi_adjust_cfa_offset 8 .cfi_offset %r12,-32 pushq %r13 .cfi_adjust_cfa_offset 8 .cfi_offset %r13,-40 pushq %r14 .cfi_adjust_cfa_offset 8 .cfi_offset %r14,-48 pushq %r15 .cfi_adjust_cfa_offset 8 .cfi_offset %r15,-56 leaq -24(%rsp),%rsp .cfi_adjust_cfa_offset 8*3 movq %rdx,%rbx #ifdef __SGX_LVI_HARDENING__ lfence #endif movq 0(%rdx),%rdx movq 0(%rsi),%r14 movq 8(%rsi),%r15 movq 16(%rsi),%rax movq 24(%rsi),%r12 movq %rdi,16(%rsp) movq 32(%rsi),%rdi movq 40(%rsi),%rbp leaq -128(%rsi),%rsi leaq -128(%rcx),%rcx movq %r8,(%rsp) mulxq %r14,%r8,%r9 call __mulx_mont_384 movq 24(%rsp),%r15 .cfi_restore %r15 movq 32(%rsp),%r14 .cfi_restore %r14 movq 40(%rsp),%r13 .cfi_restore %r13 movq 48(%rsp),%r12 .cfi_restore %r12 movq 56(%rsp),%rbx .cfi_restore %rbx movq 64(%rsp),%rbp .cfi_restore %rbp leaq 72(%rsp),%rsp .cfi_adjust_cfa_offset -8*9 #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .size mulx_mont_384,.-mulx_mont_384 .type __mulx_mont_384,@function .align 32 __mulx_mont_384: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa mulxq %r15,%r14,%r10 mulxq %rax,%r15,%r11 addq %r14,%r9 mulxq %r12,%rax,%r12 adcq %r15,%r10 mulxq %rdi,%rdi,%r13 adcq %rax,%r11 mulxq %rbp,%rbp,%r14 movq 8(%rbx),%rdx adcq %rdi,%r12 adcq %rbp,%r13 adcq $0,%r14 xorq %r15,%r15 movq %r8,16(%rsp) imulq 8(%rsp),%r8 xorq %rax,%rax mulxq 0+128(%rsi),%rdi,%rbp adoxq %rdi,%r9 adcxq %rbp,%r10 mulxq 8+128(%rsi),%rdi,%rbp adoxq %rdi,%r10 adcxq %rbp,%r11 mulxq 16+128(%rsi),%rdi,%rbp adoxq %rdi,%r11 adcxq %rbp,%r12 mulxq 24+128(%rsi),%rdi,%rbp adoxq %rdi,%r12 adcxq %rbp,%r13 mulxq 32+128(%rsi),%rdi,%rbp adoxq %rdi,%r13 adcxq %rbp,%r14 mulxq 40+128(%rsi),%rdi,%rbp movq %r8,%rdx adoxq %rdi,%r14 adcxq %rbp,%r15 adoxq %rax,%r15 adoxq %rax,%rax xorq %r8,%r8 mulxq 0+128(%rcx),%rdi,%rbp adcxq 16(%rsp),%rdi adoxq %rbp,%r9 mulxq 8+128(%rcx),%rdi,%rbp adcxq %rdi,%r9 adoxq %rbp,%r10 mulxq 16+128(%rcx),%rdi,%rbp adcxq %rdi,%r10 adoxq %rbp,%r11 mulxq 24+128(%rcx),%rdi,%rbp adcxq %rdi,%r11 adoxq %rbp,%r12 mulxq 32+128(%rcx),%rdi,%rbp adcxq %rdi,%r12 adoxq %rbp,%r13 mulxq 
40+128(%rcx),%rdi,%rbp movq 16(%rbx),%rdx adcxq %rdi,%r13 adoxq %rbp,%r14 adcxq %r8,%r14 adoxq %r8,%r15 adcxq %r8,%r15 adoxq %r8,%rax adcxq %r8,%rax movq %r9,16(%rsp) imulq 8(%rsp),%r9 xorq %r8,%r8 mulxq 0+128(%rsi),%rdi,%rbp adoxq %rdi,%r10 adcxq %rbp,%r11 mulxq 8+128(%rsi),%rdi,%rbp adoxq %rdi,%r11 adcxq %rbp,%r12 mulxq 16+128(%rsi),%rdi,%rbp adoxq %rdi,%r12 adcxq %rbp,%r13 mulxq 24+128(%rsi),%rdi,%rbp adoxq %rdi,%r13 adcxq %rbp,%r14 mulxq 32+128(%rsi),%rdi,%rbp adoxq %rdi,%r14 adcxq %rbp,%r15 mulxq 40+128(%rsi),%rdi,%rbp movq %r9,%rdx adoxq %rdi,%r15 adcxq %rbp,%rax adoxq %r8,%rax adoxq %r8,%r8 xorq %r9,%r9 mulxq 0+128(%rcx),%rdi,%rbp adcxq 16(%rsp),%rdi adoxq %rbp,%r10 mulxq 8+128(%rcx),%rdi,%rbp adcxq %rdi,%r10 adoxq %rbp,%r11 mulxq 16+128(%rcx),%rdi,%rbp adcxq %rdi,%r11 adoxq %rbp,%r12 mulxq 24+128(%rcx),%rdi,%rbp adcxq %rdi,%r12 adoxq %rbp,%r13 mulxq 32+128(%rcx),%rdi,%rbp adcxq %rdi,%r13 adoxq %rbp,%r14 mulxq 40+128(%rcx),%rdi,%rbp movq 24(%rbx),%rdx adcxq %rdi,%r14 adoxq %rbp,%r15 adcxq %r9,%r15 adoxq %r9,%rax adcxq %r9,%rax adoxq %r9,%r8 adcxq %r9,%r8 movq %r10,16(%rsp) imulq 8(%rsp),%r10 xorq %r9,%r9 mulxq 0+128(%rsi),%rdi,%rbp adoxq %rdi,%r11 adcxq %rbp,%r12 mulxq 8+128(%rsi),%rdi,%rbp adoxq %rdi,%r12 adcxq %rbp,%r13 mulxq 16+128(%rsi),%rdi,%rbp adoxq %rdi,%r13 adcxq %rbp,%r14 mulxq 24+128(%rsi),%rdi,%rbp adoxq %rdi,%r14 adcxq %rbp,%r15 mulxq 32+128(%rsi),%rdi,%rbp adoxq %rdi,%r15 adcxq %rbp,%rax mulxq 40+128(%rsi),%rdi,%rbp movq %r10,%rdx adoxq %rdi,%rax adcxq %rbp,%r8 adoxq %r9,%r8 adoxq %r9,%r9 xorq %r10,%r10 mulxq 0+128(%rcx),%rdi,%rbp adcxq 16(%rsp),%rdi adoxq %rbp,%r11 mulxq 8+128(%rcx),%rdi,%rbp adcxq %rdi,%r11 adoxq %rbp,%r12 mulxq 16+128(%rcx),%rdi,%rbp adcxq %rdi,%r12 adoxq %rbp,%r13 mulxq 24+128(%rcx),%rdi,%rbp adcxq %rdi,%r13 adoxq %rbp,%r14 mulxq 32+128(%rcx),%rdi,%rbp adcxq %rdi,%r14 adoxq %rbp,%r15 mulxq 40+128(%rcx),%rdi,%rbp movq 32(%rbx),%rdx adcxq %rdi,%r15 adoxq %rbp,%rax adcxq %r10,%rax adoxq %r10,%r8 adcxq %r10,%r8 adoxq %r10,%r9 adcxq %r10,%r9 movq %r11,16(%rsp) imulq 8(%rsp),%r11 xorq %r10,%r10 mulxq 0+128(%rsi),%rdi,%rbp adoxq %rdi,%r12 adcxq %rbp,%r13 mulxq 8+128(%rsi),%rdi,%rbp adoxq %rdi,%r13 adcxq %rbp,%r14 mulxq 16+128(%rsi),%rdi,%rbp adoxq %rdi,%r14 adcxq %rbp,%r15 mulxq 24+128(%rsi),%rdi,%rbp adoxq %rdi,%r15 adcxq %rbp,%rax mulxq 32+128(%rsi),%rdi,%rbp adoxq %rdi,%rax adcxq %rbp,%r8 mulxq 40+128(%rsi),%rdi,%rbp movq %r11,%rdx adoxq %rdi,%r8 adcxq %rbp,%r9 adoxq %r10,%r9 adoxq %r10,%r10 xorq %r11,%r11 mulxq 0+128(%rcx),%rdi,%rbp adcxq 16(%rsp),%rdi adoxq %rbp,%r12 mulxq 8+128(%rcx),%rdi,%rbp adcxq %rdi,%r12 adoxq %rbp,%r13 mulxq 16+128(%rcx),%rdi,%rbp adcxq %rdi,%r13 adoxq %rbp,%r14 mulxq 24+128(%rcx),%rdi,%rbp adcxq %rdi,%r14 adoxq %rbp,%r15 mulxq 32+128(%rcx),%rdi,%rbp adcxq %rdi,%r15 adoxq %rbp,%rax mulxq 40+128(%rcx),%rdi,%rbp movq 40(%rbx),%rdx adcxq %rdi,%rax adoxq %rbp,%r8 adcxq %r11,%r8 adoxq %r11,%r9 adcxq %r11,%r9 adoxq %r11,%r10 adcxq %r11,%r10 movq %r12,16(%rsp) imulq 8(%rsp),%r12 xorq %r11,%r11 mulxq 0+128(%rsi),%rdi,%rbp adoxq %rdi,%r13 adcxq %rbp,%r14 mulxq 8+128(%rsi),%rdi,%rbp adoxq %rdi,%r14 adcxq %rbp,%r15 mulxq 16+128(%rsi),%rdi,%rbp adoxq %rdi,%r15 adcxq %rbp,%rax mulxq 24+128(%rsi),%rdi,%rbp adoxq %rdi,%rax adcxq %rbp,%r8 mulxq 32+128(%rsi),%rdi,%rbp adoxq %rdi,%r8 adcxq %rbp,%r9 mulxq 40+128(%rsi),%rdi,%rbp movq %r12,%rdx adoxq %rdi,%r9 adcxq %rbp,%r10 adoxq %r11,%r10 adoxq %r11,%r11 xorq %r12,%r12 mulxq 0+128(%rcx),%rdi,%rbp adcxq 16(%rsp),%rdi adoxq %rbp,%r13 mulxq 8+128(%rcx),%rdi,%rbp adcxq %rdi,%r13 adoxq %rbp,%r14 
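/* Interleaved multiply-and-reduce: mulx leaves the flags untouched while
 * adcx and adox drive two independent carry chains (CF and OF), so each
 * round can accumulate a*b[i] and fold in m*p (m = t0 * n0, with n0 kept
 * at 8(%rsp)) without serializing on a single carry flag. Callers
 * pre-bias %rsi and %rcx by -128, hence the 0+128(...)..40+128(...)
 * operand addressing seen throughout. */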
mulxq 16+128(%rcx),%rdi,%rbp adcxq %rdi,%r14 adoxq %rbp,%r15 mulxq 24+128(%rcx),%rdi,%rbp adcxq %rdi,%r15 adoxq %rbp,%rax mulxq 32+128(%rcx),%rdi,%rbp adcxq %rdi,%rax adoxq %rbp,%r8 mulxq 40+128(%rcx),%rdi,%rbp movq %r13,%rdx adcxq %rdi,%r8 adoxq %rbp,%r9 adcxq %r12,%r9 adoxq %r12,%r10 adcxq %r12,%r10 adoxq %r12,%r11 adcxq %r12,%r11 imulq 8(%rsp),%rdx movq 24(%rsp),%rbx xorq %r12,%r12 mulxq 0+128(%rcx),%rdi,%rbp adcxq %rdi,%r13 adoxq %rbp,%r14 mulxq 8+128(%rcx),%rdi,%rbp adcxq %rdi,%r14 adoxq %rbp,%r15 mulxq 16+128(%rcx),%rdi,%rbp adcxq %rdi,%r15 adoxq %rbp,%rax mulxq 24+128(%rcx),%rdi,%rbp adcxq %rdi,%rax adoxq %rbp,%r8 movq %r15,%r13 mulxq 32+128(%rcx),%rdi,%rbp adcxq %rdi,%r8 adoxq %rbp,%r9 movq %rax,%rsi mulxq 40+128(%rcx),%rdi,%rbp adcxq %rdi,%r9 adoxq %rbp,%r10 movq %r14,%rdx adcxq %r12,%r10 adoxq %r12,%r11 leaq 128(%rcx),%rcx movq %r8,%r12 adcq $0,%r11 subq 0(%rcx),%r14 sbbq 8(%rcx),%r15 movq %r9,%rdi sbbq 16(%rcx),%rax sbbq 24(%rcx),%r8 sbbq 32(%rcx),%r9 movq %r10,%rbp sbbq 40(%rcx),%r10 sbbq $0,%r11 cmovncq %r14,%rdx cmovcq %r13,%r15 cmovcq %rsi,%rax cmovncq %r8,%r12 movq %rdx,0(%rbx) cmovncq %r9,%rdi movq %r15,8(%rbx) cmovncq %r10,%rbp movq %rax,16(%rbx) movq %r12,24(%rbx) movq %rdi,32(%rbx) movq %rbp,40(%rbx) #ifdef __SGX_LVI_HARDENING__ popq %rsi lfence jmpq *%rsi ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .size __mulx_mont_384,.-__mulx_mont_384 .globl sqrx_mont_384 .hidden sqrx_mont_384 .type sqrx_mont_384,@function .align 32 sqrx_mont_384: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa sqr_mont_384$1: pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 pushq %rbx .cfi_adjust_cfa_offset 8 .cfi_offset %rbx,-24 pushq %r12 .cfi_adjust_cfa_offset 8 .cfi_offset %r12,-32 pushq %r13 .cfi_adjust_cfa_offset 8 .cfi_offset %r13,-40 pushq %r14 .cfi_adjust_cfa_offset 8 .cfi_offset %r14,-48 pushq %r15 .cfi_adjust_cfa_offset 8 .cfi_offset %r15,-56 leaq -24(%rsp),%rsp .cfi_adjust_cfa_offset 8*3 movq %rcx,%r8 leaq -128(%rdx),%rcx #ifdef __SGX_LVI_HARDENING__ lfence #endif movq 0(%rsi),%rdx movq 8(%rsi),%r15 movq 16(%rsi),%rax movq 24(%rsi),%r12 movq %rdi,16(%rsp) movq 32(%rsi),%rdi movq 40(%rsi),%rbp leaq (%rsi),%rbx movq %r8,(%rsp) leaq -128(%rsi),%rsi mulxq %rdx,%r8,%r9 call __mulx_mont_384 movq 24(%rsp),%r15 .cfi_restore %r15 movq 32(%rsp),%r14 .cfi_restore %r14 movq 40(%rsp),%r13 .cfi_restore %r13 movq 48(%rsp),%r12 .cfi_restore %r12 movq 56(%rsp),%rbx .cfi_restore %rbx movq 64(%rsp),%rbp .cfi_restore %rbp leaq 72(%rsp),%rsp .cfi_adjust_cfa_offset -8*9 #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .size sqrx_mont_384,.-sqrx_mont_384 .globl sqrx_n_mul_mont_384 .hidden sqrx_n_mul_mont_384 .type sqrx_n_mul_mont_384,@function .align 32 sqrx_n_mul_mont_384: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa sqr_n_mul_mont_384$1: pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 pushq %rbx .cfi_adjust_cfa_offset 8 .cfi_offset %rbx,-24 pushq %r12 .cfi_adjust_cfa_offset 8 .cfi_offset %r12,-32 pushq %r13 .cfi_adjust_cfa_offset 8 .cfi_offset %r13,-40 pushq %r14 .cfi_adjust_cfa_offset 8 .cfi_offset %r14,-48 pushq %r15 .cfi_adjust_cfa_offset 8 .cfi_offset %r15,-56 leaq -40(%rsp),%rsp .cfi_adjust_cfa_offset 8*5 movq %rdx,%r10 #ifdef __SGX_LVI_HARDENING__ lfence #endif movq 0(%rsi),%rdx movq 8(%rsi),%r15 movq 16(%rsi),%rax movq %rsi,%rbx movq 24(%rsi),%r12 movq %rdi,16(%rsp) movq 32(%rsi),%rdi movq 40(%rsi),%rbp movq %r8,(%rsp) movq %r9,24(%rsp) movq 0(%r9),%xmm2 .Loop_sqrx_384: movd %r10d,%xmm1 leaq -128(%rbx),%rsi leaq -128(%rcx),%rcx mulxq 
%rdx,%r8,%r9 call __mulx_mont_384 movd %xmm1,%r10d decl %r10d jnz .Loop_sqrx_384 movq %rdx,%r14 .byte 102,72,15,126,210 leaq -128(%rbx),%rsi movq 24(%rsp),%rbx leaq -128(%rcx),%rcx mulxq %r14,%r8,%r9 call __mulx_mont_384 movq 40(%rsp),%r15 .cfi_restore %r15 movq 48(%rsp),%r14 .cfi_restore %r14 movq 56(%rsp),%r13 .cfi_restore %r13 movq 64(%rsp),%r12 .cfi_restore %r12 movq 72(%rsp),%rbx .cfi_restore %rbx movq 80(%rsp),%rbp .cfi_restore %rbp leaq 88(%rsp),%rsp .cfi_adjust_cfa_offset -8*11 #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .size sqrx_n_mul_mont_384,.-sqrx_n_mul_mont_384 .globl sqrx_n_mul_mont_383 .hidden sqrx_n_mul_mont_383 .type sqrx_n_mul_mont_383,@function .align 32 sqrx_n_mul_mont_383: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa sqr_n_mul_mont_383$1: pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 pushq %rbx .cfi_adjust_cfa_offset 8 .cfi_offset %rbx,-24 pushq %r12 .cfi_adjust_cfa_offset 8 .cfi_offset %r12,-32 pushq %r13 .cfi_adjust_cfa_offset 8 .cfi_offset %r13,-40 pushq %r14 .cfi_adjust_cfa_offset 8 .cfi_offset %r14,-48 pushq %r15 .cfi_adjust_cfa_offset 8 .cfi_offset %r15,-56 leaq -40(%rsp),%rsp .cfi_adjust_cfa_offset 8*5 movq %rdx,%r10 #ifdef __SGX_LVI_HARDENING__ lfence #endif movq 0(%rsi),%rdx movq 8(%rsi),%r15 movq 16(%rsi),%rax movq %rsi,%rbx movq 24(%rsi),%r12 movq %rdi,16(%rsp) movq 32(%rsi),%rdi movq 40(%rsi),%rbp movq %r8,(%rsp) movq %r9,24(%rsp) movq 0(%r9),%xmm2 leaq -128(%rcx),%rcx .Loop_sqrx_383: movd %r10d,%xmm1 leaq -128(%rbx),%rsi mulxq %rdx,%r8,%r9 call __mulx_mont_383_nonred movd %xmm1,%r10d decl %r10d jnz .Loop_sqrx_383 movq %rdx,%r14 .byte 102,72,15,126,210 leaq -128(%rbx),%rsi movq 24(%rsp),%rbx mulxq %r14,%r8,%r9 call __mulx_mont_384 movq 40(%rsp),%r15 .cfi_restore %r15 movq 48(%rsp),%r14 .cfi_restore %r14 movq 56(%rsp),%r13 .cfi_restore %r13 movq 64(%rsp),%r12 .cfi_restore %r12 movq 72(%rsp),%rbx .cfi_restore %rbx movq 80(%rsp),%rbp .cfi_restore %rbp leaq 88(%rsp),%rsp .cfi_adjust_cfa_offset -8*11 #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .size sqrx_n_mul_mont_383,.-sqrx_n_mul_mont_383 .type __mulx_mont_383_nonred,@function .align 32 __mulx_mont_383_nonred: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa mulxq %r15,%r14,%r10 mulxq %rax,%r15,%r11 addq %r14,%r9 mulxq %r12,%rax,%r12 adcq %r15,%r10 mulxq %rdi,%rdi,%r13 adcq %rax,%r11 mulxq %rbp,%rbp,%r14 movq 8(%rbx),%rdx adcq %rdi,%r12 adcq %rbp,%r13 adcq $0,%r14 movq %r8,%rax imulq 8(%rsp),%r8 xorq %r15,%r15 mulxq 0+128(%rsi),%rdi,%rbp adoxq %rdi,%r9 adcxq %rbp,%r10 mulxq 8+128(%rsi),%rdi,%rbp adoxq %rdi,%r10 adcxq %rbp,%r11 mulxq 16+128(%rsi),%rdi,%rbp adoxq %rdi,%r11 adcxq %rbp,%r12 mulxq 24+128(%rsi),%rdi,%rbp adoxq %rdi,%r12 adcxq %rbp,%r13 mulxq 32+128(%rsi),%rdi,%rbp adoxq %rdi,%r13 adcxq %rbp,%r14 mulxq 40+128(%rsi),%rdi,%rbp movq %r8,%rdx adoxq %rdi,%r14 adcxq %r15,%rbp adoxq %rbp,%r15 xorq %r8,%r8 mulxq 0+128(%rcx),%rdi,%rbp adcxq %rdi,%rax adoxq %rbp,%r9 mulxq 8+128(%rcx),%rdi,%rbp adcxq %rdi,%r9 adoxq %rbp,%r10 mulxq 16+128(%rcx),%rdi,%rbp adcxq %rdi,%r10 adoxq %rbp,%r11 mulxq 24+128(%rcx),%rdi,%rbp adcxq %rdi,%r11 adoxq %rbp,%r12 mulxq 32+128(%rcx),%rdi,%rbp adcxq %rdi,%r12 adoxq %rbp,%r13 mulxq 40+128(%rcx),%rdi,%rbp movq 16(%rbx),%rdx adcxq %rdi,%r13 adoxq %rbp,%r14 adcxq %rax,%r14 adoxq %rax,%r15 adcxq %rax,%r15 movq %r9,%r8 imulq 8(%rsp),%r9 xorq %rax,%rax mulxq 0+128(%rsi),%rdi,%rbp adoxq %rdi,%r10 adcxq %rbp,%r11 mulxq 8+128(%rsi),%rdi,%rbp adoxq %rdi,%r11 adcxq %rbp,%r12 
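/* Same skeleton as __mulx_mont_384, but "nonred": the final conditional
 * subtraction of p is omitted, so the output may carry an extra multiple
 * of p. That is safe for 383-bit moduli, whose spare top bit lets the
 * squaring loops (sqrx_n_mul_mont_383, sqrx_mont_382x) defer full
 * reduction until the closing multiplication. */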
mulxq 16+128(%rsi),%rdi,%rbp adoxq %rdi,%r12 adcxq %rbp,%r13 mulxq 24+128(%rsi),%rdi,%rbp adoxq %rdi,%r13 adcxq %rbp,%r14 mulxq 32+128(%rsi),%rdi,%rbp adoxq %rdi,%r14 adcxq %rbp,%r15 mulxq 40+128(%rsi),%rdi,%rbp movq %r9,%rdx adoxq %rdi,%r15 adcxq %rax,%rbp adoxq %rbp,%rax xorq %r9,%r9 mulxq 0+128(%rcx),%rdi,%rbp adcxq %rdi,%r8 adoxq %rbp,%r10 mulxq 8+128(%rcx),%rdi,%rbp adcxq %rdi,%r10 adoxq %rbp,%r11 mulxq 16+128(%rcx),%rdi,%rbp adcxq %rdi,%r11 adoxq %rbp,%r12 mulxq 24+128(%rcx),%rdi,%rbp adcxq %rdi,%r12 adoxq %rbp,%r13 mulxq 32+128(%rcx),%rdi,%rbp adcxq %rdi,%r13 adoxq %rbp,%r14 mulxq 40+128(%rcx),%rdi,%rbp movq 24(%rbx),%rdx adcxq %rdi,%r14 adoxq %rbp,%r15 adcxq %r8,%r15 adoxq %r8,%rax adcxq %r8,%rax movq %r10,%r9 imulq 8(%rsp),%r10 xorq %r8,%r8 mulxq 0+128(%rsi),%rdi,%rbp adoxq %rdi,%r11 adcxq %rbp,%r12 mulxq 8+128(%rsi),%rdi,%rbp adoxq %rdi,%r12 adcxq %rbp,%r13 mulxq 16+128(%rsi),%rdi,%rbp adoxq %rdi,%r13 adcxq %rbp,%r14 mulxq 24+128(%rsi),%rdi,%rbp adoxq %rdi,%r14 adcxq %rbp,%r15 mulxq 32+128(%rsi),%rdi,%rbp adoxq %rdi,%r15 adcxq %rbp,%rax mulxq 40+128(%rsi),%rdi,%rbp movq %r10,%rdx adoxq %rdi,%rax adcxq %r8,%rbp adoxq %rbp,%r8 xorq %r10,%r10 mulxq 0+128(%rcx),%rdi,%rbp adcxq %rdi,%r9 adoxq %rbp,%r11 mulxq 8+128(%rcx),%rdi,%rbp adcxq %rdi,%r11 adoxq %rbp,%r12 mulxq 16+128(%rcx),%rdi,%rbp adcxq %rdi,%r12 adoxq %rbp,%r13 mulxq 24+128(%rcx),%rdi,%rbp adcxq %rdi,%r13 adoxq %rbp,%r14 mulxq 32+128(%rcx),%rdi,%rbp adcxq %rdi,%r14 adoxq %rbp,%r15 mulxq 40+128(%rcx),%rdi,%rbp movq 32(%rbx),%rdx adcxq %rdi,%r15 adoxq %rbp,%rax adcxq %r9,%rax adoxq %r9,%r8 adcxq %r9,%r8 movq %r11,%r10 imulq 8(%rsp),%r11 xorq %r9,%r9 mulxq 0+128(%rsi),%rdi,%rbp adoxq %rdi,%r12 adcxq %rbp,%r13 mulxq 8+128(%rsi),%rdi,%rbp adoxq %rdi,%r13 adcxq %rbp,%r14 mulxq 16+128(%rsi),%rdi,%rbp adoxq %rdi,%r14 adcxq %rbp,%r15 mulxq 24+128(%rsi),%rdi,%rbp adoxq %rdi,%r15 adcxq %rbp,%rax mulxq 32+128(%rsi),%rdi,%rbp adoxq %rdi,%rax adcxq %rbp,%r8 mulxq 40+128(%rsi),%rdi,%rbp movq %r11,%rdx adoxq %rdi,%r8 adcxq %r9,%rbp adoxq %rbp,%r9 xorq %r11,%r11 mulxq 0+128(%rcx),%rdi,%rbp adcxq %rdi,%r10 adoxq %rbp,%r12 mulxq 8+128(%rcx),%rdi,%rbp adcxq %rdi,%r12 adoxq %rbp,%r13 mulxq 16+128(%rcx),%rdi,%rbp adcxq %rdi,%r13 adoxq %rbp,%r14 mulxq 24+128(%rcx),%rdi,%rbp adcxq %rdi,%r14 adoxq %rbp,%r15 mulxq 32+128(%rcx),%rdi,%rbp adcxq %rdi,%r15 adoxq %rbp,%rax mulxq 40+128(%rcx),%rdi,%rbp movq 40(%rbx),%rdx adcxq %rdi,%rax adoxq %rbp,%r8 adcxq %r10,%r8 adoxq %r10,%r9 adcxq %r10,%r9 movq %r12,%r11 imulq 8(%rsp),%r12 xorq %r10,%r10 mulxq 0+128(%rsi),%rdi,%rbp adoxq %rdi,%r13 adcxq %rbp,%r14 mulxq 8+128(%rsi),%rdi,%rbp adoxq %rdi,%r14 adcxq %rbp,%r15 mulxq 16+128(%rsi),%rdi,%rbp adoxq %rdi,%r15 adcxq %rbp,%rax mulxq 24+128(%rsi),%rdi,%rbp adoxq %rdi,%rax adcxq %rbp,%r8 mulxq 32+128(%rsi),%rdi,%rbp adoxq %rdi,%r8 adcxq %rbp,%r9 mulxq 40+128(%rsi),%rdi,%rbp movq %r12,%rdx adoxq %rdi,%r9 adcxq %r10,%rbp adoxq %rbp,%r10 xorq %r12,%r12 mulxq 0+128(%rcx),%rdi,%rbp adcxq %rdi,%r11 adoxq %rbp,%r13 mulxq 8+128(%rcx),%rdi,%rbp adcxq %rdi,%r13 adoxq %rbp,%r14 mulxq 16+128(%rcx),%rdi,%rbp adcxq %rdi,%r14 adoxq %rbp,%r15 mulxq 24+128(%rcx),%rdi,%rbp adcxq %rdi,%r15 adoxq %rbp,%rax mulxq 32+128(%rcx),%rdi,%rbp adcxq %rdi,%rax adoxq %rbp,%r8 mulxq 40+128(%rcx),%rdi,%rbp movq %r13,%rdx adcxq %rdi,%r8 adoxq %rbp,%r9 adcxq %r11,%r9 adoxq %r11,%r10 adcxq %r11,%r10 imulq 8(%rsp),%rdx movq 24(%rsp),%rbx xorq %r12,%r12 mulxq 0+128(%rcx),%rdi,%rbp adcxq %rdi,%r13 adoxq %rbp,%r14 mulxq 8+128(%rcx),%rdi,%rbp adcxq %rdi,%r14 adoxq %rbp,%r15 mulxq 16+128(%rcx),%rdi,%rbp 
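/*
 * [editorial annotation] Final reduction round: %rdx has just been set to
 * q = limb * n0 mod 2^64 (the imulq above; 8(%rsp) is where the caller
 * parked n0, evidently the Montgomery constant -p^-1 mod 2^64). Folding
 * q*p into the accumulator clears the bottom limb. This "nonred" variant
 * then stores without the conditional subtraction that __mulx_mont_384
 * performs, leaving the result only partially reduced; that is tolerable
 * for 383-bit moduli, where the spare top bit absorbs the excess.
 */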
adcxq %rdi,%r15 adoxq %rbp,%rax mulxq 24+128(%rcx),%rdi,%rbp adcxq %rdi,%rax adoxq %rbp,%r8 mulxq 32+128(%rcx),%rdi,%rbp adcxq %rdi,%r8 adoxq %rbp,%r9 mulxq 40+128(%rcx),%rdi,%rbp movq %r14,%rdx adcxq %rdi,%r9 adoxq %rbp,%r10 adcq $0,%r10 movq %r8,%r12 movq %r14,0(%rbx) movq %r15,8(%rbx) movq %rax,16(%rbx) movq %r9,%rdi movq %r8,24(%rbx) movq %r9,32(%rbx) movq %r10,40(%rbx) movq %r10,%rbp #ifdef __SGX_LVI_HARDENING__ popq %rsi lfence jmpq *%rsi ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .size __mulx_mont_383_nonred,.-__mulx_mont_383_nonred .globl sqrx_mont_382x .hidden sqrx_mont_382x .type sqrx_mont_382x,@function .align 32 sqrx_mont_382x: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa sqr_mont_382x$1: pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 pushq %rbx .cfi_adjust_cfa_offset 8 .cfi_offset %rbx,-24 pushq %r12 .cfi_adjust_cfa_offset 8 .cfi_offset %r12,-32 pushq %r13 .cfi_adjust_cfa_offset 8 .cfi_offset %r13,-40 pushq %r14 .cfi_adjust_cfa_offset 8 .cfi_offset %r14,-48 pushq %r15 .cfi_adjust_cfa_offset 8 .cfi_offset %r15,-56 subq $136,%rsp .cfi_adjust_cfa_offset 136 movq %rcx,0(%rsp) movq %rdx,%rcx movq %rdi,16(%rsp) movq %rsi,24(%rsp) #ifdef __SGX_LVI_HARDENING__ lfence #endif movq 0(%rsi),%r8 movq 8(%rsi),%r9 movq 16(%rsi),%r10 movq 24(%rsi),%r11 movq 32(%rsi),%r12 movq 40(%rsi),%r13 movq %r8,%r14 addq 48(%rsi),%r8 movq %r9,%r15 adcq 56(%rsi),%r9 movq %r10,%rax adcq 64(%rsi),%r10 movq %r11,%rdx adcq 72(%rsi),%r11 movq %r12,%rbx adcq 80(%rsi),%r12 movq %r13,%rbp adcq 88(%rsi),%r13 subq 48(%rsi),%r14 sbbq 56(%rsi),%r15 sbbq 64(%rsi),%rax sbbq 72(%rsi),%rdx sbbq 80(%rsi),%rbx sbbq 88(%rsi),%rbp sbbq %rdi,%rdi movq %r8,32+0(%rsp) movq %r9,32+8(%rsp) movq %r10,32+16(%rsp) movq %r11,32+24(%rsp) movq %r12,32+32(%rsp) movq %r13,32+40(%rsp) movq %r14,32+48(%rsp) movq %r15,32+56(%rsp) movq %rax,32+64(%rsp) movq %rdx,32+72(%rsp) movq %rbx,32+80(%rsp) movq %rbp,32+88(%rsp) movq %rdi,32+96(%rsp) leaq 48(%rsi),%rbx movq 48(%rsi),%rdx movq 0(%rsi),%r14 movq 8(%rsi),%r15 movq 16(%rsi),%rax movq 24(%rsi),%r12 movq 32(%rsi),%rdi movq 40(%rsi),%rbp leaq -128(%rsi),%rsi leaq -128(%rcx),%rcx mulxq %r14,%r8,%r9 call __mulx_mont_383_nonred addq %rdx,%rdx adcq %r15,%r15 adcq %rax,%rax adcq %r12,%r12 adcq %rdi,%rdi adcq %rbp,%rbp movq %rdx,48(%rbx) movq %r15,56(%rbx) movq %rax,64(%rbx) movq %r12,72(%rbx) movq %rdi,80(%rbx) movq %rbp,88(%rbx) leaq 32-128(%rsp),%rsi leaq 32+48(%rsp),%rbx movq 32+48(%rsp),%rdx movq 32+0(%rsp),%r14 movq 32+8(%rsp),%r15 movq 32+16(%rsp),%rax movq 32+24(%rsp),%r12 movq 32+32(%rsp),%rdi movq 32+40(%rsp),%rbp mulxq %r14,%r8,%r9 call __mulx_mont_383_nonred movq 32+96(%rsp),%r14 leaq 128(%rcx),%rcx movq 32+0(%rsp),%r8 andq %r14,%r8 movq 32+8(%rsp),%r9 andq %r14,%r9 movq 32+16(%rsp),%r10 andq %r14,%r10 movq 32+24(%rsp),%r11 andq %r14,%r11 movq 32+32(%rsp),%r13 andq %r14,%r13 andq 32+40(%rsp),%r14 subq %r8,%rdx movq 0(%rcx),%r8 sbbq %r9,%r15 movq 8(%rcx),%r9 sbbq %r10,%rax movq 16(%rcx),%r10 sbbq %r11,%r12 movq 24(%rcx),%r11 sbbq %r13,%rdi movq 32(%rcx),%r13 sbbq %r14,%rbp sbbq %r14,%r14 andq %r14,%r8 andq %r14,%r9 andq %r14,%r10 andq %r14,%r11 andq %r14,%r13 andq 40(%rcx),%r14 addq %r8,%rdx adcq %r9,%r15 adcq %r10,%rax adcq %r11,%r12 adcq %r13,%rdi adcq %r14,%rbp movq %rdx,0(%rbx) movq %r15,8(%rbx) movq %rax,16(%rbx) movq %r12,24(%rbx) movq %rdi,32(%rbx) movq %rbp,40(%rbx) leaq 136(%rsp),%r8 movq 0(%r8),%r15 .cfi_restore %r15 movq 8(%r8),%r14 .cfi_restore %r14 movq 16(%r8),%r13 .cfi_restore %r13 movq 24(%r8),%r12 .cfi_restore %r12 movq 32(%r8),%rbx .cfi_restore %rbx movq 
40(%r8),%rbp .cfi_restore %rbp leaq 48(%r8),%rsp .cfi_adjust_cfa_offset -136-8*6 #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .size sqrx_mont_382x,.-sqrx_mont_382x .section .note.GNU-stack,"",@progbits #ifndef __SGX_LVI_HARDENING__ .section .note.gnu.property,"a",@note .long 4,2f-1f,5 .byte 0x47,0x4E,0x55,0 1: .long 0xc0000002,4,3 .align 8 2: #endif ================================================ FILE: build/elf/sha256-armv8.S ================================================ #if defined(__ARM_FEATURE_PAC_DEFAULT) && __ARM_FEATURE_PAC_DEFAULT==2 # define PACI_HINT 27 # define AUTI_HINT 31 #else # define PACI_HINT 25 # define AUTI_HINT 29 #endif // // Copyright Supranational LLC // Licensed under the Apache License, Version 2.0, see LICENSE for details. // SPDX-License-Identifier: Apache-2.0 // // ==================================================================== // Written by Andy Polyakov, @dot-asm, initially for the OpenSSL // project. // ==================================================================== // // sha256_block procedure for ARMv8. // // This module is stripped of scalar code paths, with rationale that all // known processors are NEON-capable. // // See original module at CRYPTOGAMS for further details. .comm __blst_platform_cap,4 .text .align 6 .type .LK256,%object .LK256: .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 .long 0 //terminator .size .LK256,.-.LK256 .byte 83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,64,100,111,116,45,97,115,109,0 .align 2 .align 2 .globl blst_sha256_block_armv8 .hidden blst_sha256_block_armv8 .type blst_sha256_block_armv8,%function .align 6 blst_sha256_block_armv8: hint #34 .Lv8_entry: stp x29,x30,[sp,#-2*__SIZEOF_POINTER__]! 
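// [editorial annotation] Hardware path: per the usual CRYPTOGAMS convention
// x0 points at the 8-word state, x1 at the input, and x2 holds the number
// of 64-byte blocks (decremented each .Loop_hw iteration). The SHA256H,
// SHA256H2, SHA256SU0 and SHA256SU1 instructions below are emitted as raw
// .inst words so the module assembles even without +sha2 target support.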
add x29,sp,#0 ld1 {v0.4s,v1.4s},[x0] adr x3,.LK256 .Loop_hw: ld1 {v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64 sub x2,x2,#1 ld1 {v16.4s},[x3],#16 rev32 v4.16b,v4.16b rev32 v5.16b,v5.16b rev32 v6.16b,v6.16b rev32 v7.16b,v7.16b orr v18.16b,v0.16b,v0.16b // offload orr v19.16b,v1.16b,v1.16b ld1 {v17.4s},[x3],#16 add v16.4s,v16.4s,v4.4s .inst 0x5e2828a4 //sha256su0 v4.16b,v5.16b orr v2.16b,v0.16b,v0.16b .inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s .inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s .inst 0x5e0760c4 //sha256su1 v4.16b,v6.16b,v7.16b ld1 {v16.4s},[x3],#16 add v17.4s,v17.4s,v5.4s .inst 0x5e2828c5 //sha256su0 v5.16b,v6.16b orr v2.16b,v0.16b,v0.16b .inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s .inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s .inst 0x5e0460e5 //sha256su1 v5.16b,v7.16b,v4.16b ld1 {v17.4s},[x3],#16 add v16.4s,v16.4s,v6.4s .inst 0x5e2828e6 //sha256su0 v6.16b,v7.16b orr v2.16b,v0.16b,v0.16b .inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s .inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s .inst 0x5e056086 //sha256su1 v6.16b,v4.16b,v5.16b ld1 {v16.4s},[x3],#16 add v17.4s,v17.4s,v7.4s .inst 0x5e282887 //sha256su0 v7.16b,v4.16b orr v2.16b,v0.16b,v0.16b .inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s .inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s .inst 0x5e0660a7 //sha256su1 v7.16b,v5.16b,v6.16b ld1 {v17.4s},[x3],#16 add v16.4s,v16.4s,v4.4s .inst 0x5e2828a4 //sha256su0 v4.16b,v5.16b orr v2.16b,v0.16b,v0.16b .inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s .inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s .inst 0x5e0760c4 //sha256su1 v4.16b,v6.16b,v7.16b ld1 {v16.4s},[x3],#16 add v17.4s,v17.4s,v5.4s .inst 0x5e2828c5 //sha256su0 v5.16b,v6.16b orr v2.16b,v0.16b,v0.16b .inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s .inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s .inst 0x5e0460e5 //sha256su1 v5.16b,v7.16b,v4.16b ld1 {v17.4s},[x3],#16 add v16.4s,v16.4s,v6.4s .inst 0x5e2828e6 //sha256su0 v6.16b,v7.16b orr v2.16b,v0.16b,v0.16b .inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s .inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s .inst 0x5e056086 //sha256su1 v6.16b,v4.16b,v5.16b ld1 {v16.4s},[x3],#16 add v17.4s,v17.4s,v7.4s .inst 0x5e282887 //sha256su0 v7.16b,v4.16b orr v2.16b,v0.16b,v0.16b .inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s .inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s .inst 0x5e0660a7 //sha256su1 v7.16b,v5.16b,v6.16b ld1 {v17.4s},[x3],#16 add v16.4s,v16.4s,v4.4s .inst 0x5e2828a4 //sha256su0 v4.16b,v5.16b orr v2.16b,v0.16b,v0.16b .inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s .inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s .inst 0x5e0760c4 //sha256su1 v4.16b,v6.16b,v7.16b ld1 {v16.4s},[x3],#16 add v17.4s,v17.4s,v5.4s .inst 0x5e2828c5 //sha256su0 v5.16b,v6.16b orr v2.16b,v0.16b,v0.16b .inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s .inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s .inst 0x5e0460e5 //sha256su1 v5.16b,v7.16b,v4.16b ld1 {v17.4s},[x3],#16 add v16.4s,v16.4s,v6.4s .inst 0x5e2828e6 //sha256su0 v6.16b,v7.16b orr v2.16b,v0.16b,v0.16b .inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s .inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s .inst 0x5e056086 //sha256su1 v6.16b,v4.16b,v5.16b ld1 {v16.4s},[x3],#16 add v17.4s,v17.4s,v7.4s .inst 0x5e282887 //sha256su0 v7.16b,v4.16b orr v2.16b,v0.16b,v0.16b .inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s .inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s .inst 0x5e0660a7 //sha256su1 v7.16b,v5.16b,v6.16b ld1 {v17.4s},[x3],#16 add v16.4s,v16.4s,v4.4s orr v2.16b,v0.16b,v0.16b .inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s .inst 0x5e105041 
//sha256h2 v1.16b,v2.16b,v16.4s ld1 {v16.4s},[x3],#16 add v17.4s,v17.4s,v5.4s orr v2.16b,v0.16b,v0.16b .inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s .inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s ld1 {v17.4s},[x3] add v16.4s,v16.4s,v6.4s sub x3,x3,#64*4-16 // rewind orr v2.16b,v0.16b,v0.16b .inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s .inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s add v17.4s,v17.4s,v7.4s orr v2.16b,v0.16b,v0.16b .inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s .inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s add v0.4s,v0.4s,v18.4s add v1.4s,v1.4s,v19.4s cbnz x2,.Loop_hw st1 {v0.4s,v1.4s},[x0] ldr x29,[sp],#2*__SIZEOF_POINTER__ ret .size blst_sha256_block_armv8,.-blst_sha256_block_armv8 .globl blst_sha256_block_data_order .hidden blst_sha256_block_data_order .type blst_sha256_block_data_order,%function .align 4 blst_sha256_block_data_order: hint #34 adrp x16,__blst_platform_cap ldr w16,[x16,#:lo12:__blst_platform_cap] tst w16,#1 b.ne .Lv8_entry stp x29, x30, [sp, #-2*__SIZEOF_POINTER__]! mov x29, sp sub sp,sp,#16*4 adr x16,.LK256 add x2,x1,x2,lsl#6 // len to point at the end of inp ld1 {v0.16b},[x1], #16 ld1 {v1.16b},[x1], #16 ld1 {v2.16b},[x1], #16 ld1 {v3.16b},[x1], #16 ld1 {v4.4s},[x16], #16 ld1 {v5.4s},[x16], #16 ld1 {v6.4s},[x16], #16 ld1 {v7.4s},[x16], #16 rev32 v0.16b,v0.16b // yes, even on rev32 v1.16b,v1.16b // big-endian rev32 v2.16b,v2.16b rev32 v3.16b,v3.16b mov x17,sp add v4.4s,v4.4s,v0.4s add v5.4s,v5.4s,v1.4s add v6.4s,v6.4s,v2.4s st1 {v4.4s,v5.4s},[x17], #32 add v7.4s,v7.4s,v3.4s st1 {v6.4s,v7.4s},[x17] sub x17,x17,#32 ldp w3,w4,[x0] ldp w5,w6,[x0,#8] ldp w7,w8,[x0,#16] ldp w9,w10,[x0,#24] ldr w12,[sp,#0] mov w13,wzr eor w14,w4,w5 mov w15,wzr b .L_00_48 .align 4 .L_00_48: ext v4.16b,v0.16b,v1.16b,#4 add w10,w10,w12 add w3,w3,w15 and w12,w8,w7 bic w15,w9,w7 ext v7.16b,v2.16b,v3.16b,#4 eor w11,w7,w7,ror#5 add w3,w3,w13 mov d19,v3.d[1] orr w12,w12,w15 eor w11,w11,w7,ror#19 ushr v6.4s,v4.4s,#7 eor w15,w3,w3,ror#11 ushr v5.4s,v4.4s,#3 add w10,w10,w12 add v0.4s,v0.4s,v7.4s ror w11,w11,#6 sli v6.4s,v4.4s,#25 eor w13,w3,w4 eor w15,w15,w3,ror#20 ushr v7.4s,v4.4s,#18 add w10,w10,w11 ldr w12,[sp,#4] and w14,w14,w13 eor v5.16b,v5.16b,v6.16b ror w15,w15,#2 add w6,w6,w10 sli v7.4s,v4.4s,#14 eor w14,w14,w4 ushr v16.4s,v19.4s,#17 add w9,w9,w12 add w10,w10,w15 and w12,w7,w6 eor v5.16b,v5.16b,v7.16b bic w15,w8,w6 eor w11,w6,w6,ror#5 sli v16.4s,v19.4s,#15 add w10,w10,w14 orr w12,w12,w15 ushr v17.4s,v19.4s,#10 eor w11,w11,w6,ror#19 eor w15,w10,w10,ror#11 ushr v7.4s,v19.4s,#19 add w9,w9,w12 ror w11,w11,#6 add v0.4s,v0.4s,v5.4s eor w14,w10,w3 eor w15,w15,w10,ror#20 sli v7.4s,v19.4s,#13 add w9,w9,w11 ldr w12,[sp,#8] and w13,w13,w14 eor v17.16b,v17.16b,v16.16b ror w15,w15,#2 add w5,w5,w9 eor w13,w13,w3 eor v17.16b,v17.16b,v7.16b add w8,w8,w12 add w9,w9,w15 and w12,w6,w5 add v0.4s,v0.4s,v17.4s bic w15,w7,w5 eor w11,w5,w5,ror#5 add w9,w9,w13 ushr v18.4s,v0.4s,#17 orr w12,w12,w15 ushr v19.4s,v0.4s,#10 eor w11,w11,w5,ror#19 eor w15,w9,w9,ror#11 sli v18.4s,v0.4s,#15 add w8,w8,w12 ushr v17.4s,v0.4s,#19 ror w11,w11,#6 eor w13,w9,w10 eor v19.16b,v19.16b,v18.16b eor w15,w15,w9,ror#20 add w8,w8,w11 sli v17.4s,v0.4s,#13 ldr w12,[sp,#12] and w14,w14,w13 ror w15,w15,#2 ld1 {v4.4s},[x16], #16 add w4,w4,w8 eor v19.16b,v19.16b,v17.16b eor w14,w14,w10 eor v17.16b,v17.16b,v17.16b add w7,w7,w12 add w8,w8,w15 and w12,w5,w4 mov v17.d[1],v19.d[0] bic w15,w6,w4 eor w11,w4,w4,ror#5 add w8,w8,w14 add v0.4s,v0.4s,v17.4s orr w12,w12,w15 eor w11,w11,w4,ror#19 eor w15,w8,w8,ror#11 add v4.4s,v4.4s,v0.4s 
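// [editorial annotation] Software path: the scalar pipeline evaluates the
// SHA-256 round function while NEON extends the message schedule alongside
// it; the ushr/sli pairs synthesize the sigma0/sigma1 rotations on four W[]
// words at a time, and each K[i]+W[i] sum is staged on the stack via
// st1 {v4.4s},[x17] for the scalar rounds to pick up with ldr w12,[sp,#...].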
add w7,w7,w12 ror w11,w11,#6 eor w14,w8,w9 eor w15,w15,w8,ror#20 add w7,w7,w11 ldr w12,[sp,#16] and w13,w13,w14 ror w15,w15,#2 add w3,w3,w7 eor w13,w13,w9 st1 {v4.4s},[x17], #16 ext v4.16b,v1.16b,v2.16b,#4 add w6,w6,w12 add w7,w7,w15 and w12,w4,w3 bic w15,w5,w3 ext v7.16b,v3.16b,v0.16b,#4 eor w11,w3,w3,ror#5 add w7,w7,w13 mov d19,v0.d[1] orr w12,w12,w15 eor w11,w11,w3,ror#19 ushr v6.4s,v4.4s,#7 eor w15,w7,w7,ror#11 ushr v5.4s,v4.4s,#3 add w6,w6,w12 add v1.4s,v1.4s,v7.4s ror w11,w11,#6 sli v6.4s,v4.4s,#25 eor w13,w7,w8 eor w15,w15,w7,ror#20 ushr v7.4s,v4.4s,#18 add w6,w6,w11 ldr w12,[sp,#20] and w14,w14,w13 eor v5.16b,v5.16b,v6.16b ror w15,w15,#2 add w10,w10,w6 sli v7.4s,v4.4s,#14 eor w14,w14,w8 ushr v16.4s,v19.4s,#17 add w5,w5,w12 add w6,w6,w15 and w12,w3,w10 eor v5.16b,v5.16b,v7.16b bic w15,w4,w10 eor w11,w10,w10,ror#5 sli v16.4s,v19.4s,#15 add w6,w6,w14 orr w12,w12,w15 ushr v17.4s,v19.4s,#10 eor w11,w11,w10,ror#19 eor w15,w6,w6,ror#11 ushr v7.4s,v19.4s,#19 add w5,w5,w12 ror w11,w11,#6 add v1.4s,v1.4s,v5.4s eor w14,w6,w7 eor w15,w15,w6,ror#20 sli v7.4s,v19.4s,#13 add w5,w5,w11 ldr w12,[sp,#24] and w13,w13,w14 eor v17.16b,v17.16b,v16.16b ror w15,w15,#2 add w9,w9,w5 eor w13,w13,w7 eor v17.16b,v17.16b,v7.16b add w4,w4,w12 add w5,w5,w15 and w12,w10,w9 add v1.4s,v1.4s,v17.4s bic w15,w3,w9 eor w11,w9,w9,ror#5 add w5,w5,w13 ushr v18.4s,v1.4s,#17 orr w12,w12,w15 ushr v19.4s,v1.4s,#10 eor w11,w11,w9,ror#19 eor w15,w5,w5,ror#11 sli v18.4s,v1.4s,#15 add w4,w4,w12 ushr v17.4s,v1.4s,#19 ror w11,w11,#6 eor w13,w5,w6 eor v19.16b,v19.16b,v18.16b eor w15,w15,w5,ror#20 add w4,w4,w11 sli v17.4s,v1.4s,#13 ldr w12,[sp,#28] and w14,w14,w13 ror w15,w15,#2 ld1 {v4.4s},[x16], #16 add w8,w8,w4 eor v19.16b,v19.16b,v17.16b eor w14,w14,w6 eor v17.16b,v17.16b,v17.16b add w3,w3,w12 add w4,w4,w15 and w12,w9,w8 mov v17.d[1],v19.d[0] bic w15,w10,w8 eor w11,w8,w8,ror#5 add w4,w4,w14 add v1.4s,v1.4s,v17.4s orr w12,w12,w15 eor w11,w11,w8,ror#19 eor w15,w4,w4,ror#11 add v4.4s,v4.4s,v1.4s add w3,w3,w12 ror w11,w11,#6 eor w14,w4,w5 eor w15,w15,w4,ror#20 add w3,w3,w11 ldr w12,[sp,#32] and w13,w13,w14 ror w15,w15,#2 add w7,w7,w3 eor w13,w13,w5 st1 {v4.4s},[x17], #16 ext v4.16b,v2.16b,v3.16b,#4 add w10,w10,w12 add w3,w3,w15 and w12,w8,w7 bic w15,w9,w7 ext v7.16b,v0.16b,v1.16b,#4 eor w11,w7,w7,ror#5 add w3,w3,w13 mov d19,v1.d[1] orr w12,w12,w15 eor w11,w11,w7,ror#19 ushr v6.4s,v4.4s,#7 eor w15,w3,w3,ror#11 ushr v5.4s,v4.4s,#3 add w10,w10,w12 add v2.4s,v2.4s,v7.4s ror w11,w11,#6 sli v6.4s,v4.4s,#25 eor w13,w3,w4 eor w15,w15,w3,ror#20 ushr v7.4s,v4.4s,#18 add w10,w10,w11 ldr w12,[sp,#36] and w14,w14,w13 eor v5.16b,v5.16b,v6.16b ror w15,w15,#2 add w6,w6,w10 sli v7.4s,v4.4s,#14 eor w14,w14,w4 ushr v16.4s,v19.4s,#17 add w9,w9,w12 add w10,w10,w15 and w12,w7,w6 eor v5.16b,v5.16b,v7.16b bic w15,w8,w6 eor w11,w6,w6,ror#5 sli v16.4s,v19.4s,#15 add w10,w10,w14 orr w12,w12,w15 ushr v17.4s,v19.4s,#10 eor w11,w11,w6,ror#19 eor w15,w10,w10,ror#11 ushr v7.4s,v19.4s,#19 add w9,w9,w12 ror w11,w11,#6 add v2.4s,v2.4s,v5.4s eor w14,w10,w3 eor w15,w15,w10,ror#20 sli v7.4s,v19.4s,#13 add w9,w9,w11 ldr w12,[sp,#40] and w13,w13,w14 eor v17.16b,v17.16b,v16.16b ror w15,w15,#2 add w5,w5,w9 eor w13,w13,w3 eor v17.16b,v17.16b,v7.16b add w8,w8,w12 add w9,w9,w15 and w12,w6,w5 add v2.4s,v2.4s,v17.4s bic w15,w7,w5 eor w11,w5,w5,ror#5 add w9,w9,w13 ushr v18.4s,v2.4s,#17 orr w12,w12,w15 ushr v19.4s,v2.4s,#10 eor w11,w11,w5,ror#19 eor w15,w9,w9,ror#11 sli v18.4s,v2.4s,#15 add w8,w8,w12 ushr v17.4s,v2.4s,#19 ror w11,w11,#6 eor w13,w9,w10 eor v19.16b,v19.16b,v18.16b eor 
w15,w15,w9,ror#20 add w8,w8,w11 sli v17.4s,v2.4s,#13 ldr w12,[sp,#44] and w14,w14,w13 ror w15,w15,#2 ld1 {v4.4s},[x16], #16 add w4,w4,w8 eor v19.16b,v19.16b,v17.16b eor w14,w14,w10 eor v17.16b,v17.16b,v17.16b add w7,w7,w12 add w8,w8,w15 and w12,w5,w4 mov v17.d[1],v19.d[0] bic w15,w6,w4 eor w11,w4,w4,ror#5 add w8,w8,w14 add v2.4s,v2.4s,v17.4s orr w12,w12,w15 eor w11,w11,w4,ror#19 eor w15,w8,w8,ror#11 add v4.4s,v4.4s,v2.4s add w7,w7,w12 ror w11,w11,#6 eor w14,w8,w9 eor w15,w15,w8,ror#20 add w7,w7,w11 ldr w12,[sp,#48] and w13,w13,w14 ror w15,w15,#2 add w3,w3,w7 eor w13,w13,w9 st1 {v4.4s},[x17], #16 ext v4.16b,v3.16b,v0.16b,#4 add w6,w6,w12 add w7,w7,w15 and w12,w4,w3 bic w15,w5,w3 ext v7.16b,v1.16b,v2.16b,#4 eor w11,w3,w3,ror#5 add w7,w7,w13 mov d19,v2.d[1] orr w12,w12,w15 eor w11,w11,w3,ror#19 ushr v6.4s,v4.4s,#7 eor w15,w7,w7,ror#11 ushr v5.4s,v4.4s,#3 add w6,w6,w12 add v3.4s,v3.4s,v7.4s ror w11,w11,#6 sli v6.4s,v4.4s,#25 eor w13,w7,w8 eor w15,w15,w7,ror#20 ushr v7.4s,v4.4s,#18 add w6,w6,w11 ldr w12,[sp,#52] and w14,w14,w13 eor v5.16b,v5.16b,v6.16b ror w15,w15,#2 add w10,w10,w6 sli v7.4s,v4.4s,#14 eor w14,w14,w8 ushr v16.4s,v19.4s,#17 add w5,w5,w12 add w6,w6,w15 and w12,w3,w10 eor v5.16b,v5.16b,v7.16b bic w15,w4,w10 eor w11,w10,w10,ror#5 sli v16.4s,v19.4s,#15 add w6,w6,w14 orr w12,w12,w15 ushr v17.4s,v19.4s,#10 eor w11,w11,w10,ror#19 eor w15,w6,w6,ror#11 ushr v7.4s,v19.4s,#19 add w5,w5,w12 ror w11,w11,#6 add v3.4s,v3.4s,v5.4s eor w14,w6,w7 eor w15,w15,w6,ror#20 sli v7.4s,v19.4s,#13 add w5,w5,w11 ldr w12,[sp,#56] and w13,w13,w14 eor v17.16b,v17.16b,v16.16b ror w15,w15,#2 add w9,w9,w5 eor w13,w13,w7 eor v17.16b,v17.16b,v7.16b add w4,w4,w12 add w5,w5,w15 and w12,w10,w9 add v3.4s,v3.4s,v17.4s bic w15,w3,w9 eor w11,w9,w9,ror#5 add w5,w5,w13 ushr v18.4s,v3.4s,#17 orr w12,w12,w15 ushr v19.4s,v3.4s,#10 eor w11,w11,w9,ror#19 eor w15,w5,w5,ror#11 sli v18.4s,v3.4s,#15 add w4,w4,w12 ushr v17.4s,v3.4s,#19 ror w11,w11,#6 eor w13,w5,w6 eor v19.16b,v19.16b,v18.16b eor w15,w15,w5,ror#20 add w4,w4,w11 sli v17.4s,v3.4s,#13 ldr w12,[sp,#60] and w14,w14,w13 ror w15,w15,#2 ld1 {v4.4s},[x16], #16 add w8,w8,w4 eor v19.16b,v19.16b,v17.16b eor w14,w14,w6 eor v17.16b,v17.16b,v17.16b add w3,w3,w12 add w4,w4,w15 and w12,w9,w8 mov v17.d[1],v19.d[0] bic w15,w10,w8 eor w11,w8,w8,ror#5 add w4,w4,w14 add v3.4s,v3.4s,v17.4s orr w12,w12,w15 eor w11,w11,w8,ror#19 eor w15,w4,w4,ror#11 add v4.4s,v4.4s,v3.4s add w3,w3,w12 ror w11,w11,#6 eor w14,w4,w5 eor w15,w15,w4,ror#20 add w3,w3,w11 ldr w12,[x16] and w13,w13,w14 ror w15,w15,#2 add w7,w7,w3 eor w13,w13,w5 st1 {v4.4s},[x17], #16 cmp w12,#0 // check for K256 terminator ldr w12,[sp,#0] sub x17,x17,#64 bne .L_00_48 sub x16,x16,#256 cmp x1,x2 mov x17, #-64 csel x17, x17, xzr, eq add x1,x1,x17 mov x17,sp add w10,w10,w12 add w3,w3,w15 and w12,w8,w7 ld1 {v0.16b},[x1],#16 bic w15,w9,w7 eor w11,w7,w7,ror#5 ld1 {v4.4s},[x16],#16 add w3,w3,w13 orr w12,w12,w15 eor w11,w11,w7,ror#19 eor w15,w3,w3,ror#11 rev32 v0.16b,v0.16b add w10,w10,w12 ror w11,w11,#6 eor w13,w3,w4 eor w15,w15,w3,ror#20 add v4.4s,v4.4s,v0.4s add w10,w10,w11 ldr w12,[sp,#4] and w14,w14,w13 ror w15,w15,#2 add w6,w6,w10 eor w14,w14,w4 add w9,w9,w12 add w10,w10,w15 and w12,w7,w6 bic w15,w8,w6 eor w11,w6,w6,ror#5 add w10,w10,w14 orr w12,w12,w15 eor w11,w11,w6,ror#19 eor w15,w10,w10,ror#11 add w9,w9,w12 ror w11,w11,#6 eor w14,w10,w3 eor w15,w15,w10,ror#20 add w9,w9,w11 ldr w12,[sp,#8] and w13,w13,w14 ror w15,w15,#2 add w5,w5,w9 eor w13,w13,w3 add w8,w8,w12 add w9,w9,w15 and w12,w6,w5 bic w15,w7,w5 eor w11,w5,w5,ror#5 add 
w9,w9,w13 orr w12,w12,w15 eor w11,w11,w5,ror#19 eor w15,w9,w9,ror#11 add w8,w8,w12 ror w11,w11,#6 eor w13,w9,w10 eor w15,w15,w9,ror#20 add w8,w8,w11 ldr w12,[sp,#12] and w14,w14,w13 ror w15,w15,#2 add w4,w4,w8 eor w14,w14,w10 add w7,w7,w12 add w8,w8,w15 and w12,w5,w4 bic w15,w6,w4 eor w11,w4,w4,ror#5 add w8,w8,w14 orr w12,w12,w15 eor w11,w11,w4,ror#19 eor w15,w8,w8,ror#11 add w7,w7,w12 ror w11,w11,#6 eor w14,w8,w9 eor w15,w15,w8,ror#20 add w7,w7,w11 ldr w12,[sp,#16] and w13,w13,w14 ror w15,w15,#2 add w3,w3,w7 eor w13,w13,w9 st1 {v4.4s},[x17], #16 add w6,w6,w12 add w7,w7,w15 and w12,w4,w3 ld1 {v1.16b},[x1],#16 bic w15,w5,w3 eor w11,w3,w3,ror#5 ld1 {v4.4s},[x16],#16 add w7,w7,w13 orr w12,w12,w15 eor w11,w11,w3,ror#19 eor w15,w7,w7,ror#11 rev32 v1.16b,v1.16b add w6,w6,w12 ror w11,w11,#6 eor w13,w7,w8 eor w15,w15,w7,ror#20 add v4.4s,v4.4s,v1.4s add w6,w6,w11 ldr w12,[sp,#20] and w14,w14,w13 ror w15,w15,#2 add w10,w10,w6 eor w14,w14,w8 add w5,w5,w12 add w6,w6,w15 and w12,w3,w10 bic w15,w4,w10 eor w11,w10,w10,ror#5 add w6,w6,w14 orr w12,w12,w15 eor w11,w11,w10,ror#19 eor w15,w6,w6,ror#11 add w5,w5,w12 ror w11,w11,#6 eor w14,w6,w7 eor w15,w15,w6,ror#20 add w5,w5,w11 ldr w12,[sp,#24] and w13,w13,w14 ror w15,w15,#2 add w9,w9,w5 eor w13,w13,w7 add w4,w4,w12 add w5,w5,w15 and w12,w10,w9 bic w15,w3,w9 eor w11,w9,w9,ror#5 add w5,w5,w13 orr w12,w12,w15 eor w11,w11,w9,ror#19 eor w15,w5,w5,ror#11 add w4,w4,w12 ror w11,w11,#6 eor w13,w5,w6 eor w15,w15,w5,ror#20 add w4,w4,w11 ldr w12,[sp,#28] and w14,w14,w13 ror w15,w15,#2 add w8,w8,w4 eor w14,w14,w6 add w3,w3,w12 add w4,w4,w15 and w12,w9,w8 bic w15,w10,w8 eor w11,w8,w8,ror#5 add w4,w4,w14 orr w12,w12,w15 eor w11,w11,w8,ror#19 eor w15,w4,w4,ror#11 add w3,w3,w12 ror w11,w11,#6 eor w14,w4,w5 eor w15,w15,w4,ror#20 add w3,w3,w11 ldr w12,[sp,#32] and w13,w13,w14 ror w15,w15,#2 add w7,w7,w3 eor w13,w13,w5 st1 {v4.4s},[x17], #16 add w10,w10,w12 add w3,w3,w15 and w12,w8,w7 ld1 {v2.16b},[x1],#16 bic w15,w9,w7 eor w11,w7,w7,ror#5 ld1 {v4.4s},[x16],#16 add w3,w3,w13 orr w12,w12,w15 eor w11,w11,w7,ror#19 eor w15,w3,w3,ror#11 rev32 v2.16b,v2.16b add w10,w10,w12 ror w11,w11,#6 eor w13,w3,w4 eor w15,w15,w3,ror#20 add v4.4s,v4.4s,v2.4s add w10,w10,w11 ldr w12,[sp,#36] and w14,w14,w13 ror w15,w15,#2 add w6,w6,w10 eor w14,w14,w4 add w9,w9,w12 add w10,w10,w15 and w12,w7,w6 bic w15,w8,w6 eor w11,w6,w6,ror#5 add w10,w10,w14 orr w12,w12,w15 eor w11,w11,w6,ror#19 eor w15,w10,w10,ror#11 add w9,w9,w12 ror w11,w11,#6 eor w14,w10,w3 eor w15,w15,w10,ror#20 add w9,w9,w11 ldr w12,[sp,#40] and w13,w13,w14 ror w15,w15,#2 add w5,w5,w9 eor w13,w13,w3 add w8,w8,w12 add w9,w9,w15 and w12,w6,w5 bic w15,w7,w5 eor w11,w5,w5,ror#5 add w9,w9,w13 orr w12,w12,w15 eor w11,w11,w5,ror#19 eor w15,w9,w9,ror#11 add w8,w8,w12 ror w11,w11,#6 eor w13,w9,w10 eor w15,w15,w9,ror#20 add w8,w8,w11 ldr w12,[sp,#44] and w14,w14,w13 ror w15,w15,#2 add w4,w4,w8 eor w14,w14,w10 add w7,w7,w12 add w8,w8,w15 and w12,w5,w4 bic w15,w6,w4 eor w11,w4,w4,ror#5 add w8,w8,w14 orr w12,w12,w15 eor w11,w11,w4,ror#19 eor w15,w8,w8,ror#11 add w7,w7,w12 ror w11,w11,#6 eor w14,w8,w9 eor w15,w15,w8,ror#20 add w7,w7,w11 ldr w12,[sp,#48] and w13,w13,w14 ror w15,w15,#2 add w3,w3,w7 eor w13,w13,w9 st1 {v4.4s},[x17], #16 add w6,w6,w12 add w7,w7,w15 and w12,w4,w3 ld1 {v3.16b},[x1],#16 bic w15,w5,w3 eor w11,w3,w3,ror#5 ld1 {v4.4s},[x16],#16 add w7,w7,w13 orr w12,w12,w15 eor w11,w11,w3,ror#19 eor w15,w7,w7,ror#11 rev32 v3.16b,v3.16b add w6,w6,w12 ror w11,w11,#6 eor w13,w7,w8 eor w15,w15,w7,ror#20 add v4.4s,v4.4s,v3.4s add w6,w6,w11 ldr 
w12,[sp,#52] and w14,w14,w13 ror w15,w15,#2 add w10,w10,w6 eor w14,w14,w8 add w5,w5,w12 add w6,w6,w15 and w12,w3,w10 bic w15,w4,w10 eor w11,w10,w10,ror#5 add w6,w6,w14 orr w12,w12,w15 eor w11,w11,w10,ror#19 eor w15,w6,w6,ror#11 add w5,w5,w12 ror w11,w11,#6 eor w14,w6,w7 eor w15,w15,w6,ror#20 add w5,w5,w11 ldr w12,[sp,#56] and w13,w13,w14 ror w15,w15,#2 add w9,w9,w5 eor w13,w13,w7 add w4,w4,w12 add w5,w5,w15 and w12,w10,w9 bic w15,w3,w9 eor w11,w9,w9,ror#5 add w5,w5,w13 orr w12,w12,w15 eor w11,w11,w9,ror#19 eor w15,w5,w5,ror#11 add w4,w4,w12 ror w11,w11,#6 eor w13,w5,w6 eor w15,w15,w5,ror#20 add w4,w4,w11 ldr w12,[sp,#60] and w14,w14,w13 ror w15,w15,#2 add w8,w8,w4 eor w14,w14,w6 add w3,w3,w12 add w4,w4,w15 and w12,w9,w8 bic w15,w10,w8 eor w11,w8,w8,ror#5 add w4,w4,w14 orr w12,w12,w15 eor w11,w11,w8,ror#19 eor w15,w4,w4,ror#11 add w3,w3,w12 ror w11,w11,#6 eor w14,w4,w5 eor w15,w15,w4,ror#20 add w3,w3,w11 and w13,w13,w14 ror w15,w15,#2 add w7,w7,w3 eor w13,w13,w5 st1 {v4.4s},[x17], #16 add w3,w3,w15 // h+=Sigma0(a) from the past ldp w11,w12,[x0,#0] add w3,w3,w13 // h+=Maj(a,b,c) from the past ldp w13,w14,[x0,#8] add w3,w3,w11 // accumulate add w4,w4,w12 ldp w11,w12,[x0,#16] add w5,w5,w13 add w6,w6,w14 ldp w13,w14,[x0,#24] add w7,w7,w11 add w8,w8,w12 ldr w12,[sp,#0] stp w3,w4,[x0,#0] add w9,w9,w13 mov w13,wzr stp w5,w6,[x0,#8] add w10,w10,w14 stp w7,w8,[x0,#16] eor w14,w4,w5 stp w9,w10,[x0,#24] mov w15,wzr mov x17,sp b.ne .L_00_48 ldr x29,[x29] add sp,sp,#16*4+2*__SIZEOF_POINTER__ ret .size blst_sha256_block_data_order,.-blst_sha256_block_data_order .globl blst_sha256_emit .hidden blst_sha256_emit .type blst_sha256_emit,%function .align 4 blst_sha256_emit: hint #34 ldp x4,x5,[x1] ldp x6,x7,[x1,#16] #ifndef __AARCH64EB__ rev x4,x4 rev x5,x5 rev x6,x6 rev x7,x7 #endif str w4,[x0,#4] lsr x4,x4,#32 str w5,[x0,#12] lsr x5,x5,#32 str w6,[x0,#20] lsr x6,x6,#32 str w7,[x0,#28] lsr x7,x7,#32 str w4,[x0,#0] str w5,[x0,#8] str w6,[x0,#16] str w7,[x0,#24] ret .size blst_sha256_emit,.-blst_sha256_emit .globl blst_sha256_bcopy .hidden blst_sha256_bcopy .type blst_sha256_bcopy,%function .align 4 blst_sha256_bcopy: hint #34 .Loop_bcopy: ldrb w3,[x1],#1 sub x2,x2,#1 strb w3,[x0],#1 cbnz x2,.Loop_bcopy ret .size blst_sha256_bcopy,.-blst_sha256_bcopy .globl blst_sha256_hcopy .hidden blst_sha256_hcopy .type blst_sha256_hcopy,%function .align 4 blst_sha256_hcopy: hint #34 ldp x4,x5,[x1] ldp x6,x7,[x1,#16] stp x4,x5,[x0] stp x6,x7,[x0,#16] ret .size blst_sha256_hcopy,.-blst_sha256_hcopy #if defined(__ARM_FEATURE_BTI_DEFAULT) || defined(__ARM_FEATURE_PAC_DEFAULT) .section .note.GNU-stack,"",@progbits .section .note.gnu.property,"a",@note .long 4,2f-1f,5 .byte 0x47,0x4E,0x55,0 1: .long 0xc0000000,4,3 .align 3 2: #endif ================================================ FILE: build/elf/sha256-portable-x86_64.s ================================================ .comm __blst_platform_cap,4 .text .globl blst_sha256_block_data_order .type blst_sha256_block_data_order,@function .align 16 blst_sha256_block_data_order: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 movq %rsp,%rbp .cfi_def_cfa_register %rbp #ifdef __BLST_PORTABLE__ testl $2,__blst_platform_cap(%rip) jnz .Lblst_sha256_block_data_order$2 #endif pushq %rbx .cfi_offset %rbx,-24 pushq %r12 .cfi_offset %r12,-32 pushq %r13 .cfi_offset %r13,-40 pushq %r14 .cfi_offset %r14,-48 pushq %r15 .cfi_offset %r15,-56 shlq $4,%rdx subq $64+24,%rsp .cfi_def_cfa %rsp,144 leaq (%rsi,%rdx,4),%rdx movq %rdi,64+0(%rsp) movq 
%rsi,64+8(%rsp) movq %rdx,64+16(%rsp) movl 0(%rdi),%eax movl 4(%rdi),%ebx movl 8(%rdi),%ecx movl 12(%rdi),%edx movl 16(%rdi),%r8d movl 20(%rdi),%r9d movl 24(%rdi),%r10d movl 28(%rdi),%r11d jmp .Lloop .align 16 .Lloop: movl %ebx,%edi leaq K256(%rip),%rbp xorl %ecx,%edi movl 0(%rsi),%r12d movl %r8d,%r13d movl %eax,%r14d bswapl %r12d rorl $14,%r13d movl %r9d,%r15d xorl %r8d,%r13d rorl $9,%r14d xorl %r10d,%r15d movl %r12d,0(%rsp) xorl %eax,%r14d andl %r8d,%r15d rorl $5,%r13d addl %r11d,%r12d xorl %r10d,%r15d rorl $11,%r14d xorl %r8d,%r13d addl %r15d,%r12d movl %eax,%r15d addl 0(%rbp),%r12d xorl %eax,%r14d xorl %ebx,%r15d rorl $6,%r13d movl %ebx,%r11d andl %r15d,%edi rorl $2,%r14d addl %r13d,%r12d xorl %edi,%r11d addl %r12d,%edx addl %r12d,%r11d addl %r14d,%r11d movl 4(%rsi),%r12d movl %edx,%r13d movl %r11d,%r14d bswapl %r12d rorl $14,%r13d movl %r8d,%edi xorl %edx,%r13d rorl $9,%r14d xorl %r9d,%edi movl %r12d,4(%rsp) xorl %r11d,%r14d andl %edx,%edi rorl $5,%r13d addl %r10d,%r12d xorl %r9d,%edi rorl $11,%r14d xorl %edx,%r13d addl %edi,%r12d movl %r11d,%edi addl 4(%rbp),%r12d xorl %r11d,%r14d xorl %eax,%edi rorl $6,%r13d movl %eax,%r10d andl %edi,%r15d rorl $2,%r14d addl %r13d,%r12d xorl %r15d,%r10d addl %r12d,%ecx addl %r12d,%r10d addl %r14d,%r10d movl 8(%rsi),%r12d movl %ecx,%r13d movl %r10d,%r14d bswapl %r12d rorl $14,%r13d movl %edx,%r15d xorl %ecx,%r13d rorl $9,%r14d xorl %r8d,%r15d movl %r12d,8(%rsp) xorl %r10d,%r14d andl %ecx,%r15d rorl $5,%r13d addl %r9d,%r12d xorl %r8d,%r15d rorl $11,%r14d xorl %ecx,%r13d addl %r15d,%r12d movl %r10d,%r15d addl 8(%rbp),%r12d xorl %r10d,%r14d xorl %r11d,%r15d rorl $6,%r13d movl %r11d,%r9d andl %r15d,%edi rorl $2,%r14d addl %r13d,%r12d xorl %edi,%r9d addl %r12d,%ebx addl %r12d,%r9d addl %r14d,%r9d movl 12(%rsi),%r12d movl %ebx,%r13d movl %r9d,%r14d bswapl %r12d rorl $14,%r13d movl %ecx,%edi xorl %ebx,%r13d rorl $9,%r14d xorl %edx,%edi movl %r12d,12(%rsp) xorl %r9d,%r14d andl %ebx,%edi rorl $5,%r13d addl %r8d,%r12d xorl %edx,%edi rorl $11,%r14d xorl %ebx,%r13d addl %edi,%r12d movl %r9d,%edi addl 12(%rbp),%r12d xorl %r9d,%r14d xorl %r10d,%edi rorl $6,%r13d movl %r10d,%r8d andl %edi,%r15d rorl $2,%r14d addl %r13d,%r12d xorl %r15d,%r8d addl %r12d,%eax addl %r12d,%r8d addl %r14d,%r8d movl 16(%rsi),%r12d movl %eax,%r13d movl %r8d,%r14d bswapl %r12d rorl $14,%r13d movl %ebx,%r15d xorl %eax,%r13d rorl $9,%r14d xorl %ecx,%r15d movl %r12d,16(%rsp) xorl %r8d,%r14d andl %eax,%r15d rorl $5,%r13d addl %edx,%r12d xorl %ecx,%r15d rorl $11,%r14d xorl %eax,%r13d addl %r15d,%r12d movl %r8d,%r15d addl 16(%rbp),%r12d xorl %r8d,%r14d xorl %r9d,%r15d rorl $6,%r13d movl %r9d,%edx andl %r15d,%edi rorl $2,%r14d addl %r13d,%r12d xorl %edi,%edx addl %r12d,%r11d addl %r12d,%edx addl %r14d,%edx movl 20(%rsi),%r12d movl %r11d,%r13d movl %edx,%r14d bswapl %r12d rorl $14,%r13d movl %eax,%edi xorl %r11d,%r13d rorl $9,%r14d xorl %ebx,%edi movl %r12d,20(%rsp) xorl %edx,%r14d andl %r11d,%edi rorl $5,%r13d addl %ecx,%r12d xorl %ebx,%edi rorl $11,%r14d xorl %r11d,%r13d addl %edi,%r12d movl %edx,%edi addl 20(%rbp),%r12d xorl %edx,%r14d xorl %r8d,%edi rorl $6,%r13d movl %r8d,%ecx andl %edi,%r15d rorl $2,%r14d addl %r13d,%r12d xorl %r15d,%ecx addl %r12d,%r10d addl %r12d,%ecx addl %r14d,%ecx movl 24(%rsi),%r12d movl %r10d,%r13d movl %ecx,%r14d bswapl %r12d rorl $14,%r13d movl %r11d,%r15d xorl %r10d,%r13d rorl $9,%r14d xorl %eax,%r15d movl %r12d,24(%rsp) xorl %ecx,%r14d andl %r10d,%r15d rorl $5,%r13d addl %ebx,%r12d xorl %eax,%r15d rorl $11,%r14d xorl %r10d,%r13d addl %r15d,%r12d movl %ecx,%r15d 
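/*
 * [editorial annotation] Every unrolled round here computes
 * T1 = h + Sigma1(e) + Ch(e,f,g) + K256[i] + W[i]. Sigma1(e) is built
 * incrementally as ((e>>>14 ^ e)>>>5 ^ e)>>>6 = e>>>6 ^ e>>>11 ^ e>>>25,
 * Sigma0(a) likewise via the 9/11/2 rotate chain, and Ch takes the
 * branch-free form ((f^g)&e)^g; the working variables then rotate one
 * position for the next round.
 */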
addl 24(%rbp),%r12d xorl %ecx,%r14d xorl %edx,%r15d rorl $6,%r13d movl %edx,%ebx andl %r15d,%edi rorl $2,%r14d addl %r13d,%r12d xorl %edi,%ebx addl %r12d,%r9d addl %r12d,%ebx addl %r14d,%ebx movl 28(%rsi),%r12d movl %r9d,%r13d movl %ebx,%r14d bswapl %r12d rorl $14,%r13d movl %r10d,%edi xorl %r9d,%r13d rorl $9,%r14d xorl %r11d,%edi movl %r12d,28(%rsp) xorl %ebx,%r14d andl %r9d,%edi rorl $5,%r13d addl %eax,%r12d xorl %r11d,%edi rorl $11,%r14d xorl %r9d,%r13d addl %edi,%r12d movl %ebx,%edi addl 28(%rbp),%r12d xorl %ebx,%r14d xorl %ecx,%edi rorl $6,%r13d movl %ecx,%eax andl %edi,%r15d rorl $2,%r14d addl %r13d,%r12d xorl %r15d,%eax addl %r12d,%r8d addl %r12d,%eax addl %r14d,%eax movl 32(%rsi),%r12d movl %r8d,%r13d movl %eax,%r14d bswapl %r12d rorl $14,%r13d movl %r9d,%r15d xorl %r8d,%r13d rorl $9,%r14d xorl %r10d,%r15d movl %r12d,32(%rsp) xorl %eax,%r14d andl %r8d,%r15d rorl $5,%r13d addl %r11d,%r12d xorl %r10d,%r15d rorl $11,%r14d xorl %r8d,%r13d addl %r15d,%r12d movl %eax,%r15d addl 32(%rbp),%r12d xorl %eax,%r14d xorl %ebx,%r15d rorl $6,%r13d movl %ebx,%r11d andl %r15d,%edi rorl $2,%r14d addl %r13d,%r12d xorl %edi,%r11d addl %r12d,%edx addl %r12d,%r11d addl %r14d,%r11d movl 36(%rsi),%r12d movl %edx,%r13d movl %r11d,%r14d bswapl %r12d rorl $14,%r13d movl %r8d,%edi xorl %edx,%r13d rorl $9,%r14d xorl %r9d,%edi movl %r12d,36(%rsp) xorl %r11d,%r14d andl %edx,%edi rorl $5,%r13d addl %r10d,%r12d xorl %r9d,%edi rorl $11,%r14d xorl %edx,%r13d addl %edi,%r12d movl %r11d,%edi addl 36(%rbp),%r12d xorl %r11d,%r14d xorl %eax,%edi rorl $6,%r13d movl %eax,%r10d andl %edi,%r15d rorl $2,%r14d addl %r13d,%r12d xorl %r15d,%r10d addl %r12d,%ecx addl %r12d,%r10d addl %r14d,%r10d movl 40(%rsi),%r12d movl %ecx,%r13d movl %r10d,%r14d bswapl %r12d rorl $14,%r13d movl %edx,%r15d xorl %ecx,%r13d rorl $9,%r14d xorl %r8d,%r15d movl %r12d,40(%rsp) xorl %r10d,%r14d andl %ecx,%r15d rorl $5,%r13d addl %r9d,%r12d xorl %r8d,%r15d rorl $11,%r14d xorl %ecx,%r13d addl %r15d,%r12d movl %r10d,%r15d addl 40(%rbp),%r12d xorl %r10d,%r14d xorl %r11d,%r15d rorl $6,%r13d movl %r11d,%r9d andl %r15d,%edi rorl $2,%r14d addl %r13d,%r12d xorl %edi,%r9d addl %r12d,%ebx addl %r12d,%r9d addl %r14d,%r9d movl 44(%rsi),%r12d movl %ebx,%r13d movl %r9d,%r14d bswapl %r12d rorl $14,%r13d movl %ecx,%edi xorl %ebx,%r13d rorl $9,%r14d xorl %edx,%edi movl %r12d,44(%rsp) xorl %r9d,%r14d andl %ebx,%edi rorl $5,%r13d addl %r8d,%r12d xorl %edx,%edi rorl $11,%r14d xorl %ebx,%r13d addl %edi,%r12d movl %r9d,%edi addl 44(%rbp),%r12d xorl %r9d,%r14d xorl %r10d,%edi rorl $6,%r13d movl %r10d,%r8d andl %edi,%r15d rorl $2,%r14d addl %r13d,%r12d xorl %r15d,%r8d addl %r12d,%eax addl %r12d,%r8d addl %r14d,%r8d movl 48(%rsi),%r12d movl %eax,%r13d movl %r8d,%r14d bswapl %r12d rorl $14,%r13d movl %ebx,%r15d xorl %eax,%r13d rorl $9,%r14d xorl %ecx,%r15d movl %r12d,48(%rsp) xorl %r8d,%r14d andl %eax,%r15d rorl $5,%r13d addl %edx,%r12d xorl %ecx,%r15d rorl $11,%r14d xorl %eax,%r13d addl %r15d,%r12d movl %r8d,%r15d addl 48(%rbp),%r12d xorl %r8d,%r14d xorl %r9d,%r15d rorl $6,%r13d movl %r9d,%edx andl %r15d,%edi rorl $2,%r14d addl %r13d,%r12d xorl %edi,%edx addl %r12d,%r11d addl %r12d,%edx addl %r14d,%edx movl 52(%rsi),%r12d movl %r11d,%r13d movl %edx,%r14d bswapl %r12d rorl $14,%r13d movl %eax,%edi xorl %r11d,%r13d rorl $9,%r14d xorl %ebx,%edi movl %r12d,52(%rsp) xorl %edx,%r14d andl %r11d,%edi rorl $5,%r13d addl %ecx,%r12d xorl %ebx,%edi rorl $11,%r14d xorl %r11d,%r13d addl %edi,%r12d movl %edx,%edi addl 52(%rbp),%r12d xorl %edx,%r14d xorl %r8d,%edi rorl $6,%r13d movl %r8d,%ecx 
andl %edi,%r15d rorl $2,%r14d addl %r13d,%r12d xorl %r15d,%ecx addl %r12d,%r10d addl %r12d,%ecx addl %r14d,%ecx movl 56(%rsi),%r12d movl %r10d,%r13d movl %ecx,%r14d bswapl %r12d rorl $14,%r13d movl %r11d,%r15d xorl %r10d,%r13d rorl $9,%r14d xorl %eax,%r15d movl %r12d,56(%rsp) xorl %ecx,%r14d andl %r10d,%r15d rorl $5,%r13d addl %ebx,%r12d xorl %eax,%r15d rorl $11,%r14d xorl %r10d,%r13d addl %r15d,%r12d movl %ecx,%r15d addl 56(%rbp),%r12d xorl %ecx,%r14d xorl %edx,%r15d rorl $6,%r13d movl %edx,%ebx andl %r15d,%edi rorl $2,%r14d addl %r13d,%r12d xorl %edi,%ebx addl %r12d,%r9d addl %r12d,%ebx addl %r14d,%ebx movl 60(%rsi),%r12d movl %r9d,%r13d movl %ebx,%r14d bswapl %r12d rorl $14,%r13d movl %r10d,%edi xorl %r9d,%r13d rorl $9,%r14d xorl %r11d,%edi movl %r12d,60(%rsp) xorl %ebx,%r14d andl %r9d,%edi rorl $5,%r13d addl %eax,%r12d xorl %r11d,%edi rorl $11,%r14d xorl %r9d,%r13d addl %edi,%r12d movl %ebx,%edi addl 60(%rbp),%r12d xorl %ebx,%r14d xorl %ecx,%edi rorl $6,%r13d movl %ecx,%eax andl %edi,%r15d rorl $2,%r14d addl %r13d,%r12d xorl %r15d,%eax addl %r12d,%r8d addl %r12d,%eax jmp .Lrounds_16_xx .align 16 .Lrounds_16_xx: movl 4(%rsp),%r13d movl 56(%rsp),%r15d movl %r13d,%r12d rorl $11,%r13d addl %r14d,%eax movl %r15d,%r14d rorl $2,%r15d xorl %r12d,%r13d shrl $3,%r12d rorl $7,%r13d xorl %r14d,%r15d shrl $10,%r14d rorl $17,%r15d xorl %r13d,%r12d xorl %r14d,%r15d addl 36(%rsp),%r12d addl 0(%rsp),%r12d movl %r8d,%r13d addl %r15d,%r12d movl %eax,%r14d rorl $14,%r13d movl %r9d,%r15d xorl %r8d,%r13d rorl $9,%r14d xorl %r10d,%r15d movl %r12d,0(%rsp) xorl %eax,%r14d andl %r8d,%r15d rorl $5,%r13d addl %r11d,%r12d xorl %r10d,%r15d rorl $11,%r14d xorl %r8d,%r13d addl %r15d,%r12d movl %eax,%r15d addl 64(%rbp),%r12d xorl %eax,%r14d xorl %ebx,%r15d rorl $6,%r13d movl %ebx,%r11d andl %r15d,%edi rorl $2,%r14d addl %r13d,%r12d xorl %edi,%r11d addl %r12d,%edx addl %r12d,%r11d movl 8(%rsp),%r13d movl 60(%rsp),%edi movl %r13d,%r12d rorl $11,%r13d addl %r14d,%r11d movl %edi,%r14d rorl $2,%edi xorl %r12d,%r13d shrl $3,%r12d rorl $7,%r13d xorl %r14d,%edi shrl $10,%r14d rorl $17,%edi xorl %r13d,%r12d xorl %r14d,%edi addl 40(%rsp),%r12d addl 4(%rsp),%r12d movl %edx,%r13d addl %edi,%r12d movl %r11d,%r14d rorl $14,%r13d movl %r8d,%edi xorl %edx,%r13d rorl $9,%r14d xorl %r9d,%edi movl %r12d,4(%rsp) xorl %r11d,%r14d andl %edx,%edi rorl $5,%r13d addl %r10d,%r12d xorl %r9d,%edi rorl $11,%r14d xorl %edx,%r13d addl %edi,%r12d movl %r11d,%edi addl 68(%rbp),%r12d xorl %r11d,%r14d xorl %eax,%edi rorl $6,%r13d movl %eax,%r10d andl %edi,%r15d rorl $2,%r14d addl %r13d,%r12d xorl %r15d,%r10d addl %r12d,%ecx addl %r12d,%r10d movl 12(%rsp),%r13d movl 0(%rsp),%r15d movl %r13d,%r12d rorl $11,%r13d addl %r14d,%r10d movl %r15d,%r14d rorl $2,%r15d xorl %r12d,%r13d shrl $3,%r12d rorl $7,%r13d xorl %r14d,%r15d shrl $10,%r14d rorl $17,%r15d xorl %r13d,%r12d xorl %r14d,%r15d addl 44(%rsp),%r12d addl 8(%rsp),%r12d movl %ecx,%r13d addl %r15d,%r12d movl %r10d,%r14d rorl $14,%r13d movl %edx,%r15d xorl %ecx,%r13d rorl $9,%r14d xorl %r8d,%r15d movl %r12d,8(%rsp) xorl %r10d,%r14d andl %ecx,%r15d rorl $5,%r13d addl %r9d,%r12d xorl %r8d,%r15d rorl $11,%r14d xorl %ecx,%r13d addl %r15d,%r12d movl %r10d,%r15d addl 72(%rbp),%r12d xorl %r10d,%r14d xorl %r11d,%r15d rorl $6,%r13d movl %r11d,%r9d andl %r15d,%edi rorl $2,%r14d addl %r13d,%r12d xorl %edi,%r9d addl %r12d,%ebx addl %r12d,%r9d movl 16(%rsp),%r13d movl 4(%rsp),%edi movl %r13d,%r12d rorl $11,%r13d addl %r14d,%r9d movl %edi,%r14d rorl $2,%edi xorl %r12d,%r13d shrl $3,%r12d rorl $7,%r13d xorl %r14d,%edi 
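/*
 * [editorial annotation] Rounds 16..63: before each round the schedule is
 * extended in place on the stack as
 * W[i] = sigma1(W[i-2]) + W[i-7] + sigma0(W[i-15]) + W[i-16],
 * with sigma0(x) = x>>>7 ^ x>>>18 ^ x>>3 and
 * sigma1(x) = x>>>17 ^ x>>>19 ^ x>>10 assembled from the same
 * rotate-and-xor chains visible here.
 */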
shrl $10,%r14d rorl $17,%edi xorl %r13d,%r12d xorl %r14d,%edi addl 48(%rsp),%r12d addl 12(%rsp),%r12d movl %ebx,%r13d addl %edi,%r12d movl %r9d,%r14d rorl $14,%r13d movl %ecx,%edi xorl %ebx,%r13d rorl $9,%r14d xorl %edx,%edi movl %r12d,12(%rsp) xorl %r9d,%r14d andl %ebx,%edi rorl $5,%r13d addl %r8d,%r12d xorl %edx,%edi rorl $11,%r14d xorl %ebx,%r13d addl %edi,%r12d movl %r9d,%edi addl 76(%rbp),%r12d xorl %r9d,%r14d xorl %r10d,%edi rorl $6,%r13d movl %r10d,%r8d andl %edi,%r15d rorl $2,%r14d addl %r13d,%r12d xorl %r15d,%r8d addl %r12d,%eax addl %r12d,%r8d movl 20(%rsp),%r13d movl 8(%rsp),%r15d movl %r13d,%r12d rorl $11,%r13d addl %r14d,%r8d movl %r15d,%r14d rorl $2,%r15d xorl %r12d,%r13d shrl $3,%r12d rorl $7,%r13d xorl %r14d,%r15d shrl $10,%r14d rorl $17,%r15d xorl %r13d,%r12d xorl %r14d,%r15d addl 52(%rsp),%r12d addl 16(%rsp),%r12d movl %eax,%r13d addl %r15d,%r12d movl %r8d,%r14d rorl $14,%r13d movl %ebx,%r15d xorl %eax,%r13d rorl $9,%r14d xorl %ecx,%r15d movl %r12d,16(%rsp) xorl %r8d,%r14d andl %eax,%r15d rorl $5,%r13d addl %edx,%r12d xorl %ecx,%r15d rorl $11,%r14d xorl %eax,%r13d addl %r15d,%r12d movl %r8d,%r15d addl 80(%rbp),%r12d xorl %r8d,%r14d xorl %r9d,%r15d rorl $6,%r13d movl %r9d,%edx andl %r15d,%edi rorl $2,%r14d addl %r13d,%r12d xorl %edi,%edx addl %r12d,%r11d addl %r12d,%edx movl 24(%rsp),%r13d movl 12(%rsp),%edi movl %r13d,%r12d rorl $11,%r13d addl %r14d,%edx movl %edi,%r14d rorl $2,%edi xorl %r12d,%r13d shrl $3,%r12d rorl $7,%r13d xorl %r14d,%edi shrl $10,%r14d rorl $17,%edi xorl %r13d,%r12d xorl %r14d,%edi addl 56(%rsp),%r12d addl 20(%rsp),%r12d movl %r11d,%r13d addl %edi,%r12d movl %edx,%r14d rorl $14,%r13d movl %eax,%edi xorl %r11d,%r13d rorl $9,%r14d xorl %ebx,%edi movl %r12d,20(%rsp) xorl %edx,%r14d andl %r11d,%edi rorl $5,%r13d addl %ecx,%r12d xorl %ebx,%edi rorl $11,%r14d xorl %r11d,%r13d addl %edi,%r12d movl %edx,%edi addl 84(%rbp),%r12d xorl %edx,%r14d xorl %r8d,%edi rorl $6,%r13d movl %r8d,%ecx andl %edi,%r15d rorl $2,%r14d addl %r13d,%r12d xorl %r15d,%ecx addl %r12d,%r10d addl %r12d,%ecx movl 28(%rsp),%r13d movl 16(%rsp),%r15d movl %r13d,%r12d rorl $11,%r13d addl %r14d,%ecx movl %r15d,%r14d rorl $2,%r15d xorl %r12d,%r13d shrl $3,%r12d rorl $7,%r13d xorl %r14d,%r15d shrl $10,%r14d rorl $17,%r15d xorl %r13d,%r12d xorl %r14d,%r15d addl 60(%rsp),%r12d addl 24(%rsp),%r12d movl %r10d,%r13d addl %r15d,%r12d movl %ecx,%r14d rorl $14,%r13d movl %r11d,%r15d xorl %r10d,%r13d rorl $9,%r14d xorl %eax,%r15d movl %r12d,24(%rsp) xorl %ecx,%r14d andl %r10d,%r15d rorl $5,%r13d addl %ebx,%r12d xorl %eax,%r15d rorl $11,%r14d xorl %r10d,%r13d addl %r15d,%r12d movl %ecx,%r15d addl 88(%rbp),%r12d xorl %ecx,%r14d xorl %edx,%r15d rorl $6,%r13d movl %edx,%ebx andl %r15d,%edi rorl $2,%r14d addl %r13d,%r12d xorl %edi,%ebx addl %r12d,%r9d addl %r12d,%ebx movl 32(%rsp),%r13d movl 20(%rsp),%edi movl %r13d,%r12d rorl $11,%r13d addl %r14d,%ebx movl %edi,%r14d rorl $2,%edi xorl %r12d,%r13d shrl $3,%r12d rorl $7,%r13d xorl %r14d,%edi shrl $10,%r14d rorl $17,%edi xorl %r13d,%r12d xorl %r14d,%edi addl 0(%rsp),%r12d addl 28(%rsp),%r12d movl %r9d,%r13d addl %edi,%r12d movl %ebx,%r14d rorl $14,%r13d movl %r10d,%edi xorl %r9d,%r13d rorl $9,%r14d xorl %r11d,%edi movl %r12d,28(%rsp) xorl %ebx,%r14d andl %r9d,%edi rorl $5,%r13d addl %eax,%r12d xorl %r11d,%edi rorl $11,%r14d xorl %r9d,%r13d addl %edi,%r12d movl %ebx,%edi addl 92(%rbp),%r12d xorl %ebx,%r14d xorl %ecx,%edi rorl $6,%r13d movl %ecx,%eax andl %edi,%r15d rorl $2,%r14d addl %r13d,%r12d xorl %r15d,%eax addl %r12d,%r8d addl %r12d,%eax movl 
36(%rsp),%r13d movl 24(%rsp),%r15d movl %r13d,%r12d rorl $11,%r13d addl %r14d,%eax movl %r15d,%r14d rorl $2,%r15d xorl %r12d,%r13d shrl $3,%r12d rorl $7,%r13d xorl %r14d,%r15d shrl $10,%r14d rorl $17,%r15d xorl %r13d,%r12d xorl %r14d,%r15d addl 4(%rsp),%r12d addl 32(%rsp),%r12d movl %r8d,%r13d addl %r15d,%r12d movl %eax,%r14d rorl $14,%r13d movl %r9d,%r15d xorl %r8d,%r13d rorl $9,%r14d xorl %r10d,%r15d movl %r12d,32(%rsp) xorl %eax,%r14d andl %r8d,%r15d rorl $5,%r13d addl %r11d,%r12d xorl %r10d,%r15d rorl $11,%r14d xorl %r8d,%r13d addl %r15d,%r12d movl %eax,%r15d addl 96(%rbp),%r12d xorl %eax,%r14d xorl %ebx,%r15d rorl $6,%r13d movl %ebx,%r11d andl %r15d,%edi rorl $2,%r14d addl %r13d,%r12d xorl %edi,%r11d addl %r12d,%edx addl %r12d,%r11d movl 40(%rsp),%r13d movl 28(%rsp),%edi movl %r13d,%r12d rorl $11,%r13d addl %r14d,%r11d movl %edi,%r14d rorl $2,%edi xorl %r12d,%r13d shrl $3,%r12d rorl $7,%r13d xorl %r14d,%edi shrl $10,%r14d rorl $17,%edi xorl %r13d,%r12d xorl %r14d,%edi addl 8(%rsp),%r12d addl 36(%rsp),%r12d movl %edx,%r13d addl %edi,%r12d movl %r11d,%r14d rorl $14,%r13d movl %r8d,%edi xorl %edx,%r13d rorl $9,%r14d xorl %r9d,%edi movl %r12d,36(%rsp) xorl %r11d,%r14d andl %edx,%edi rorl $5,%r13d addl %r10d,%r12d xorl %r9d,%edi rorl $11,%r14d xorl %edx,%r13d addl %edi,%r12d movl %r11d,%edi addl 100(%rbp),%r12d xorl %r11d,%r14d xorl %eax,%edi rorl $6,%r13d movl %eax,%r10d andl %edi,%r15d rorl $2,%r14d addl %r13d,%r12d xorl %r15d,%r10d addl %r12d,%ecx addl %r12d,%r10d movl 44(%rsp),%r13d movl 32(%rsp),%r15d movl %r13d,%r12d rorl $11,%r13d addl %r14d,%r10d movl %r15d,%r14d rorl $2,%r15d xorl %r12d,%r13d shrl $3,%r12d rorl $7,%r13d xorl %r14d,%r15d shrl $10,%r14d rorl $17,%r15d xorl %r13d,%r12d xorl %r14d,%r15d addl 12(%rsp),%r12d addl 40(%rsp),%r12d movl %ecx,%r13d addl %r15d,%r12d movl %r10d,%r14d rorl $14,%r13d movl %edx,%r15d xorl %ecx,%r13d rorl $9,%r14d xorl %r8d,%r15d movl %r12d,40(%rsp) xorl %r10d,%r14d andl %ecx,%r15d rorl $5,%r13d addl %r9d,%r12d xorl %r8d,%r15d rorl $11,%r14d xorl %ecx,%r13d addl %r15d,%r12d movl %r10d,%r15d addl 104(%rbp),%r12d xorl %r10d,%r14d xorl %r11d,%r15d rorl $6,%r13d movl %r11d,%r9d andl %r15d,%edi rorl $2,%r14d addl %r13d,%r12d xorl %edi,%r9d addl %r12d,%ebx addl %r12d,%r9d movl 48(%rsp),%r13d movl 36(%rsp),%edi movl %r13d,%r12d rorl $11,%r13d addl %r14d,%r9d movl %edi,%r14d rorl $2,%edi xorl %r12d,%r13d shrl $3,%r12d rorl $7,%r13d xorl %r14d,%edi shrl $10,%r14d rorl $17,%edi xorl %r13d,%r12d xorl %r14d,%edi addl 16(%rsp),%r12d addl 44(%rsp),%r12d movl %ebx,%r13d addl %edi,%r12d movl %r9d,%r14d rorl $14,%r13d movl %ecx,%edi xorl %ebx,%r13d rorl $9,%r14d xorl %edx,%edi movl %r12d,44(%rsp) xorl %r9d,%r14d andl %ebx,%edi rorl $5,%r13d addl %r8d,%r12d xorl %edx,%edi rorl $11,%r14d xorl %ebx,%r13d addl %edi,%r12d movl %r9d,%edi addl 108(%rbp),%r12d xorl %r9d,%r14d xorl %r10d,%edi rorl $6,%r13d movl %r10d,%r8d andl %edi,%r15d rorl $2,%r14d addl %r13d,%r12d xorl %r15d,%r8d addl %r12d,%eax addl %r12d,%r8d movl 52(%rsp),%r13d movl 40(%rsp),%r15d movl %r13d,%r12d rorl $11,%r13d addl %r14d,%r8d movl %r15d,%r14d rorl $2,%r15d xorl %r12d,%r13d shrl $3,%r12d rorl $7,%r13d xorl %r14d,%r15d shrl $10,%r14d rorl $17,%r15d xorl %r13d,%r12d xorl %r14d,%r15d addl 20(%rsp),%r12d addl 48(%rsp),%r12d movl %eax,%r13d addl %r15d,%r12d movl %r8d,%r14d rorl $14,%r13d movl %ebx,%r15d xorl %eax,%r13d rorl $9,%r14d xorl %ecx,%r15d movl %r12d,48(%rsp) xorl %r8d,%r14d andl %eax,%r15d rorl $5,%r13d addl %edx,%r12d xorl %ecx,%r15d rorl $11,%r14d xorl %eax,%r13d addl %r15d,%r12d movl 
%r8d,%r15d addl 112(%rbp),%r12d xorl %r8d,%r14d xorl %r9d,%r15d rorl $6,%r13d movl %r9d,%edx andl %r15d,%edi rorl $2,%r14d addl %r13d,%r12d xorl %edi,%edx addl %r12d,%r11d addl %r12d,%edx movl 56(%rsp),%r13d movl 44(%rsp),%edi movl %r13d,%r12d rorl $11,%r13d addl %r14d,%edx movl %edi,%r14d rorl $2,%edi xorl %r12d,%r13d shrl $3,%r12d rorl $7,%r13d xorl %r14d,%edi shrl $10,%r14d rorl $17,%edi xorl %r13d,%r12d xorl %r14d,%edi addl 24(%rsp),%r12d addl 52(%rsp),%r12d movl %r11d,%r13d addl %edi,%r12d movl %edx,%r14d rorl $14,%r13d movl %eax,%edi xorl %r11d,%r13d rorl $9,%r14d xorl %ebx,%edi movl %r12d,52(%rsp) xorl %edx,%r14d andl %r11d,%edi rorl $5,%r13d addl %ecx,%r12d xorl %ebx,%edi rorl $11,%r14d xorl %r11d,%r13d addl %edi,%r12d movl %edx,%edi addl 116(%rbp),%r12d xorl %edx,%r14d xorl %r8d,%edi rorl $6,%r13d movl %r8d,%ecx andl %edi,%r15d rorl $2,%r14d addl %r13d,%r12d xorl %r15d,%ecx addl %r12d,%r10d addl %r12d,%ecx movl 60(%rsp),%r13d movl 48(%rsp),%r15d movl %r13d,%r12d rorl $11,%r13d addl %r14d,%ecx movl %r15d,%r14d rorl $2,%r15d xorl %r12d,%r13d shrl $3,%r12d rorl $7,%r13d xorl %r14d,%r15d shrl $10,%r14d rorl $17,%r15d xorl %r13d,%r12d xorl %r14d,%r15d addl 28(%rsp),%r12d addl 56(%rsp),%r12d movl %r10d,%r13d addl %r15d,%r12d movl %ecx,%r14d rorl $14,%r13d movl %r11d,%r15d xorl %r10d,%r13d rorl $9,%r14d xorl %eax,%r15d movl %r12d,56(%rsp) xorl %ecx,%r14d andl %r10d,%r15d rorl $5,%r13d addl %ebx,%r12d xorl %eax,%r15d rorl $11,%r14d xorl %r10d,%r13d addl %r15d,%r12d movl %ecx,%r15d addl 120(%rbp),%r12d xorl %ecx,%r14d xorl %edx,%r15d rorl $6,%r13d movl %edx,%ebx andl %r15d,%edi rorl $2,%r14d addl %r13d,%r12d xorl %edi,%ebx addl %r12d,%r9d addl %r12d,%ebx movl 0(%rsp),%r13d movl 52(%rsp),%edi movl %r13d,%r12d rorl $11,%r13d addl %r14d,%ebx movl %edi,%r14d rorl $2,%edi xorl %r12d,%r13d shrl $3,%r12d rorl $7,%r13d xorl %r14d,%edi shrl $10,%r14d rorl $17,%edi xorl %r13d,%r12d xorl %r14d,%edi addl 32(%rsp),%r12d addl 60(%rsp),%r12d movl %r9d,%r13d addl %edi,%r12d movl %ebx,%r14d rorl $14,%r13d movl %r10d,%edi xorl %r9d,%r13d rorl $9,%r14d xorl %r11d,%edi movl %r12d,60(%rsp) xorl %ebx,%r14d andl %r9d,%edi rorl $5,%r13d addl %eax,%r12d xorl %r11d,%edi rorl $11,%r14d xorl %r9d,%r13d addl %edi,%r12d movl %ebx,%edi addl 124(%rbp),%r12d xorl %ebx,%r14d xorl %ecx,%edi rorl $6,%r13d movl %ecx,%eax andl %edi,%r15d rorl $2,%r14d addl %r13d,%r12d xorl %r15d,%eax addl %r12d,%r8d addl %r12d,%eax leaq 64(%rbp),%rbp cmpb $0x19,3(%rbp) jnz .Lrounds_16_xx movq 64+0(%rsp),%rdi addl %r14d,%eax leaq 64(%rsi),%rsi addl 0(%rdi),%eax addl 4(%rdi),%ebx addl 8(%rdi),%ecx addl 12(%rdi),%edx addl 16(%rdi),%r8d addl 20(%rdi),%r9d addl 24(%rdi),%r10d addl 28(%rdi),%r11d cmpq 64+16(%rsp),%rsi movl %eax,0(%rdi) movl %ebx,4(%rdi) movl %ecx,8(%rdi) movl %edx,12(%rdi) movl %r8d,16(%rdi) movl %r9d,20(%rdi) movl %r10d,24(%rdi) movl %r11d,28(%rdi) jb .Lloop leaq 64+24+48(%rsp),%r11 .cfi_def_cfa %r11,8 movq 64+24(%rsp),%r15 movq -40(%r11),%r14 movq -32(%r11),%r13 movq -24(%r11),%r12 movq -16(%r11),%rbx movq -8(%r11),%rbp .cfi_restore %r12 .cfi_restore %r13 .cfi_restore %r14 .cfi_restore %r15 .cfi_restore %rbp .cfi_restore %rbx leaq (%r11),%rsp #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .size blst_sha256_block_data_order,.-blst_sha256_block_data_order #ifndef __BLST_PORTABLE__ .section .rodata .align 64 .type K256,@object K256: .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 .long 
0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 .byte 83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,64,100,111,116,45,97,115,109,0 .globl blst_sha256_emit .hidden blst_sha256_emit .type blst_sha256_emit,@function .align 16 blst_sha256_emit: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa movq 0(%rsi),%r8 movq 8(%rsi),%r9 movq 16(%rsi),%r10 bswapq %r8 movq 24(%rsi),%r11 bswapq %r9 movl %r8d,4(%rdi) bswapq %r10 movl %r9d,12(%rdi) bswapq %r11 movl %r10d,20(%rdi) shrq $32,%r8 movl %r11d,28(%rdi) shrq $32,%r9 movl %r8d,0(%rdi) shrq $32,%r10 movl %r9d,8(%rdi) shrq $32,%r11 movl %r10d,16(%rdi) movl %r11d,24(%rdi) #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .size blst_sha256_emit,.-blst_sha256_emit .globl blst_sha256_bcopy .hidden blst_sha256_bcopy .type blst_sha256_bcopy,@function .align 16 blst_sha256_bcopy: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa subq %rsi,%rdi .Loop_bcopy: movzbl (%rsi),%eax leaq 1(%rsi),%rsi movb %al,-1(%rdi,%rsi,1) decq %rdx jnz .Loop_bcopy #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .size blst_sha256_bcopy,.-blst_sha256_bcopy .globl blst_sha256_hcopy .hidden blst_sha256_hcopy .type blst_sha256_hcopy,@function .align 16 blst_sha256_hcopy: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa movq 0(%rsi),%r8 movq 8(%rsi),%r9 movq 16(%rsi),%r10 movq 24(%rsi),%r11 movq %r8,0(%rdi) movq %r9,8(%rdi) movq %r10,16(%rdi) movq %r11,24(%rdi) #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .size blst_sha256_hcopy,.-blst_sha256_hcopy #endif .section .note.GNU-stack,"",@progbits #ifndef __SGX_LVI_HARDENING__ .section .note.gnu.property,"a",@note .long 4,2f-1f,5 .byte 0x47,0x4E,0x55,0 1: .long 0xc0000002,4,3 .align 8 2: #endif ================================================ FILE: build/elf/sha256-x86_64.s ================================================ .comm __blst_platform_cap,4 .section .rodata .align 64 .type K256,@object K256: .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 .long 
0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f .long 0x03020100,0x0b0a0908,0xffffffff,0xffffffff .long 0xffffffff,0xffffffff,0x03020100,0x0b0a0908 .byte 83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,64,100,111,116,45,97,115,109,0 .text .globl blst_sha256_block_data_order_shaext .hidden blst_sha256_block_data_order_shaext .type blst_sha256_block_data_order_shaext,@function .align 64 blst_sha256_block_data_order_shaext: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 movq %rsp,%rbp .cfi_def_cfa_register %rbp .Lblst_sha256_block_data_order$2: #ifdef __SGX_LVI_HARDENING__ lfence #endif leaq K256+128(%rip),%rcx movdqu (%rdi),%xmm1 movdqu 16(%rdi),%xmm2 movdqa 256-128(%rcx),%xmm7 pshufd $0x1b,%xmm1,%xmm0 pshufd $0xb1,%xmm1,%xmm1 pshufd $0x1b,%xmm2,%xmm2 movdqa %xmm7,%xmm8 .byte 102,15,58,15,202,8 punpcklqdq %xmm0,%xmm2 jmp .Loop_shaext .align 16 .Loop_shaext: movdqu (%rsi),%xmm3 movdqu 16(%rsi),%xmm4 movdqu 32(%rsi),%xmm5 .byte 102,15,56,0,223 movdqu 48(%rsi),%xmm6 movdqa 0-128(%rcx),%xmm0 paddd %xmm3,%xmm0 .byte 102,15,56,0,231 movdqa %xmm2,%xmm10 .byte 15,56,203,209 pshufd $0x0e,%xmm0,%xmm0 nop movdqa %xmm1,%xmm9 .byte 15,56,203,202 movdqa 16-128(%rcx),%xmm0 paddd %xmm4,%xmm0 .byte 102,15,56,0,239 .byte 15,56,203,209 pshufd $0x0e,%xmm0,%xmm0 leaq 64(%rsi),%rsi .byte 15,56,204,220 .byte 15,56,203,202 movdqa 32-128(%rcx),%xmm0 paddd %xmm5,%xmm0 .byte 102,15,56,0,247 .byte 15,56,203,209 pshufd $0x0e,%xmm0,%xmm0 movdqa %xmm6,%xmm7 .byte 102,15,58,15,253,4 nop paddd %xmm7,%xmm3 .byte 15,56,204,229 .byte 15,56,203,202 movdqa 48-128(%rcx),%xmm0 paddd %xmm6,%xmm0 .byte 15,56,205,222 .byte 15,56,203,209 pshufd $0x0e,%xmm0,%xmm0 movdqa %xmm3,%xmm7 .byte 102,15,58,15,254,4 nop paddd %xmm7,%xmm4 .byte 15,56,204,238 .byte 15,56,203,202 movdqa 64-128(%rcx),%xmm0 paddd %xmm3,%xmm0 .byte 15,56,205,227 .byte 15,56,203,209 pshufd $0x0e,%xmm0,%xmm0 movdqa %xmm4,%xmm7 .byte 102,15,58,15,251,4 nop paddd %xmm7,%xmm5 .byte 15,56,204,243 .byte 15,56,203,202 movdqa 80-128(%rcx),%xmm0 paddd %xmm4,%xmm0 .byte 15,56,205,236 .byte 15,56,203,209 pshufd $0x0e,%xmm0,%xmm0 movdqa %xmm5,%xmm7 .byte 102,15,58,15,252,4 nop paddd %xmm7,%xmm6 .byte 15,56,204,220 .byte 15,56,203,202 movdqa 96-128(%rcx),%xmm0 paddd %xmm5,%xmm0 .byte 15,56,205,245 .byte 15,56,203,209 pshufd $0x0e,%xmm0,%xmm0 movdqa %xmm6,%xmm7 .byte 102,15,58,15,253,4 nop paddd %xmm7,%xmm3 .byte 15,56,204,229 .byte 15,56,203,202 movdqa 112-128(%rcx),%xmm0 paddd %xmm6,%xmm0 .byte 15,56,205,222 .byte 15,56,203,209 pshufd $0x0e,%xmm0,%xmm0 movdqa %xmm3,%xmm7 .byte 102,15,58,15,254,4 nop paddd %xmm7,%xmm4 .byte 15,56,204,238 .byte 15,56,203,202 movdqa 128-128(%rcx),%xmm0 paddd %xmm3,%xmm0 .byte 15,56,205,227 .byte 15,56,203,209 pshufd $0x0e,%xmm0,%xmm0 movdqa %xmm4,%xmm7 .byte 102,15,58,15,251,4 nop paddd %xmm7,%xmm5 .byte 15,56,204,243 .byte 15,56,203,202 movdqa 144-128(%rcx),%xmm0 paddd %xmm4,%xmm0 .byte 15,56,205,236 .byte 15,56,203,209 pshufd $0x0e,%xmm0,%xmm0 movdqa %xmm5,%xmm7 .byte 102,15,58,15,252,4 nop paddd %xmm7,%xmm6 .byte 15,56,204,220 .byte 15,56,203,202 movdqa 160-128(%rcx),%xmm0 paddd %xmm5,%xmm0 .byte 15,56,205,245 .byte 15,56,203,209 pshufd $0x0e,%xmm0,%xmm0 movdqa %xmm6,%xmm7 .byte 102,15,58,15,253,4 nop paddd %xmm7,%xmm3 .byte 15,56,204,229 .byte 15,56,203,202 movdqa 176-128(%rcx),%xmm0 paddd %xmm6,%xmm0 .byte 15,56,205,222 .byte 
15,56,203,209 pshufd $0x0e,%xmm0,%xmm0 movdqa %xmm3,%xmm7 .byte 102,15,58,15,254,4 nop paddd %xmm7,%xmm4 .byte 15,56,204,238 .byte 15,56,203,202 movdqa 192-128(%rcx),%xmm0 paddd %xmm3,%xmm0 .byte 15,56,205,227 .byte 15,56,203,209 pshufd $0x0e,%xmm0,%xmm0 movdqa %xmm4,%xmm7 .byte 102,15,58,15,251,4 nop paddd %xmm7,%xmm5 .byte 15,56,204,243 .byte 15,56,203,202 movdqa 208-128(%rcx),%xmm0 paddd %xmm4,%xmm0 .byte 15,56,205,236 .byte 15,56,203,209 pshufd $0x0e,%xmm0,%xmm0 movdqa %xmm5,%xmm7 .byte 102,15,58,15,252,4 .byte 15,56,203,202 paddd %xmm7,%xmm6 movdqa 224-128(%rcx),%xmm0 paddd %xmm5,%xmm0 .byte 15,56,203,209 pshufd $0x0e,%xmm0,%xmm0 .byte 15,56,205,245 movdqa %xmm8,%xmm7 .byte 15,56,203,202 movdqa 240-128(%rcx),%xmm0 paddd %xmm6,%xmm0 nop .byte 15,56,203,209 pshufd $0x0e,%xmm0,%xmm0 decq %rdx nop .byte 15,56,203,202 paddd %xmm10,%xmm2 paddd %xmm9,%xmm1 jnz .Loop_shaext pshufd $0xb1,%xmm2,%xmm2 pshufd $0x1b,%xmm1,%xmm7 pshufd $0xb1,%xmm1,%xmm1 punpckhqdq %xmm2,%xmm1 .byte 102,15,58,15,215,8 movdqu %xmm1,(%rdi) movdqu %xmm2,16(%rdi) .cfi_def_cfa_register %rsp popq %rbp .cfi_adjust_cfa_offset -8 .cfi_restore %rbp #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .size blst_sha256_block_data_order_shaext,.-blst_sha256_block_data_order_shaext .globl blst_sha256_block_data_order .hidden blst_sha256_block_data_order .type blst_sha256_block_data_order,@function .align 64 blst_sha256_block_data_order: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 movq %rsp,%rbp .cfi_def_cfa_register %rbp #ifndef __SGX_LVI_HARDENING__ testl $2,__blst_platform_cap(%rip) jnz .Lblst_sha256_block_data_order$2 #endif pushq %rbx .cfi_offset %rbx,-24 pushq %r12 .cfi_offset %r12,-32 pushq %r13 .cfi_offset %r13,-40 pushq %r14 .cfi_offset %r14,-48 pushq %r15 .cfi_offset %r15,-56 shlq $4,%rdx subq $24,%rsp leaq (%rsi,%rdx,4),%rdx movq %rdi,-64(%rbp) movq %rdx,-48(%rbp) leaq -64(%rsp),%rsp #ifdef __SGX_LVI_HARDENING__ lfence #endif movl 0(%rdi),%eax andq $-64,%rsp movl 4(%rdi),%ebx movl 8(%rdi),%ecx movl 12(%rdi),%edx movl 16(%rdi),%r8d movl 20(%rdi),%r9d movl 24(%rdi),%r10d movl 28(%rdi),%r11d jmp .Lloop_ssse3 .align 16 .Lloop_ssse3: movdqa K256+256(%rip),%xmm7 movq %rsi,-56(%rbp) movdqu 0(%rsi),%xmm0 movdqu 16(%rsi),%xmm1 movdqu 32(%rsi),%xmm2 .byte 102,15,56,0,199 movdqu 48(%rsi),%xmm3 leaq K256(%rip),%rsi .byte 102,15,56,0,207 movdqa 0(%rsi),%xmm4 movdqa 16(%rsi),%xmm5 .byte 102,15,56,0,215 paddd %xmm0,%xmm4 movdqa 32(%rsi),%xmm6 .byte 102,15,56,0,223 movdqa 48(%rsi),%xmm7 paddd %xmm1,%xmm5 paddd %xmm2,%xmm6 paddd %xmm3,%xmm7 movdqa %xmm4,0(%rsp) movl %eax,%r14d movdqa %xmm5,16(%rsp) movl %ebx,%edi movdqa %xmm6,32(%rsp) xorl %ecx,%edi movdqa %xmm7,48(%rsp) movl %r8d,%r13d jmp .Lssse3_00_47 .align 16 .Lssse3_00_47: subq $-64,%rsi rorl $14,%r13d movdqa %xmm1,%xmm4 movl %r14d,%eax movl %r9d,%r12d movdqa %xmm3,%xmm7 rorl $9,%r14d xorl %r8d,%r13d xorl %r10d,%r12d rorl $5,%r13d xorl %eax,%r14d .byte 102,15,58,15,224,4 andl %r8d,%r12d xorl %r8d,%r13d .byte 102,15,58,15,250,4 addl 0(%rsp),%r11d movl %eax,%r15d xorl %r10d,%r12d rorl $11,%r14d movdqa %xmm4,%xmm5 xorl %ebx,%r15d addl %r12d,%r11d movdqa %xmm4,%xmm6 rorl $6,%r13d andl %r15d,%edi psrld $3,%xmm4 xorl %eax,%r14d addl %r13d,%r11d xorl %ebx,%edi paddd %xmm7,%xmm0 rorl $2,%r14d addl %r11d,%edx psrld $7,%xmm6 addl %edi,%r11d movl %edx,%r13d pshufd $250,%xmm3,%xmm7 addl %r11d,%r14d rorl $14,%r13d pslld $14,%xmm5 movl %r14d,%r11d movl %r8d,%r12d pxor %xmm6,%xmm4 rorl 
$9,%r14d xorl %edx,%r13d xorl %r9d,%r12d rorl $5,%r13d psrld $11,%xmm6 xorl %r11d,%r14d pxor %xmm5,%xmm4 andl %edx,%r12d xorl %edx,%r13d pslld $11,%xmm5 addl 4(%rsp),%r10d movl %r11d,%edi pxor %xmm6,%xmm4 xorl %r9d,%r12d rorl $11,%r14d movdqa %xmm7,%xmm6 xorl %eax,%edi addl %r12d,%r10d pxor %xmm5,%xmm4 rorl $6,%r13d andl %edi,%r15d xorl %r11d,%r14d psrld $10,%xmm7 addl %r13d,%r10d xorl %eax,%r15d paddd %xmm4,%xmm0 rorl $2,%r14d addl %r10d,%ecx psrlq $17,%xmm6 addl %r15d,%r10d movl %ecx,%r13d addl %r10d,%r14d pxor %xmm6,%xmm7 rorl $14,%r13d movl %r14d,%r10d movl %edx,%r12d rorl $9,%r14d psrlq $2,%xmm6 xorl %ecx,%r13d xorl %r8d,%r12d pxor %xmm6,%xmm7 rorl $5,%r13d xorl %r10d,%r14d andl %ecx,%r12d pshufd $128,%xmm7,%xmm7 xorl %ecx,%r13d addl 8(%rsp),%r9d movl %r10d,%r15d psrldq $8,%xmm7 xorl %r8d,%r12d rorl $11,%r14d xorl %r11d,%r15d addl %r12d,%r9d rorl $6,%r13d paddd %xmm7,%xmm0 andl %r15d,%edi xorl %r10d,%r14d addl %r13d,%r9d pshufd $80,%xmm0,%xmm7 xorl %r11d,%edi rorl $2,%r14d addl %r9d,%ebx movdqa %xmm7,%xmm6 addl %edi,%r9d movl %ebx,%r13d psrld $10,%xmm7 addl %r9d,%r14d rorl $14,%r13d psrlq $17,%xmm6 movl %r14d,%r9d movl %ecx,%r12d pxor %xmm6,%xmm7 rorl $9,%r14d xorl %ebx,%r13d xorl %edx,%r12d rorl $5,%r13d xorl %r9d,%r14d psrlq $2,%xmm6 andl %ebx,%r12d xorl %ebx,%r13d addl 12(%rsp),%r8d pxor %xmm6,%xmm7 movl %r9d,%edi xorl %edx,%r12d rorl $11,%r14d pshufd $8,%xmm7,%xmm7 xorl %r10d,%edi addl %r12d,%r8d movdqa 0(%rsi),%xmm6 rorl $6,%r13d andl %edi,%r15d pslldq $8,%xmm7 xorl %r9d,%r14d addl %r13d,%r8d xorl %r10d,%r15d paddd %xmm7,%xmm0 rorl $2,%r14d addl %r8d,%eax addl %r15d,%r8d paddd %xmm0,%xmm6 movl %eax,%r13d addl %r8d,%r14d movdqa %xmm6,0(%rsp) rorl $14,%r13d movdqa %xmm2,%xmm4 movl %r14d,%r8d movl %ebx,%r12d movdqa %xmm0,%xmm7 rorl $9,%r14d xorl %eax,%r13d xorl %ecx,%r12d rorl $5,%r13d xorl %r8d,%r14d .byte 102,15,58,15,225,4 andl %eax,%r12d xorl %eax,%r13d .byte 102,15,58,15,251,4 addl 16(%rsp),%edx movl %r8d,%r15d xorl %ecx,%r12d rorl $11,%r14d movdqa %xmm4,%xmm5 xorl %r9d,%r15d addl %r12d,%edx movdqa %xmm4,%xmm6 rorl $6,%r13d andl %r15d,%edi psrld $3,%xmm4 xorl %r8d,%r14d addl %r13d,%edx xorl %r9d,%edi paddd %xmm7,%xmm1 rorl $2,%r14d addl %edx,%r11d psrld $7,%xmm6 addl %edi,%edx movl %r11d,%r13d pshufd $250,%xmm0,%xmm7 addl %edx,%r14d rorl $14,%r13d pslld $14,%xmm5 movl %r14d,%edx movl %eax,%r12d pxor %xmm6,%xmm4 rorl $9,%r14d xorl %r11d,%r13d xorl %ebx,%r12d rorl $5,%r13d psrld $11,%xmm6 xorl %edx,%r14d pxor %xmm5,%xmm4 andl %r11d,%r12d xorl %r11d,%r13d pslld $11,%xmm5 addl 20(%rsp),%ecx movl %edx,%edi pxor %xmm6,%xmm4 xorl %ebx,%r12d rorl $11,%r14d movdqa %xmm7,%xmm6 xorl %r8d,%edi addl %r12d,%ecx pxor %xmm5,%xmm4 rorl $6,%r13d andl %edi,%r15d xorl %edx,%r14d psrld $10,%xmm7 addl %r13d,%ecx xorl %r8d,%r15d paddd %xmm4,%xmm1 rorl $2,%r14d addl %ecx,%r10d psrlq $17,%xmm6 addl %r15d,%ecx movl %r10d,%r13d addl %ecx,%r14d pxor %xmm6,%xmm7 rorl $14,%r13d movl %r14d,%ecx movl %r11d,%r12d rorl $9,%r14d psrlq $2,%xmm6 xorl %r10d,%r13d xorl %eax,%r12d pxor %xmm6,%xmm7 rorl $5,%r13d xorl %ecx,%r14d andl %r10d,%r12d pshufd $128,%xmm7,%xmm7 xorl %r10d,%r13d addl 24(%rsp),%ebx movl %ecx,%r15d psrldq $8,%xmm7 xorl %eax,%r12d rorl $11,%r14d xorl %edx,%r15d addl %r12d,%ebx rorl $6,%r13d paddd %xmm7,%xmm1 andl %r15d,%edi xorl %ecx,%r14d addl %r13d,%ebx pshufd $80,%xmm1,%xmm7 xorl %edx,%edi rorl $2,%r14d addl %ebx,%r9d movdqa %xmm7,%xmm6 addl %edi,%ebx movl %r9d,%r13d psrld $10,%xmm7 addl %ebx,%r14d rorl $14,%r13d psrlq $17,%xmm6 movl %r14d,%ebx movl %r10d,%r12d pxor %xmm6,%xmm7 rorl $9,%r14d 
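/*
 * The interleaved SSSE3 half of these rounds advances the SHA-256
 * message schedule four words at a time while the scalar half runs
 * the rounds.  Per FIPS 180-4 the schedule recurrence is
 *     W[t] = sigma1(W[t-2]) + W[t-7] + sigma0(W[t-15]) + W[t-16]
 * with sigma0(x) = ROTR7(x) ^ ROTR18(x) ^ (x >> 3) and
 *      sigma1(x) = ROTR17(x) ^ ROTR19(x) ^ (x >> 10);
 * the psrld/pslld/psrlq pairs synthesize these 32-bit rotations,
 * since SSSE3 has no packed-rotate instruction.
 */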
xorl %r9d,%r13d xorl %r11d,%r12d rorl $5,%r13d xorl %ebx,%r14d psrlq $2,%xmm6 andl %r9d,%r12d xorl %r9d,%r13d addl 28(%rsp),%eax pxor %xmm6,%xmm7 movl %ebx,%edi xorl %r11d,%r12d rorl $11,%r14d pshufd $8,%xmm7,%xmm7 xorl %ecx,%edi addl %r12d,%eax movdqa 16(%rsi),%xmm6 rorl $6,%r13d andl %edi,%r15d pslldq $8,%xmm7 xorl %ebx,%r14d addl %r13d,%eax xorl %ecx,%r15d paddd %xmm7,%xmm1 rorl $2,%r14d addl %eax,%r8d addl %r15d,%eax paddd %xmm1,%xmm6 movl %r8d,%r13d addl %eax,%r14d movdqa %xmm6,16(%rsp) rorl $14,%r13d movdqa %xmm3,%xmm4 movl %r14d,%eax movl %r9d,%r12d movdqa %xmm1,%xmm7 rorl $9,%r14d xorl %r8d,%r13d xorl %r10d,%r12d rorl $5,%r13d xorl %eax,%r14d .byte 102,15,58,15,226,4 andl %r8d,%r12d xorl %r8d,%r13d .byte 102,15,58,15,248,4 addl 32(%rsp),%r11d movl %eax,%r15d xorl %r10d,%r12d rorl $11,%r14d movdqa %xmm4,%xmm5 xorl %ebx,%r15d addl %r12d,%r11d movdqa %xmm4,%xmm6 rorl $6,%r13d andl %r15d,%edi psrld $3,%xmm4 xorl %eax,%r14d addl %r13d,%r11d xorl %ebx,%edi paddd %xmm7,%xmm2 rorl $2,%r14d addl %r11d,%edx psrld $7,%xmm6 addl %edi,%r11d movl %edx,%r13d pshufd $250,%xmm1,%xmm7 addl %r11d,%r14d rorl $14,%r13d pslld $14,%xmm5 movl %r14d,%r11d movl %r8d,%r12d pxor %xmm6,%xmm4 rorl $9,%r14d xorl %edx,%r13d xorl %r9d,%r12d rorl $5,%r13d psrld $11,%xmm6 xorl %r11d,%r14d pxor %xmm5,%xmm4 andl %edx,%r12d xorl %edx,%r13d pslld $11,%xmm5 addl 36(%rsp),%r10d movl %r11d,%edi pxor %xmm6,%xmm4 xorl %r9d,%r12d rorl $11,%r14d movdqa %xmm7,%xmm6 xorl %eax,%edi addl %r12d,%r10d pxor %xmm5,%xmm4 rorl $6,%r13d andl %edi,%r15d xorl %r11d,%r14d psrld $10,%xmm7 addl %r13d,%r10d xorl %eax,%r15d paddd %xmm4,%xmm2 rorl $2,%r14d addl %r10d,%ecx psrlq $17,%xmm6 addl %r15d,%r10d movl %ecx,%r13d addl %r10d,%r14d pxor %xmm6,%xmm7 rorl $14,%r13d movl %r14d,%r10d movl %edx,%r12d rorl $9,%r14d psrlq $2,%xmm6 xorl %ecx,%r13d xorl %r8d,%r12d pxor %xmm6,%xmm7 rorl $5,%r13d xorl %r10d,%r14d andl %ecx,%r12d pshufd $128,%xmm7,%xmm7 xorl %ecx,%r13d addl 40(%rsp),%r9d movl %r10d,%r15d psrldq $8,%xmm7 xorl %r8d,%r12d rorl $11,%r14d xorl %r11d,%r15d addl %r12d,%r9d rorl $6,%r13d paddd %xmm7,%xmm2 andl %r15d,%edi xorl %r10d,%r14d addl %r13d,%r9d pshufd $80,%xmm2,%xmm7 xorl %r11d,%edi rorl $2,%r14d addl %r9d,%ebx movdqa %xmm7,%xmm6 addl %edi,%r9d movl %ebx,%r13d psrld $10,%xmm7 addl %r9d,%r14d rorl $14,%r13d psrlq $17,%xmm6 movl %r14d,%r9d movl %ecx,%r12d pxor %xmm6,%xmm7 rorl $9,%r14d xorl %ebx,%r13d xorl %edx,%r12d rorl $5,%r13d xorl %r9d,%r14d psrlq $2,%xmm6 andl %ebx,%r12d xorl %ebx,%r13d addl 44(%rsp),%r8d pxor %xmm6,%xmm7 movl %r9d,%edi xorl %edx,%r12d rorl $11,%r14d pshufd $8,%xmm7,%xmm7 xorl %r10d,%edi addl %r12d,%r8d movdqa 32(%rsi),%xmm6 rorl $6,%r13d andl %edi,%r15d pslldq $8,%xmm7 xorl %r9d,%r14d addl %r13d,%r8d xorl %r10d,%r15d paddd %xmm7,%xmm2 rorl $2,%r14d addl %r8d,%eax addl %r15d,%r8d paddd %xmm2,%xmm6 movl %eax,%r13d addl %r8d,%r14d movdqa %xmm6,32(%rsp) rorl $14,%r13d movdqa %xmm0,%xmm4 movl %r14d,%r8d movl %ebx,%r12d movdqa %xmm2,%xmm7 rorl $9,%r14d xorl %eax,%r13d xorl %ecx,%r12d rorl $5,%r13d xorl %r8d,%r14d .byte 102,15,58,15,227,4 andl %eax,%r12d xorl %eax,%r13d .byte 102,15,58,15,249,4 addl 48(%rsp),%edx movl %r8d,%r15d xorl %ecx,%r12d rorl $11,%r14d movdqa %xmm4,%xmm5 xorl %r9d,%r15d addl %r12d,%edx movdqa %xmm4,%xmm6 rorl $6,%r13d andl %r15d,%edi psrld $3,%xmm4 xorl %r8d,%r14d addl %r13d,%edx xorl %r9d,%edi paddd %xmm7,%xmm3 rorl $2,%r14d addl %edx,%r11d psrld $7,%xmm6 addl %edi,%edx movl %r11d,%r13d pshufd $250,%xmm2,%xmm7 addl %edx,%r14d rorl $14,%r13d pslld $14,%xmm5 movl %r14d,%edx movl %eax,%r12d pxor 
%xmm6,%xmm4 rorl $9,%r14d xorl %r11d,%r13d xorl %ebx,%r12d rorl $5,%r13d psrld $11,%xmm6 xorl %edx,%r14d pxor %xmm5,%xmm4 andl %r11d,%r12d xorl %r11d,%r13d pslld $11,%xmm5 addl 52(%rsp),%ecx movl %edx,%edi pxor %xmm6,%xmm4 xorl %ebx,%r12d rorl $11,%r14d movdqa %xmm7,%xmm6 xorl %r8d,%edi addl %r12d,%ecx pxor %xmm5,%xmm4 rorl $6,%r13d andl %edi,%r15d xorl %edx,%r14d psrld $10,%xmm7 addl %r13d,%ecx xorl %r8d,%r15d paddd %xmm4,%xmm3 rorl $2,%r14d addl %ecx,%r10d psrlq $17,%xmm6 addl %r15d,%ecx movl %r10d,%r13d addl %ecx,%r14d pxor %xmm6,%xmm7 rorl $14,%r13d movl %r14d,%ecx movl %r11d,%r12d rorl $9,%r14d psrlq $2,%xmm6 xorl %r10d,%r13d xorl %eax,%r12d pxor %xmm6,%xmm7 rorl $5,%r13d xorl %ecx,%r14d andl %r10d,%r12d pshufd $128,%xmm7,%xmm7 xorl %r10d,%r13d addl 56(%rsp),%ebx movl %ecx,%r15d psrldq $8,%xmm7 xorl %eax,%r12d rorl $11,%r14d xorl %edx,%r15d addl %r12d,%ebx rorl $6,%r13d paddd %xmm7,%xmm3 andl %r15d,%edi xorl %ecx,%r14d addl %r13d,%ebx pshufd $80,%xmm3,%xmm7 xorl %edx,%edi rorl $2,%r14d addl %ebx,%r9d movdqa %xmm7,%xmm6 addl %edi,%ebx movl %r9d,%r13d psrld $10,%xmm7 addl %ebx,%r14d rorl $14,%r13d psrlq $17,%xmm6 movl %r14d,%ebx movl %r10d,%r12d pxor %xmm6,%xmm7 rorl $9,%r14d xorl %r9d,%r13d xorl %r11d,%r12d rorl $5,%r13d xorl %ebx,%r14d psrlq $2,%xmm6 andl %r9d,%r12d xorl %r9d,%r13d addl 60(%rsp),%eax pxor %xmm6,%xmm7 movl %ebx,%edi xorl %r11d,%r12d rorl $11,%r14d pshufd $8,%xmm7,%xmm7 xorl %ecx,%edi addl %r12d,%eax movdqa 48(%rsi),%xmm6 rorl $6,%r13d andl %edi,%r15d pslldq $8,%xmm7 xorl %ebx,%r14d addl %r13d,%eax xorl %ecx,%r15d paddd %xmm7,%xmm3 rorl $2,%r14d addl %eax,%r8d addl %r15d,%eax paddd %xmm3,%xmm6 movl %r8d,%r13d addl %eax,%r14d movdqa %xmm6,48(%rsp) cmpb $0,67(%rsi) jne .Lssse3_00_47 rorl $14,%r13d movl %r14d,%eax movl %r9d,%r12d rorl $9,%r14d xorl %r8d,%r13d xorl %r10d,%r12d rorl $5,%r13d xorl %eax,%r14d andl %r8d,%r12d xorl %r8d,%r13d addl 0(%rsp),%r11d movl %eax,%r15d xorl %r10d,%r12d rorl $11,%r14d xorl %ebx,%r15d addl %r12d,%r11d rorl $6,%r13d andl %r15d,%edi xorl %eax,%r14d addl %r13d,%r11d xorl %ebx,%edi rorl $2,%r14d addl %r11d,%edx addl %edi,%r11d movl %edx,%r13d addl %r11d,%r14d rorl $14,%r13d movl %r14d,%r11d movl %r8d,%r12d rorl $9,%r14d xorl %edx,%r13d xorl %r9d,%r12d rorl $5,%r13d xorl %r11d,%r14d andl %edx,%r12d xorl %edx,%r13d addl 4(%rsp),%r10d movl %r11d,%edi xorl %r9d,%r12d rorl $11,%r14d xorl %eax,%edi addl %r12d,%r10d rorl $6,%r13d andl %edi,%r15d xorl %r11d,%r14d addl %r13d,%r10d xorl %eax,%r15d rorl $2,%r14d addl %r10d,%ecx addl %r15d,%r10d movl %ecx,%r13d addl %r10d,%r14d rorl $14,%r13d movl %r14d,%r10d movl %edx,%r12d rorl $9,%r14d xorl %ecx,%r13d xorl %r8d,%r12d rorl $5,%r13d xorl %r10d,%r14d andl %ecx,%r12d xorl %ecx,%r13d addl 8(%rsp),%r9d movl %r10d,%r15d xorl %r8d,%r12d rorl $11,%r14d xorl %r11d,%r15d addl %r12d,%r9d rorl $6,%r13d andl %r15d,%edi xorl %r10d,%r14d addl %r13d,%r9d xorl %r11d,%edi rorl $2,%r14d addl %r9d,%ebx addl %edi,%r9d movl %ebx,%r13d addl %r9d,%r14d rorl $14,%r13d movl %r14d,%r9d movl %ecx,%r12d rorl $9,%r14d xorl %ebx,%r13d xorl %edx,%r12d rorl $5,%r13d xorl %r9d,%r14d andl %ebx,%r12d xorl %ebx,%r13d addl 12(%rsp),%r8d movl %r9d,%edi xorl %edx,%r12d rorl $11,%r14d xorl %r10d,%edi addl %r12d,%r8d rorl $6,%r13d andl %edi,%r15d xorl %r9d,%r14d addl %r13d,%r8d xorl %r10d,%r15d rorl $2,%r14d addl %r8d,%eax addl %r15d,%r8d movl %eax,%r13d addl %r8d,%r14d rorl $14,%r13d movl %r14d,%r8d movl %ebx,%r12d rorl $9,%r14d xorl %eax,%r13d xorl %ecx,%r12d rorl $5,%r13d xorl %r8d,%r14d andl %eax,%r12d xorl %eax,%r13d addl 16(%rsp),%edx 
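/*
 * Each round computes (FIPS 180-4):
 *     T1 = h + Sigma1(e) + Ch(e,f,g) + K[t] + W[t]
 *     T2 = Sigma0(a) + Maj(a,b,c)
 * with the working variables rotated, d += T1 and h = T1 + T2.
 * Sigma1(e) = ROTR6 ^ ROTR11 ^ ROTR25 is folded into three dependent
 * rotates, ((e ror 14 ^ e) ror 5 ^ e) ror 6, and likewise
 * Sigma0(a) = ((a ror 9 ^ a) ror 11 ^ a) ror 2; Ch and Maj use the
 * two-XOR forms Ch = ((f^g) & e) ^ g and Maj = b ^ ((a^b) & (b^c)).
 */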
movl %r8d,%r15d xorl %ecx,%r12d rorl $11,%r14d xorl %r9d,%r15d addl %r12d,%edx rorl $6,%r13d andl %r15d,%edi xorl %r8d,%r14d addl %r13d,%edx xorl %r9d,%edi rorl $2,%r14d addl %edx,%r11d addl %edi,%edx movl %r11d,%r13d addl %edx,%r14d rorl $14,%r13d movl %r14d,%edx movl %eax,%r12d rorl $9,%r14d xorl %r11d,%r13d xorl %ebx,%r12d rorl $5,%r13d xorl %edx,%r14d andl %r11d,%r12d xorl %r11d,%r13d addl 20(%rsp),%ecx movl %edx,%edi xorl %ebx,%r12d rorl $11,%r14d xorl %r8d,%edi addl %r12d,%ecx rorl $6,%r13d andl %edi,%r15d xorl %edx,%r14d addl %r13d,%ecx xorl %r8d,%r15d rorl $2,%r14d addl %ecx,%r10d addl %r15d,%ecx movl %r10d,%r13d addl %ecx,%r14d rorl $14,%r13d movl %r14d,%ecx movl %r11d,%r12d rorl $9,%r14d xorl %r10d,%r13d xorl %eax,%r12d rorl $5,%r13d xorl %ecx,%r14d andl %r10d,%r12d xorl %r10d,%r13d addl 24(%rsp),%ebx movl %ecx,%r15d xorl %eax,%r12d rorl $11,%r14d xorl %edx,%r15d addl %r12d,%ebx rorl $6,%r13d andl %r15d,%edi xorl %ecx,%r14d addl %r13d,%ebx xorl %edx,%edi rorl $2,%r14d addl %ebx,%r9d addl %edi,%ebx movl %r9d,%r13d addl %ebx,%r14d rorl $14,%r13d movl %r14d,%ebx movl %r10d,%r12d rorl $9,%r14d xorl %r9d,%r13d xorl %r11d,%r12d rorl $5,%r13d xorl %ebx,%r14d andl %r9d,%r12d xorl %r9d,%r13d addl 28(%rsp),%eax movl %ebx,%edi xorl %r11d,%r12d rorl $11,%r14d xorl %ecx,%edi addl %r12d,%eax rorl $6,%r13d andl %edi,%r15d xorl %ebx,%r14d addl %r13d,%eax xorl %ecx,%r15d rorl $2,%r14d addl %eax,%r8d addl %r15d,%eax movl %r8d,%r13d addl %eax,%r14d rorl $14,%r13d movl %r14d,%eax movl %r9d,%r12d rorl $9,%r14d xorl %r8d,%r13d xorl %r10d,%r12d rorl $5,%r13d xorl %eax,%r14d andl %r8d,%r12d xorl %r8d,%r13d addl 32(%rsp),%r11d movl %eax,%r15d xorl %r10d,%r12d rorl $11,%r14d xorl %ebx,%r15d addl %r12d,%r11d rorl $6,%r13d andl %r15d,%edi xorl %eax,%r14d addl %r13d,%r11d xorl %ebx,%edi rorl $2,%r14d addl %r11d,%edx addl %edi,%r11d movl %edx,%r13d addl %r11d,%r14d rorl $14,%r13d movl %r14d,%r11d movl %r8d,%r12d rorl $9,%r14d xorl %edx,%r13d xorl %r9d,%r12d rorl $5,%r13d xorl %r11d,%r14d andl %edx,%r12d xorl %edx,%r13d addl 36(%rsp),%r10d movl %r11d,%edi xorl %r9d,%r12d rorl $11,%r14d xorl %eax,%edi addl %r12d,%r10d rorl $6,%r13d andl %edi,%r15d xorl %r11d,%r14d addl %r13d,%r10d xorl %eax,%r15d rorl $2,%r14d addl %r10d,%ecx addl %r15d,%r10d movl %ecx,%r13d addl %r10d,%r14d rorl $14,%r13d movl %r14d,%r10d movl %edx,%r12d rorl $9,%r14d xorl %ecx,%r13d xorl %r8d,%r12d rorl $5,%r13d xorl %r10d,%r14d andl %ecx,%r12d xorl %ecx,%r13d addl 40(%rsp),%r9d movl %r10d,%r15d xorl %r8d,%r12d rorl $11,%r14d xorl %r11d,%r15d addl %r12d,%r9d rorl $6,%r13d andl %r15d,%edi xorl %r10d,%r14d addl %r13d,%r9d xorl %r11d,%edi rorl $2,%r14d addl %r9d,%ebx addl %edi,%r9d movl %ebx,%r13d addl %r9d,%r14d rorl $14,%r13d movl %r14d,%r9d movl %ecx,%r12d rorl $9,%r14d xorl %ebx,%r13d xorl %edx,%r12d rorl $5,%r13d xorl %r9d,%r14d andl %ebx,%r12d xorl %ebx,%r13d addl 44(%rsp),%r8d movl %r9d,%edi xorl %edx,%r12d rorl $11,%r14d xorl %r10d,%edi addl %r12d,%r8d rorl $6,%r13d andl %edi,%r15d xorl %r9d,%r14d addl %r13d,%r8d xorl %r10d,%r15d rorl $2,%r14d addl %r8d,%eax addl %r15d,%r8d movl %eax,%r13d addl %r8d,%r14d rorl $14,%r13d movl %r14d,%r8d movl %ebx,%r12d rorl $9,%r14d xorl %eax,%r13d xorl %ecx,%r12d rorl $5,%r13d xorl %r8d,%r14d andl %eax,%r12d xorl %eax,%r13d addl 48(%rsp),%edx movl %r8d,%r15d xorl %ecx,%r12d rorl $11,%r14d xorl %r9d,%r15d addl %r12d,%edx rorl $6,%r13d andl %r15d,%edi xorl %r8d,%r14d addl %r13d,%edx xorl %r9d,%edi rorl $2,%r14d addl %edx,%r11d addl %edi,%edx movl %r11d,%r13d addl %edx,%r14d rorl $14,%r13d movl %r14d,%edx 
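/*
 * From round 48 on no new schedule words are needed; these unrolled
 * rounds simply consume the W[t]+K[t] sums that the vector code left
 * at 0..48(%rsp).
 */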
movl %eax,%r12d rorl $9,%r14d xorl %r11d,%r13d xorl %ebx,%r12d rorl $5,%r13d xorl %edx,%r14d andl %r11d,%r12d xorl %r11d,%r13d addl 52(%rsp),%ecx movl %edx,%edi xorl %ebx,%r12d rorl $11,%r14d xorl %r8d,%edi addl %r12d,%ecx rorl $6,%r13d andl %edi,%r15d xorl %edx,%r14d addl %r13d,%ecx xorl %r8d,%r15d rorl $2,%r14d addl %ecx,%r10d addl %r15d,%ecx movl %r10d,%r13d addl %ecx,%r14d rorl $14,%r13d movl %r14d,%ecx movl %r11d,%r12d rorl $9,%r14d xorl %r10d,%r13d xorl %eax,%r12d rorl $5,%r13d xorl %ecx,%r14d andl %r10d,%r12d xorl %r10d,%r13d addl 56(%rsp),%ebx movl %ecx,%r15d xorl %eax,%r12d rorl $11,%r14d xorl %edx,%r15d addl %r12d,%ebx rorl $6,%r13d andl %r15d,%edi xorl %ecx,%r14d addl %r13d,%ebx xorl %edx,%edi rorl $2,%r14d addl %ebx,%r9d addl %edi,%ebx movl %r9d,%r13d addl %ebx,%r14d rorl $14,%r13d movl %r14d,%ebx movl %r10d,%r12d rorl $9,%r14d xorl %r9d,%r13d xorl %r11d,%r12d rorl $5,%r13d xorl %ebx,%r14d andl %r9d,%r12d xorl %r9d,%r13d addl 60(%rsp),%eax movl %ebx,%edi xorl %r11d,%r12d rorl $11,%r14d xorl %ecx,%edi addl %r12d,%eax rorl $6,%r13d andl %edi,%r15d xorl %ebx,%r14d addl %r13d,%eax xorl %ecx,%r15d rorl $2,%r14d addl %eax,%r8d addl %r15d,%eax movl %r8d,%r13d addl %eax,%r14d movq -64(%rbp),%rdi movl %r14d,%eax movq -56(%rbp),%rsi #ifdef __SGX_LVI_HARDENING__ lfence #endif addl 0(%rdi),%eax addl 4(%rdi),%ebx addl 8(%rdi),%ecx addl 12(%rdi),%edx addl 16(%rdi),%r8d addl 20(%rdi),%r9d addl 24(%rdi),%r10d addl 28(%rdi),%r11d leaq 64(%rsi),%rsi cmpq -48(%rbp),%rsi movl %eax,0(%rdi) movl %ebx,4(%rdi) movl %ecx,8(%rdi) movl %edx,12(%rdi) movl %r8d,16(%rdi) movl %r9d,20(%rdi) movl %r10d,24(%rdi) movl %r11d,28(%rdi) jb .Lloop_ssse3 xorps %xmm0,%xmm0 movaps %xmm0,0(%rsp) movaps %xmm0,16(%rsp) movaps %xmm0,32(%rsp) movaps %xmm0,48(%rsp) movq -40(%rbp),%r15 movq -32(%rbp),%r14 movq -24(%rbp),%r13 movq -16(%rbp),%r12 movq -8(%rbp),%rbx movq %rbp,%rsp .cfi_def_cfa_register %rsp popq %rbp .cfi_adjust_cfa_offset -8 .cfi_restore %rbp .cfi_restore %r12 .cfi_restore %r13 .cfi_restore %r14 .cfi_restore %r15 .cfi_restore %rbx #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .size blst_sha256_block_data_order,.-blst_sha256_block_data_order .globl blst_sha256_emit .hidden blst_sha256_emit .type blst_sha256_emit,@function .align 16 blst_sha256_emit: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa #ifdef __SGX_LVI_HARDENING__ lfence #endif movq 0(%rsi),%r8 movq 8(%rsi),%r9 movq 16(%rsi),%r10 bswapq %r8 movq 24(%rsi),%r11 bswapq %r9 movl %r8d,4(%rdi) bswapq %r10 movl %r9d,12(%rdi) bswapq %r11 movl %r10d,20(%rdi) shrq $32,%r8 movl %r11d,28(%rdi) shrq $32,%r9 movl %r8d,0(%rdi) shrq $32,%r10 movl %r9d,8(%rdi) shrq $32,%r11 movl %r10d,16(%rdi) movl %r11d,24(%rdi) #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .size blst_sha256_emit,.-blst_sha256_emit .globl blst_sha256_bcopy .hidden blst_sha256_bcopy .type blst_sha256_bcopy,@function .align 16 blst_sha256_bcopy: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa #ifdef __SGX_LVI_HARDENING__ lfence #endif subq %rsi,%rdi .Loop_bcopy: movzbl (%rsi),%eax leaq 1(%rsi),%rsi movb %al,-1(%rdi,%rsi,1) decq %rdx jnz .Loop_bcopy #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .size blst_sha256_bcopy,.-blst_sha256_bcopy .globl blst_sha256_hcopy .hidden blst_sha256_hcopy .type blst_sha256_hcopy,@function .align 16 blst_sha256_hcopy: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa #ifdef __SGX_LVI_HARDENING__ lfence #endif movq 0(%rsi),%r8 
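/*
 * blst_sha256_hcopy copies the 256-bit hash state as four 64-bit
 * words.  A C reference sketch (the exact internal prototype is an
 * assumption, not taken from this file):
 *     void blst_sha256_hcopy(unsigned int ret[8],
 *                            const unsigned int inp[8])
 *     {   for (int i = 0; i < 8; i++) ret[i] = inp[i];   }
 */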
movq 8(%rsi),%r9 movq 16(%rsi),%r10 movq 24(%rsi),%r11 movq %r8,0(%rdi) movq %r9,8(%rdi) movq %r10,16(%rdi) movq %r11,24(%rdi) #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .size blst_sha256_hcopy,.-blst_sha256_hcopy .section .note.GNU-stack,"",@progbits #ifndef __SGX_LVI_HARDENING__ .section .note.gnu.property,"a",@note .long 4,2f-1f,5 .byte 0x47,0x4E,0x55,0 1: .long 0xc0000002,4,3 .align 8 2: #endif ================================================ FILE: build/mach-o/add_mod_256-armv8.S ================================================ .text .globl _add_mod_256 .private_extern _add_mod_256 .align 5 _add_mod_256: hint #34 ldp x8,x9,[x1] ldp x12,x13,[x2] ldp x10,x11,[x1,#16] adds x8,x8,x12 ldp x14,x15,[x2,#16] adcs x9,x9,x13 ldp x4,x5,[x3] adcs x10,x10,x14 ldp x6,x7,[x3,#16] adcs x11,x11,x15 adc x3,xzr,xzr subs x16,x8,x4 sbcs x17,x9,x5 sbcs x1,x10,x6 sbcs x2,x11,x7 sbcs xzr,x3,xzr csel x8,x8,x16,lo csel x9,x9,x17,lo csel x10,x10,x1,lo stp x8,x9,[x0] csel x11,x11,x2,lo stp x10,x11,[x0,#16] ret .globl _mul_by_3_mod_256 .private_extern _mul_by_3_mod_256 .align 5 _mul_by_3_mod_256: hint #34 ldp x12,x13,[x1] ldp x14,x15,[x1,#16] adds x8,x12,x12 ldp x4,x5,[x2] adcs x9,x13,x13 ldp x6,x7,[x2,#16] adcs x10,x14,x14 adcs x11,x15,x15 adc x3,xzr,xzr subs x16,x8,x4 sbcs x17,x9,x5 sbcs x1,x10,x6 sbcs x2,x11,x7 sbcs xzr,x3,xzr csel x8,x8,x16,lo csel x9,x9,x17,lo csel x10,x10,x1,lo csel x11,x11,x2,lo adds x8,x8,x12 adcs x9,x9,x13 adcs x10,x10,x14 adcs x11,x11,x15 adc x3,xzr,xzr subs x16,x8,x4 sbcs x17,x9,x5 sbcs x1,x10,x6 sbcs x2,x11,x7 sbcs xzr,x3,xzr csel x8,x8,x16,lo csel x9,x9,x17,lo csel x10,x10,x1,lo stp x8,x9,[x0] csel x11,x11,x2,lo stp x10,x11,[x0,#16] ret .globl _lshift_mod_256 .private_extern _lshift_mod_256 .align 5 _lshift_mod_256: hint #34 ldp x8,x9,[x1] ldp x10,x11,[x1,#16] ldp x4,x5,[x3] ldp x6,x7,[x3,#16] Loop_lshift_mod_256: adds x8,x8,x8 sub x2,x2,#1 adcs x9,x9,x9 adcs x10,x10,x10 adcs x11,x11,x11 adc x3,xzr,xzr subs x12,x8,x4 sbcs x13,x9,x5 sbcs x14,x10,x6 sbcs x15,x11,x7 sbcs xzr,x3,xzr csel x8,x8,x12,lo csel x9,x9,x13,lo csel x10,x10,x14,lo csel x11,x11,x15,lo cbnz x2,Loop_lshift_mod_256 stp x8,x9,[x0] stp x10,x11,[x0,#16] ret .globl _rshift_mod_256 .private_extern _rshift_mod_256 .align 5 _rshift_mod_256: hint #34 ldp x8,x9,[x1] ldp x10,x11,[x1,#16] ldp x4,x5,[x3] ldp x6,x7,[x3,#16] Loop_rshift: adds x12,x8,x4 sub x2,x2,#1 adcs x13,x9,x5 adcs x14,x10,x6 adcs x15,x11,x7 adc x3,xzr,xzr tst x8,#1 csel x12,x12,x8,ne csel x13,x13,x9,ne csel x14,x14,x10,ne csel x15,x15,x11,ne csel x3,x3,xzr,ne extr x8,x13,x12,#1 extr x9,x14,x13,#1 extr x10,x15,x14,#1 extr x11,x3,x15,#1 cbnz x2,Loop_rshift stp x8,x9,[x0] stp x10,x11,[x0,#16] ret .globl _cneg_mod_256 .private_extern _cneg_mod_256 .align 5 _cneg_mod_256: ldp x8,x9,[x1] ldp x4,x5,[x3] ldp x10,x11,[x1,#16] subs x12,x4,x8 ldp x6,x7,[x3,#16] orr x4,x8,x9 sbcs x13,x5,x9 orr x5,x10,x11 sbcs x14,x6,x10 orr x3,x4,x5 sbc x15,x7,x11 cmp x3,#0 csetm x3,ne ands x2,x2,x3 csel x8,x8,x12,eq csel x9,x9,x13,eq csel x10,x10,x14,eq stp x8,x9,[x0] csel x11,x11,x15,eq stp x10,x11,[x0,#16] ret .globl _sub_mod_256 .private_extern _sub_mod_256 .align 5 _sub_mod_256: ldp x8,x9,[x1] ldp x12,x13,[x2] ldp x10,x11,[x1,#16] subs x8,x8,x12 ldp x14,x15,[x2,#16] sbcs x9,x9,x13 ldp x4,x5,[x3] sbcs x10,x10,x14 ldp x6,x7,[x3,#16] sbcs x11,x11,x15 sbc x3,xzr,xzr and x4,x4,x3 and x5,x5,x3 adds x8,x8,x4 and x6,x6,x3 adcs x9,x9,x5 and x7,x7,x3 adcs x10,x10,x6 stp x8,x9,[x0] adc x11,x11,x7 stp x10,x11,[x0,#16] ret .globl 
_check_mod_256 .private_extern _check_mod_256 .align 5 _check_mod_256: ldp x8,x9,[x0] ldp x10,x11,[x0,#16] ldp x4,x5,[x1] ldp x6,x7,[x1,#16] #ifdef __AARCH64EB__ rev x8,x8 rev x9,x9 rev x10,x10 rev x11,x11 #endif subs xzr,x8,x4 sbcs xzr,x9,x5 orr x8,x8,x9 sbcs xzr,x10,x6 orr x8,x8,x10 sbcs xzr,x11,x7 orr x8,x8,x11 sbc x1,xzr,xzr cmp x8,#0 mov x0,#1 csel x0,x0,xzr,ne and x0,x0,x1 ret .globl _add_n_check_mod_256 .private_extern _add_n_check_mod_256 .align 5 _add_n_check_mod_256: ldp x8,x9,[x1] ldp x12,x13,[x2] ldp x10,x11,[x1,#16] ldp x14,x15,[x2,#16] #ifdef __AARCH64EB__ rev x8,x8 rev x12,x12 rev x9,x9 rev x13,x13 rev x10,x10 rev x14,x14 rev x11,x11 rev x15,x15 #endif adds x8,x8,x12 ldp x4,x5,[x3] adcs x9,x9,x13 ldp x6,x7,[x3,#16] adcs x10,x10,x14 adcs x11,x11,x15 adc x3,xzr,xzr subs x16,x8,x4 sbcs x17,x9,x5 sbcs x1,x10,x6 sbcs x2,x11,x7 sbcs xzr,x3,xzr csel x8,x8,x16,lo csel x9,x9,x17,lo csel x10,x10,x1,lo csel x11,x11,x2,lo orr x16, x8, x9 orr x17, x10, x11 orr x16, x16, x17 #ifdef __AARCH64EB__ rev x8,x8 rev x9,x9 rev x10,x10 rev x11,x11 #endif stp x8,x9,[x0] stp x10,x11,[x0,#16] mov x17, #1 cmp x16, #0 csel x0, x17, xzr, ne ret .globl _sub_n_check_mod_256 .private_extern _sub_n_check_mod_256 .align 5 _sub_n_check_mod_256: ldp x8,x9,[x1] ldp x12,x13,[x2] ldp x10,x11,[x1,#16] ldp x14,x15,[x2,#16] #ifdef __AARCH64EB__ rev x8,x8 rev x12,x12 rev x9,x9 rev x13,x13 rev x10,x10 rev x14,x14 rev x11,x11 rev x15,x15 #endif subs x8,x8,x12 sbcs x9,x9,x13 ldp x4,x5,[x3] sbcs x10,x10,x14 ldp x6,x7,[x3,#16] sbcs x11,x11,x15 sbc x3,xzr,xzr and x4,x4,x3 and x5,x5,x3 adds x8,x8,x4 and x6,x6,x3 adcs x9,x9,x5 and x7,x7,x3 adcs x10,x10,x6 adc x11,x11,x7 orr x16, x8, x9 orr x17, x10, x11 orr x16, x16, x17 #ifdef __AARCH64EB__ rev x8,x8 rev x9,x9 rev x10,x10 rev x11,x11 #endif stp x8,x9,[x0] stp x10,x11,[x0,#16] mov x17, #1 cmp x16, #0 csel x0, x17, xzr, ne ret ================================================ FILE: build/mach-o/add_mod_256-x86_64.s ================================================ .text .globl _add_mod_256 .private_extern _add_mod_256 .p2align 5 _add_mod_256: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 pushq %rbx .cfi_adjust_cfa_offset 8 .cfi_offset %rbx,-24 subq $8,%rsp .cfi_adjust_cfa_offset 8 #ifdef __SGX_LVI_HARDENING__ lfence #endif movq 0(%rsi),%r8 movq 8(%rsi),%r9 movq 16(%rsi),%r10 movq 24(%rsi),%r11 L$oaded_a_add_mod_256: addq 0(%rdx),%r8 adcq 8(%rdx),%r9 movq %r8,%rax adcq 16(%rdx),%r10 movq %r9,%rsi adcq 24(%rdx),%r11 sbbq %rdx,%rdx movq %r10,%rbx subq 0(%rcx),%r8 sbbq 8(%rcx),%r9 sbbq 16(%rcx),%r10 movq %r11,%rbp sbbq 24(%rcx),%r11 sbbq $0,%rdx cmovcq %rax,%r8 cmovcq %rsi,%r9 movq %r8,0(%rdi) cmovcq %rbx,%r10 movq %r9,8(%rdi) cmovcq %rbp,%r11 movq %r10,16(%rdi) movq %r11,24(%rdi) movq 8(%rsp),%rbx .cfi_restore %rbx movq 16(%rsp),%rbp .cfi_restore %rbp leaq 24(%rsp),%rsp .cfi_adjust_cfa_offset -24 #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .globl _mul_by_3_mod_256 .private_extern _mul_by_3_mod_256 .p2align 5 _mul_by_3_mod_256: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 pushq %rbx .cfi_adjust_cfa_offset 8 .cfi_offset %rbx,-24 pushq %r12 .cfi_adjust_cfa_offset 8 .cfi_offset %r12,-32 movq %rdx,%rcx #ifdef __SGX_LVI_HARDENING__ lfence #endif movq 0(%rsi),%r8 movq 8(%rsi),%r9 movq 16(%rsi),%r10 movq %rsi,%rdx movq 24(%rsi),%r11 call __lshift_mod_256 movq 0(%rsp),%r12 .cfi_restore %r12 jmp L$oaded_a_add_mod_256 movq 
8(%rsp),%rbx .cfi_restore %rbx movq 16(%rsp),%rbp .cfi_restore %rbp leaq 24(%rsp),%rsp .cfi_adjust_cfa_offset -24 #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .p2align 5 __lshift_mod_256: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa addq %r8,%r8 adcq %r9,%r9 movq %r8,%rax adcq %r10,%r10 movq %r9,%rsi adcq %r11,%r11 sbbq %r12,%r12 movq %r10,%rbx subq 0(%rcx),%r8 sbbq 8(%rcx),%r9 sbbq 16(%rcx),%r10 movq %r11,%rbp sbbq 24(%rcx),%r11 sbbq $0,%r12 cmovcq %rax,%r8 cmovcq %rsi,%r9 cmovcq %rbx,%r10 cmovcq %rbp,%r11 #ifdef __SGX_LVI_HARDENING__ popq %rax lfence jmpq *%rax ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .globl _lshift_mod_256 .private_extern _lshift_mod_256 .p2align 5 _lshift_mod_256: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 pushq %rbx .cfi_adjust_cfa_offset 8 .cfi_offset %rbx,-24 pushq %r12 .cfi_adjust_cfa_offset 8 .cfi_offset %r12,-32 #ifdef __SGX_LVI_HARDENING__ lfence #endif movq 0(%rsi),%r8 movq 8(%rsi),%r9 movq 16(%rsi),%r10 movq 24(%rsi),%r11 L$oop_lshift_mod_256: call __lshift_mod_256 decl %edx jnz L$oop_lshift_mod_256 movq %r8,0(%rdi) movq %r9,8(%rdi) movq %r10,16(%rdi) movq %r11,24(%rdi) movq 0(%rsp),%r12 .cfi_restore %r12 movq 8(%rsp),%rbx .cfi_restore %rbx movq 16(%rsp),%rbp .cfi_restore %rbp leaq 24(%rsp),%rsp .cfi_adjust_cfa_offset -24 #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .globl _rshift_mod_256 .private_extern _rshift_mod_256 .p2align 5 _rshift_mod_256: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 pushq %rbx .cfi_adjust_cfa_offset 8 .cfi_offset %rbx,-24 subq $8,%rsp .cfi_adjust_cfa_offset 8 #ifdef __SGX_LVI_HARDENING__ lfence #endif movq 0(%rsi),%rbp movq 8(%rsi),%r9 movq 16(%rsi),%r10 movq 24(%rsi),%r11 L$oop_rshift_mod_256: movq %rbp,%r8 andq $1,%rbp movq 0(%rcx),%rax negq %rbp movq 8(%rcx),%rsi movq 16(%rcx),%rbx andq %rbp,%rax andq %rbp,%rsi andq %rbp,%rbx andq 24(%rcx),%rbp addq %rax,%r8 adcq %rsi,%r9 adcq %rbx,%r10 adcq %rbp,%r11 sbbq %rax,%rax shrq $1,%r8 movq %r9,%rbp shrq $1,%r9 movq %r10,%rbx shrq $1,%r10 movq %r11,%rsi shrq $1,%r11 shlq $63,%rbp shlq $63,%rbx orq %r8,%rbp shlq $63,%rsi orq %rbx,%r9 shlq $63,%rax orq %rsi,%r10 orq %rax,%r11 decl %edx jnz L$oop_rshift_mod_256 movq %rbp,0(%rdi) movq %r9,8(%rdi) movq %r10,16(%rdi) movq %r11,24(%rdi) movq 8(%rsp),%rbx .cfi_restore %rbx movq 16(%rsp),%rbp .cfi_restore %rbp leaq 24(%rsp),%rsp .cfi_adjust_cfa_offset -24 #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .globl _cneg_mod_256 .private_extern _cneg_mod_256 .p2align 5 _cneg_mod_256: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 pushq %rbx .cfi_adjust_cfa_offset 8 .cfi_offset %rbx,-24 pushq %r12 .cfi_adjust_cfa_offset 8 .cfi_offset %r12,-32 #ifdef __SGX_LVI_HARDENING__ lfence #endif movq 0(%rsi),%r12 movq 8(%rsi),%r9 movq 16(%rsi),%r10 movq %r12,%r8 movq 24(%rsi),%r11 orq %r9,%r12 orq %r10,%r12 orq %r11,%r12 movq $-1,%rbp movq 0(%rcx),%rax cmovnzq %rbp,%r12 movq 8(%rcx),%rsi movq 16(%rcx),%rbx andq %r12,%rax movq 24(%rcx),%rbp andq %r12,%rsi andq %r12,%rbx andq %r12,%rbp subq %r8,%rax sbbq %r9,%rsi sbbq %r10,%rbx sbbq %r11,%rbp orq %rdx,%rdx cmovzq %r8,%rax cmovzq %r9,%rsi movq %rax,0(%rdi) cmovzq %r10,%rbx movq %rsi,8(%rdi) cmovzq %r11,%rbp movq %rbx,16(%rdi) movq %rbp,24(%rdi) movq 0(%rsp),%r12 .cfi_restore %r12 movq 
8(%rsp),%rbx .cfi_restore %rbx movq 16(%rsp),%rbp .cfi_restore %rbp leaq 24(%rsp),%rsp .cfi_adjust_cfa_offset -24 #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .globl _sub_mod_256 .private_extern _sub_mod_256 .p2align 5 _sub_mod_256: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 pushq %rbx .cfi_adjust_cfa_offset 8 .cfi_offset %rbx,-24 subq $8,%rsp .cfi_adjust_cfa_offset 8 #ifdef __SGX_LVI_HARDENING__ lfence #endif movq 0(%rsi),%r8 movq 8(%rsi),%r9 movq 16(%rsi),%r10 movq 24(%rsi),%r11 subq 0(%rdx),%r8 movq 0(%rcx),%rax sbbq 8(%rdx),%r9 movq 8(%rcx),%rsi sbbq 16(%rdx),%r10 movq 16(%rcx),%rbx sbbq 24(%rdx),%r11 movq 24(%rcx),%rbp sbbq %rdx,%rdx andq %rdx,%rax andq %rdx,%rsi andq %rdx,%rbx andq %rdx,%rbp addq %rax,%r8 adcq %rsi,%r9 movq %r8,0(%rdi) adcq %rbx,%r10 movq %r9,8(%rdi) adcq %rbp,%r11 movq %r10,16(%rdi) movq %r11,24(%rdi) movq 8(%rsp),%rbx .cfi_restore %rbx movq 16(%rsp),%rbp .cfi_restore %rbp leaq 24(%rsp),%rsp .cfi_adjust_cfa_offset -24 #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .globl _check_mod_256 .private_extern _check_mod_256 .p2align 5 _check_mod_256: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa #ifdef __SGX_LVI_HARDENING__ lfence #endif movq 0(%rdi),%rax movq 8(%rdi),%r9 movq 16(%rdi),%r10 movq 24(%rdi),%r11 movq %rax,%r8 orq %r9,%rax orq %r10,%rax orq %r11,%rax subq 0(%rsi),%r8 sbbq 8(%rsi),%r9 sbbq 16(%rsi),%r10 sbbq 24(%rsi),%r11 sbbq %rsi,%rsi movq $1,%rdx cmpq $0,%rax cmovneq %rdx,%rax andq %rsi,%rax #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .globl _add_n_check_mod_256 .private_extern _add_n_check_mod_256 .p2align 5 _add_n_check_mod_256: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 pushq %rbx .cfi_adjust_cfa_offset 8 .cfi_offset %rbx,-24 subq $8,%rsp .cfi_adjust_cfa_offset 8 #ifdef __SGX_LVI_HARDENING__ lfence #endif movq 0(%rsi),%r8 movq 8(%rsi),%r9 movq 16(%rsi),%r10 movq 24(%rsi),%r11 addq 0(%rdx),%r8 adcq 8(%rdx),%r9 movq %r8,%rax adcq 16(%rdx),%r10 movq %r9,%rsi adcq 24(%rdx),%r11 sbbq %rdx,%rdx movq %r10,%rbx subq 0(%rcx),%r8 sbbq 8(%rcx),%r9 sbbq 16(%rcx),%r10 movq %r11,%rbp sbbq 24(%rcx),%r11 sbbq $0,%rdx cmovcq %rax,%r8 cmovcq %rsi,%r9 movq %r8,0(%rdi) cmovcq %rbx,%r10 movq %r9,8(%rdi) cmovcq %rbp,%r11 movq %r10,16(%rdi) movq %r11,24(%rdi) orq %r9,%r8 orq %r11,%r10 orq %r10,%r8 movq $1,%rax cmovzq %r8,%rax movq 8(%rsp),%rbx .cfi_restore %rbx movq 16(%rsp),%rbp .cfi_restore %rbp leaq 24(%rsp),%rsp .cfi_adjust_cfa_offset -24 #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .globl _sub_n_check_mod_256 .private_extern _sub_n_check_mod_256 .p2align 5 _sub_n_check_mod_256: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 pushq %rbx .cfi_adjust_cfa_offset 8 .cfi_offset %rbx,-24 subq $8,%rsp .cfi_adjust_cfa_offset 8 #ifdef __SGX_LVI_HARDENING__ lfence #endif movq 0(%rsi),%r8 movq 8(%rsi),%r9 movq 16(%rsi),%r10 movq 24(%rsi),%r11 subq 0(%rdx),%r8 movq 0(%rcx),%rax sbbq 8(%rdx),%r9 movq 8(%rcx),%rsi sbbq 16(%rdx),%r10 movq 16(%rcx),%rbx sbbq 24(%rdx),%r11 movq 24(%rcx),%rbp sbbq %rdx,%rdx andq %rdx,%rax andq %rdx,%rsi andq %rdx,%rbx andq %rdx,%rbp addq %rax,%r8 adcq %rsi,%r9 movq %r8,0(%rdi) adcq %rbx,%r10 movq %r9,8(%rdi) adcq %rbp,%r11 movq %r10,16(%rdi) movq %r11,24(%rdi) orq %r9,%r8 orq 
%r11,%r10 orq %r10,%r8 movq $1,%rax cmovzq %r8,%rax movq 8(%rsp),%rbx .cfi_restore %rbx movq 16(%rsp),%rbp .cfi_restore %rbp leaq 24(%rsp),%rsp .cfi_adjust_cfa_offset -24 #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc ================================================ FILE: build/mach-o/add_mod_384-armv8.S ================================================ .text .globl _add_mod_384 .private_extern _add_mod_384 .align 5 _add_mod_384: hint #25 stp x29,x30,[sp,#-6*__SIZEOF_POINTER__]! add x29,sp,#0 stp x19,x20,[sp,#2*__SIZEOF_POINTER__] stp x21,x22,[sp,#4*__SIZEOF_POINTER__] ldp x4,x5,[x3] ldp x6,x7,[x3,#16] ldp x8,x9,[x3,#32] bl __add_mod_384 ldr x30,[sp,#__SIZEOF_POINTER__] stp x10,x11,[x0] stp x12,x13,[x0,#16] stp x14,x15,[x0,#32] ldp x19,x20,[x29,#2*__SIZEOF_POINTER__] ldp x21,x22,[x29,#4*__SIZEOF_POINTER__] ldr x29,[sp],#6*__SIZEOF_POINTER__ hint #29 ret .align 5 __add_mod_384: ldp x10,x11,[x1] ldp x16,x17,[x2] ldp x12,x13,[x1,#16] ldp x19,x20,[x2,#16] ldp x14,x15,[x1,#32] ldp x21,x22,[x2,#32] __add_mod_384_ab_are_loaded: adds x10,x10,x16 adcs x11,x11,x17 adcs x12,x12,x19 adcs x13,x13,x20 adcs x14,x14,x21 adcs x15,x15,x22 adc x3,xzr,xzr subs x16,x10,x4 sbcs x17,x11,x5 sbcs x19,x12,x6 sbcs x20,x13,x7 sbcs x21,x14,x8 sbcs x22,x15,x9 sbcs xzr,x3,xzr csel x10,x10,x16,lo csel x11,x11,x17,lo csel x12,x12,x19,lo csel x13,x13,x20,lo csel x14,x14,x21,lo csel x15,x15,x22,lo ret .globl _add_mod_384x .private_extern _add_mod_384x .align 5 _add_mod_384x: hint #25 stp x29,x30,[sp,#-6*__SIZEOF_POINTER__]! add x29,sp,#0 stp x19,x20,[sp,#2*__SIZEOF_POINTER__] stp x21,x22,[sp,#4*__SIZEOF_POINTER__] ldp x4,x5,[x3] ldp x6,x7,[x3,#16] ldp x8,x9,[x3,#32] bl __add_mod_384 stp x10,x11,[x0] add x1,x1,#48 stp x12,x13,[x0,#16] add x2,x2,#48 stp x14,x15,[x0,#32] bl __add_mod_384 ldr x30,[sp,#__SIZEOF_POINTER__] stp x10,x11,[x0,#48] stp x12,x13,[x0,#64] stp x14,x15,[x0,#80] ldp x19,x20,[x29,#2*__SIZEOF_POINTER__] ldp x21,x22,[x29,#4*__SIZEOF_POINTER__] ldr x29,[sp],#6*__SIZEOF_POINTER__ hint #29 ret .globl _rshift_mod_384 .private_extern _rshift_mod_384 .align 5 _rshift_mod_384: hint #25 stp x29,x30,[sp,#-6*__SIZEOF_POINTER__]! add x29,sp,#0 stp x19,x20,[sp,#2*__SIZEOF_POINTER__] stp x21,x22,[sp,#4*__SIZEOF_POINTER__] ldp x10,x11,[x1] ldp x12,x13,[x1,#16] ldp x14,x15,[x1,#32] ldp x4,x5,[x3] ldp x6,x7,[x3,#16] ldp x8,x9,[x3,#32] Loop_rshift_mod_384: sub x2,x2,#1 bl __rshift_mod_384 cbnz x2,Loop_rshift_mod_384 ldr x30,[sp,#__SIZEOF_POINTER__] stp x10,x11,[x0] stp x12,x13,[x0,#16] stp x14,x15,[x0,#32] ldp x19,x20,[x29,#2*__SIZEOF_POINTER__] ldp x21,x22,[x29,#4*__SIZEOF_POINTER__] ldr x29,[sp],#6*__SIZEOF_POINTER__ hint #29 ret .align 5 __rshift_mod_384: sbfx x22,x10,#0,#1 and x16,x22,x4 and x17,x22,x5 adds x10,x10,x16 and x19,x22,x6 adcs x11,x11,x17 and x20,x22,x7 adcs x12,x12,x19 and x21,x22,x8 adcs x13,x13,x20 and x22,x22,x9 adcs x14,x14,x21 extr x10,x11,x10,#1 // a[0:5] >>= 1 adcs x15,x15,x22 extr x11,x12,x11,#1 adc x22,xzr,xzr extr x12,x13,x12,#1 extr x13,x14,x13,#1 extr x14,x15,x14,#1 extr x15,x22,x15,#1 ret .globl _div_by_2_mod_384 .private_extern _div_by_2_mod_384 .align 5 _div_by_2_mod_384: hint #25 stp x29,x30,[sp,#-6*__SIZEOF_POINTER__]! 
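// div_by_2_mod_384 reuses __rshift_mod_384, whose trick is visible
// above: sbfx extends bit 0 of the low limb into an all-ones/all-zero
// mask, the modulus ANDed with that mask is added in, and the now-even
// sum is shifted right -- i.e. ret = (a + (a odd ? p : 0)) >> 1,
// with no data-dependent branch.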
add x29,sp,#0
stp x19,x20,[sp,#2*__SIZEOF_POINTER__]
stp x21,x22,[sp,#4*__SIZEOF_POINTER__]
ldp x10,x11,[x1]
ldp x12,x13,[x1,#16]
ldp x14,x15,[x1,#32]
ldp x4,x5,[x2]
ldp x6,x7,[x2,#16]
ldp x8,x9,[x2,#32]
bl __rshift_mod_384
ldr x30,[sp,#__SIZEOF_POINTER__]
stp x10,x11,[x0]
stp x12,x13,[x0,#16]
stp x14,x15,[x0,#32]
ldp x19,x20,[x29,#2*__SIZEOF_POINTER__]
ldp x21,x22,[x29,#4*__SIZEOF_POINTER__]
ldr x29,[sp],#6*__SIZEOF_POINTER__
hint #29
ret
.globl _lshift_mod_384
.private_extern _lshift_mod_384
.align 5
_lshift_mod_384:
hint #25
stp x29,x30,[sp,#-6*__SIZEOF_POINTER__]!
add x29,sp,#0
stp x19,x20,[sp,#2*__SIZEOF_POINTER__]
stp x21,x22,[sp,#4*__SIZEOF_POINTER__]
ldp x10,x11,[x1]
ldp x12,x13,[x1,#16]
ldp x14,x15,[x1,#32]
ldp x4,x5,[x3]
ldp x6,x7,[x3,#16]
ldp x8,x9,[x3,#32]
Loop_lshift_mod_384:
sub x2,x2,#1
bl __lshift_mod_384
cbnz x2,Loop_lshift_mod_384
ldr x30,[sp,#__SIZEOF_POINTER__]
stp x10,x11,[x0]
stp x12,x13,[x0,#16]
stp x14,x15,[x0,#32]
ldp x19,x20,[x29,#2*__SIZEOF_POINTER__]
ldp x21,x22,[x29,#4*__SIZEOF_POINTER__]
ldr x29,[sp],#6*__SIZEOF_POINTER__
hint #29
ret
.align 5
__lshift_mod_384:
adds x10,x10,x10
adcs x11,x11,x11
adcs x12,x12,x12
adcs x13,x13,x13
adcs x14,x14,x14
adcs x15,x15,x15
adc x3,xzr,xzr
subs x16,x10,x4
sbcs x17,x11,x5
sbcs x19,x12,x6
sbcs x20,x13,x7
sbcs x21,x14,x8
sbcs x22,x15,x9
sbcs xzr,x3,xzr
csel x10,x10,x16,lo
csel x11,x11,x17,lo
csel x12,x12,x19,lo
csel x13,x13,x20,lo
csel x14,x14,x21,lo
csel x15,x15,x22,lo
ret
.globl _mul_by_3_mod_384
.private_extern _mul_by_3_mod_384
.align 5
_mul_by_3_mod_384:
hint #25
stp x29,x30,[sp,#-6*__SIZEOF_POINTER__]!
add x29,sp,#0
stp x19,x20,[sp,#2*__SIZEOF_POINTER__]
stp x21,x22,[sp,#4*__SIZEOF_POINTER__]
ldp x10,x11,[x1]
ldp x12,x13,[x1,#16]
ldp x14,x15,[x1,#32]
ldp x4,x5,[x2]
ldp x6,x7,[x2,#16]
ldp x8,x9,[x2,#32]
bl __lshift_mod_384
ldp x16,x17,[x1]
ldp x19,x20,[x1,#16]
ldp x21,x22,[x1,#32]
bl __add_mod_384_ab_are_loaded
ldr x30,[sp,#__SIZEOF_POINTER__]
stp x10,x11,[x0]
stp x12,x13,[x0,#16]
stp x14,x15,[x0,#32]
ldp x19,x20,[x29,#2*__SIZEOF_POINTER__]
ldp x21,x22,[x29,#4*__SIZEOF_POINTER__]
ldr x29,[sp],#6*__SIZEOF_POINTER__
hint #29
ret
.globl _mul_by_8_mod_384
.private_extern _mul_by_8_mod_384
.align 5
_mul_by_8_mod_384:
hint #25
stp x29,x30,[sp,#-6*__SIZEOF_POINTER__]!
add x29,sp,#0
stp x19,x20,[sp,#2*__SIZEOF_POINTER__]
stp x21,x22,[sp,#4*__SIZEOF_POINTER__]
ldp x10,x11,[x1]
ldp x12,x13,[x1,#16]
ldp x14,x15,[x1,#32]
ldp x4,x5,[x2]
ldp x6,x7,[x2,#16]
ldp x8,x9,[x2,#32]
bl __lshift_mod_384
bl __lshift_mod_384
bl __lshift_mod_384
ldr x30,[sp,#__SIZEOF_POINTER__]
stp x10,x11,[x0]
stp x12,x13,[x0,#16]
stp x14,x15,[x0,#32]
ldp x19,x20,[x29,#2*__SIZEOF_POINTER__]
ldp x21,x22,[x29,#4*__SIZEOF_POINTER__]
ldr x29,[sp],#6*__SIZEOF_POINTER__
hint #29
ret
.globl _mul_by_3_mod_384x
.private_extern _mul_by_3_mod_384x
.align 5
_mul_by_3_mod_384x:
hint #25
stp x29,x30,[sp,#-6*__SIZEOF_POINTER__]!
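// The multiply-by-small-constant entry points are composed from the
// two primitives above: mul_by_3 is one modular doubling
// (__lshift_mod_384) followed by a modular add of the original input,
// i.e. 3*a = 2*a + a (mod p), and mul_by_8 is three doublings.  The
// _384x variants below repeat the same sequence for the real and
// imaginary halves at offsets #0 and #48.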
add x29,sp,#0
stp x19,x20,[sp,#2*__SIZEOF_POINTER__]
stp x21,x22,[sp,#4*__SIZEOF_POINTER__]
ldp x10,x11,[x1]
ldp x12,x13,[x1,#16]
ldp x14,x15,[x1,#32]
ldp x4,x5,[x2]
ldp x6,x7,[x2,#16]
ldp x8,x9,[x2,#32]
bl __lshift_mod_384
ldp x16,x17,[x1]
ldp x19,x20,[x1,#16]
ldp x21,x22,[x1,#32]
bl __add_mod_384_ab_are_loaded
stp x10,x11,[x0]
ldp x10,x11,[x1,#48]
stp x12,x13,[x0,#16]
ldp x12,x13,[x1,#64]
stp x14,x15,[x0,#32]
ldp x14,x15,[x1,#80]
bl __lshift_mod_384
ldp x16,x17,[x1,#48]
ldp x19,x20,[x1,#64]
ldp x21,x22,[x1,#80]
bl __add_mod_384_ab_are_loaded
ldr x30,[sp,#__SIZEOF_POINTER__]
stp x10,x11,[x0,#48]
stp x12,x13,[x0,#64]
stp x14,x15,[x0,#80]
ldp x19,x20,[x29,#2*__SIZEOF_POINTER__]
ldp x21,x22,[x29,#4*__SIZEOF_POINTER__]
ldr x29,[sp],#6*__SIZEOF_POINTER__
hint #29
ret
.globl _mul_by_8_mod_384x
.private_extern _mul_by_8_mod_384x
.align 5
_mul_by_8_mod_384x:
hint #25
stp x29,x30,[sp,#-6*__SIZEOF_POINTER__]!
add x29,sp,#0
stp x19,x20,[sp,#2*__SIZEOF_POINTER__]
stp x21,x22,[sp,#4*__SIZEOF_POINTER__]
ldp x10,x11,[x1]
ldp x12,x13,[x1,#16]
ldp x14,x15,[x1,#32]
ldp x4,x5,[x2]
ldp x6,x7,[x2,#16]
ldp x8,x9,[x2,#32]
bl __lshift_mod_384
bl __lshift_mod_384
bl __lshift_mod_384
stp x10,x11,[x0]
ldp x10,x11,[x1,#48]
stp x12,x13,[x0,#16]
ldp x12,x13,[x1,#64]
stp x14,x15,[x0,#32]
ldp x14,x15,[x1,#80]
bl __lshift_mod_384
bl __lshift_mod_384
bl __lshift_mod_384
ldr x30,[sp,#__SIZEOF_POINTER__]
stp x10,x11,[x0,#48]
stp x12,x13,[x0,#64]
stp x14,x15,[x0,#80]
ldp x19,x20,[x29,#2*__SIZEOF_POINTER__]
ldp x21,x22,[x29,#4*__SIZEOF_POINTER__]
ldr x29,[sp],#6*__SIZEOF_POINTER__
hint #29
ret
.globl _cneg_mod_384
.private_extern _cneg_mod_384
.align 5
_cneg_mod_384:
hint #25
stp x29,x30,[sp,#-6*__SIZEOF_POINTER__]!
add x29,sp,#0
stp x19,x20,[sp,#2*__SIZEOF_POINTER__]
stp x21,x22,[sp,#4*__SIZEOF_POINTER__]
ldp x10,x11,[x1]
ldp x4,x5,[x3]
ldp x12,x13,[x1,#16]
ldp x6,x7,[x3,#16]
subs x16,x4,x10
ldp x14,x15,[x1,#32]
ldp x8,x9,[x3,#32]
orr x3,x10,x11
sbcs x17,x5,x11
orr x3,x3,x12
sbcs x19,x6,x12
orr x3,x3,x13
sbcs x20,x7,x13
orr x3,x3,x14
sbcs x21,x8,x14
orr x3,x3,x15
sbc x22,x9,x15
cmp x3,#0
csetm x3,ne
ands x2,x2,x3
csel x10,x10,x16,eq
csel x11,x11,x17,eq
csel x12,x12,x19,eq
csel x13,x13,x20,eq
stp x10,x11,[x0]
csel x14,x14,x21,eq
stp x12,x13,[x0,#16]
csel x15,x15,x22,eq
stp x14,x15,[x0,#32]
ldp x19,x20,[x29,#2*__SIZEOF_POINTER__]
ldp x21,x22,[x29,#4*__SIZEOF_POINTER__]
ldr x29,[sp],#6*__SIZEOF_POINTER__
hint #29
ret
.globl _sub_mod_384
.private_extern _sub_mod_384
.align 5
_sub_mod_384:
hint #25
stp x29,x30,[sp,#-6*__SIZEOF_POINTER__]!
add x29,sp,#0
stp x19,x20,[sp,#2*__SIZEOF_POINTER__]
stp x21,x22,[sp,#4*__SIZEOF_POINTER__]
ldp x4,x5,[x3]
ldp x6,x7,[x3,#16]
ldp x8,x9,[x3,#32]
bl __sub_mod_384
ldr x30,[sp,#__SIZEOF_POINTER__]
stp x10,x11,[x0]
stp x12,x13,[x0,#16]
stp x14,x15,[x0,#32]
ldp x19,x20,[x29,#2*__SIZEOF_POINTER__]
ldp x21,x22,[x29,#4*__SIZEOF_POINTER__]
ldr x29,[sp],#6*__SIZEOF_POINTER__
hint #29
ret
.align 5
__sub_mod_384:
ldp x10,x11,[x1]
ldp x16,x17,[x2]
ldp x12,x13,[x1,#16]
ldp x19,x20,[x2,#16]
ldp x14,x15,[x1,#32]
ldp x21,x22,[x2,#32]
subs x10,x10,x16
sbcs x11,x11,x17
sbcs x12,x12,x19
sbcs x13,x13,x20
sbcs x14,x14,x21
sbcs x15,x15,x22
sbc x3,xzr,xzr
and x16,x4,x3
and x17,x5,x3
adds x10,x10,x16
and x19,x6,x3
adcs x11,x11,x17
and x20,x7,x3
adcs x12,x12,x19
and x21,x8,x3
adcs x13,x13,x20
and x22,x9,x3
adcs x14,x14,x21
adc x15,x15,x22
ret
.globl _sub_mod_384x
.private_extern _sub_mod_384x
.align 5
_sub_mod_384x:
hint #25
stp x29,x30,[sp,#-6*__SIZEOF_POINTER__]!
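// __sub_mod_384 above shows the constant-time correction idiom used
// throughout: the final "sbc x3,xzr,xzr" turns the borrow into a
// 0/all-ones mask, and p AND mask is added back, so the result is
// a-b or a-b+p without a branch.  In C terms (a sketch):
//     mask = borrow ? ~(limb_t)0 : 0;  r[i] += p[i] & mask;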
add x29,sp,#0 stp x19,x20,[sp,#2*__SIZEOF_POINTER__] stp x21,x22,[sp,#4*__SIZEOF_POINTER__] ldp x4,x5,[x3] ldp x6,x7,[x3,#16] ldp x8,x9,[x3,#32] bl __sub_mod_384 stp x10,x11,[x0] add x1,x1,#48 stp x12,x13,[x0,#16] add x2,x2,#48 stp x14,x15,[x0,#32] bl __sub_mod_384 ldr x30,[sp,#__SIZEOF_POINTER__] stp x10,x11,[x0,#48] stp x12,x13,[x0,#64] stp x14,x15,[x0,#80] ldp x19,x20,[x29,#2*__SIZEOF_POINTER__] ldp x21,x22,[x29,#4*__SIZEOF_POINTER__] ldr x29,[sp],#6*__SIZEOF_POINTER__ hint #29 ret .globl _mul_by_1_plus_i_mod_384x .private_extern _mul_by_1_plus_i_mod_384x .align 5 _mul_by_1_plus_i_mod_384x: hint #25 stp x29,x30,[sp,#-6*__SIZEOF_POINTER__]! add x29,sp,#0 stp x19,x20,[sp,#2*__SIZEOF_POINTER__] stp x21,x22,[sp,#4*__SIZEOF_POINTER__] ldp x4,x5,[x2] ldp x6,x7,[x2,#16] ldp x8,x9,[x2,#32] add x2,x1,#48 bl __sub_mod_384 // a->re - a->im ldp x16,x17,[x1] ldp x19,x20,[x1,#16] ldp x21,x22,[x1,#32] stp x10,x11,[x0] ldp x10,x11,[x1,#48] stp x12,x13,[x0,#16] ldp x12,x13,[x1,#64] stp x14,x15,[x0,#32] ldp x14,x15,[x1,#80] bl __add_mod_384_ab_are_loaded // a->re + a->im ldr x30,[sp,#__SIZEOF_POINTER__] stp x10,x11,[x0,#48] stp x12,x13,[x0,#64] stp x14,x15,[x0,#80] ldp x19,x20,[x29,#2*__SIZEOF_POINTER__] ldp x21,x22,[x29,#4*__SIZEOF_POINTER__] ldr x29,[sp],#6*__SIZEOF_POINTER__ hint #29 ret .globl _sgn0_pty_mod_384 .private_extern _sgn0_pty_mod_384 .align 5 _sgn0_pty_mod_384: hint #34 ldp x10,x11,[x0] ldp x12,x13,[x0,#16] ldp x14,x15,[x0,#32] ldp x4,x5,[x1] ldp x6,x7,[x1,#16] ldp x8,x9,[x1,#32] and x0,x10,#1 adds x10,x10,x10 adcs x11,x11,x11 adcs x12,x12,x12 adcs x13,x13,x13 adcs x14,x14,x14 adcs x15,x15,x15 adc x3,xzr,xzr subs x10,x10,x4 sbcs x11,x11,x5 sbcs x12,x12,x6 sbcs x13,x13,x7 sbcs x14,x14,x8 sbcs x15,x15,x9 sbc x3,x3,xzr mvn x3,x3 and x3,x3,#2 orr x0,x0,x3 ret .globl _sgn0_pty_mod_384x .private_extern _sgn0_pty_mod_384x .align 5 _sgn0_pty_mod_384x: hint #34 ldp x10,x11,[x0] ldp x12,x13,[x0,#16] ldp x14,x15,[x0,#32] ldp x4,x5,[x1] ldp x6,x7,[x1,#16] ldp x8,x9,[x1,#32] and x2,x10,#1 orr x3,x10,x11 adds x10,x10,x10 orr x3,x3,x12 adcs x11,x11,x11 orr x3,x3,x13 adcs x12,x12,x12 orr x3,x3,x14 adcs x13,x13,x13 orr x3,x3,x15 adcs x14,x14,x14 adcs x15,x15,x15 adc x16,xzr,xzr subs x10,x10,x4 sbcs x11,x11,x5 sbcs x12,x12,x6 sbcs x13,x13,x7 sbcs x14,x14,x8 sbcs x15,x15,x9 sbc x16,x16,xzr ldp x10,x11,[x0,#48] ldp x12,x13,[x0,#64] ldp x14,x15,[x0,#80] mvn x16,x16 and x16,x16,#2 orr x2,x2,x16 and x0,x10,#1 orr x1,x10,x11 adds x10,x10,x10 orr x1,x1,x12 adcs x11,x11,x11 orr x1,x1,x13 adcs x12,x12,x12 orr x1,x1,x14 adcs x13,x13,x13 orr x1,x1,x15 adcs x14,x14,x14 adcs x15,x15,x15 adc x16,xzr,xzr subs x10,x10,x4 sbcs x11,x11,x5 sbcs x12,x12,x6 sbcs x13,x13,x7 sbcs x14,x14,x8 sbcs x15,x15,x9 sbc x16,x16,xzr mvn x16,x16 and x16,x16,#2 orr x0,x0,x16 cmp x3,#0 csel x3,x0,x2,eq // a->re==0? prty(a->im) : prty(a->re) cmp x1,#0 csel x1,x0,x2,ne // a->im!=0? 
sgn0(a->im) : sgn0(a->re) and x3,x3,#1 and x1,x1,#2 orr x0,x1,x3 // pack sign and parity ret .globl _vec_select_32 .private_extern _vec_select_32 .align 5 _vec_select_32: hint #34 dup v6.2d, x3 ld1 {v0.2d, v1.2d}, [x1] cmeq v6.2d, v6.2d, #0 ld1 {v3.2d, v4.2d}, [x2] bit v0.16b, v3.16b, v6.16b bit v1.16b, v4.16b, v6.16b st1 {v0.2d, v1.2d}, [x0] ret .globl _vec_select_48 .private_extern _vec_select_48 .align 5 _vec_select_48: hint #34 dup v6.2d, x3 ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 cmeq v6.2d, v6.2d, #0 ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 bit v0.16b, v3.16b, v6.16b bit v1.16b, v4.16b, v6.16b bit v2.16b, v5.16b, v6.16b st1 {v0.2d, v1.2d, v2.2d}, [x0] ret .globl _vec_select_96 .private_extern _vec_select_96 .align 5 _vec_select_96: hint #34 dup v6.2d, x3 ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 cmeq v6.2d, v6.2d, #0 ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 bit v0.16b, v3.16b, v6.16b ld1 {v16.2d, v17.2d, v18.2d}, [x1],#48 bit v1.16b, v4.16b, v6.16b ld1 {v19.2d, v20.2d, v21.2d}, [x2],#48 bit v2.16b, v5.16b, v6.16b st1 {v0.2d, v1.2d, v2.2d}, [x0],#48 bit v16.16b, v19.16b, v6.16b bit v17.16b, v20.16b, v6.16b bit v18.16b, v21.16b, v6.16b st1 {v16.2d, v17.2d, v18.2d}, [x0] ret .globl _vec_select_192 .private_extern _vec_select_192 .align 5 _vec_select_192: hint #34 dup v6.2d, x3 ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 cmeq v6.2d, v6.2d, #0 ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 bit v0.16b, v3.16b, v6.16b ld1 {v16.2d, v17.2d, v18.2d}, [x1],#48 bit v1.16b, v4.16b, v6.16b ld1 {v19.2d, v20.2d, v21.2d}, [x2],#48 bit v2.16b, v5.16b, v6.16b st1 {v0.2d, v1.2d, v2.2d}, [x0],#48 bit v16.16b, v19.16b, v6.16b ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 bit v17.16b, v20.16b, v6.16b ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 bit v18.16b, v21.16b, v6.16b st1 {v16.2d, v17.2d, v18.2d}, [x0],#48 bit v0.16b, v3.16b, v6.16b ld1 {v16.2d, v17.2d, v18.2d}, [x1],#48 bit v1.16b, v4.16b, v6.16b ld1 {v19.2d, v20.2d, v21.2d}, [x2],#48 bit v2.16b, v5.16b, v6.16b st1 {v0.2d, v1.2d, v2.2d}, [x0],#48 bit v16.16b, v19.16b, v6.16b bit v17.16b, v20.16b, v6.16b bit v18.16b, v21.16b, v6.16b st1 {v16.2d, v17.2d, v18.2d}, [x0] ret .globl _vec_select_144 .private_extern _vec_select_144 .align 5 _vec_select_144: hint #34 dup v6.2d, x3 ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 cmeq v6.2d, v6.2d, #0 ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 bit v0.16b, v3.16b, v6.16b ld1 {v16.2d, v17.2d, v18.2d}, [x1],#48 bit v1.16b, v4.16b, v6.16b ld1 {v19.2d, v20.2d, v21.2d}, [x2],#48 bit v2.16b, v5.16b, v6.16b st1 {v0.2d, v1.2d, v2.2d}, [x0],#48 bit v16.16b, v19.16b, v6.16b ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 bit v17.16b, v20.16b, v6.16b ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 bit v18.16b, v21.16b, v6.16b st1 {v16.2d, v17.2d, v18.2d}, [x0],#48 bit v0.16b, v3.16b, v6.16b bit v1.16b, v4.16b, v6.16b bit v2.16b, v5.16b, v6.16b st1 {v0.2d, v1.2d, v2.2d}, [x0] ret .globl _vec_select_288 .private_extern _vec_select_288 .align 5 _vec_select_288: hint #34 dup v6.2d, x3 ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 cmeq v6.2d, v6.2d, #0 ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 bit v0.16b, v3.16b, v6.16b ld1 {v16.2d, v17.2d, v18.2d}, [x1],#48 bit v1.16b, v4.16b, v6.16b ld1 {v19.2d, v20.2d, v21.2d}, [x2],#48 bit v2.16b, v5.16b, v6.16b st1 {v0.2d, v1.2d, v2.2d}, [x0],#48 bit v16.16b, v19.16b, v6.16b ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 bit v17.16b, v20.16b, v6.16b ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 bit v18.16b, v21.16b, v6.16b st1 {v16.2d, v17.2d, v18.2d}, [x0],#48 bit v0.16b, v3.16b, v6.16b ld1 {v16.2d, v17.2d, v18.2d}, [x1],#48 bit v1.16b, v4.16b, v6.16b ld1 {v19.2d, v20.2d, v21.2d}, [x2],#48 bit v2.16b, v5.16b, v6.16b st1 
{v0.2d, v1.2d, v2.2d}, [x0],#48 bit v16.16b, v19.16b, v6.16b ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 bit v17.16b, v20.16b, v6.16b ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 bit v18.16b, v21.16b, v6.16b st1 {v16.2d, v17.2d, v18.2d}, [x0],#48 bit v0.16b, v3.16b, v6.16b ld1 {v16.2d, v17.2d, v18.2d}, [x1],#48 bit v1.16b, v4.16b, v6.16b ld1 {v19.2d, v20.2d, v21.2d}, [x2],#48 bit v2.16b, v5.16b, v6.16b st1 {v0.2d, v1.2d, v2.2d}, [x0],#48 bit v16.16b, v19.16b, v6.16b bit v17.16b, v20.16b, v6.16b bit v18.16b, v21.16b, v6.16b st1 {v16.2d, v17.2d, v18.2d}, [x0] ret .globl _vec_prefetch .private_extern _vec_prefetch .align 5 _vec_prefetch: hint #34 add x1, x1, x0 sub x1, x1, #1 mov x2, #64 prfm pldl1keep, [x0] add x0, x0, x2 cmp x0, x1 csel x0, x1, x0, hi csel x2, xzr, x2, hi prfm pldl1keep, [x0] add x0, x0, x2 cmp x0, x1 csel x0, x1, x0, hi csel x2, xzr, x2, hi prfm pldl1keep, [x0] add x0, x0, x2 cmp x0, x1 csel x0, x1, x0, hi csel x2, xzr, x2, hi prfm pldl1keep, [x0] add x0, x0, x2 cmp x0, x1 csel x0, x1, x0, hi csel x2, xzr, x2, hi prfm pldl1keep, [x0] add x0, x0, x2 cmp x0, x1 csel x0, x1, x0, hi csel x2, xzr, x2, hi prfm pldl1keep, [x0] add x0, x0, x2 cmp x0, x1 csel x0, x1, x0, hi prfm pldl1keep, [x0] ret .globl _vec_is_zero_16x .private_extern _vec_is_zero_16x .align 5 _vec_is_zero_16x: hint #34 ld1 {v0.2d}, [x0], #16 lsr x1, x1, #4 sub x1, x1, #1 cbz x1, Loop_is_zero_done Loop_is_zero: ld1 {v1.2d}, [x0], #16 orr v0.16b, v0.16b, v1.16b sub x1, x1, #1 cbnz x1, Loop_is_zero Loop_is_zero_done: dup v1.2d, v0.d[1] orr v0.16b, v0.16b, v1.16b umov x1, v0.d[0] mov x0, #1 cmp x1, #0 csel x0, x0, xzr, eq ret .globl _vec_is_equal_16x .private_extern _vec_is_equal_16x .align 5 _vec_is_equal_16x: hint #34 ld1 {v0.2d}, [x0], #16 ld1 {v1.2d}, [x1], #16 lsr x2, x2, #4 eor v0.16b, v0.16b, v1.16b Loop_is_equal: sub x2, x2, #1 cbz x2, Loop_is_equal_done ld1 {v1.2d}, [x0], #16 ld1 {v2.2d}, [x1], #16 eor v1.16b, v1.16b, v2.16b orr v0.16b, v0.16b, v1.16b b Loop_is_equal nop Loop_is_equal_done: dup v1.2d, v0.d[1] orr v0.16b, v0.16b, v1.16b umov x1, v0.d[0] mov x0, #1 cmp x1, #0 csel x0, x0, xzr, eq ret ================================================ FILE: build/mach-o/add_mod_384-x86_64.s ================================================ .text .globl _add_mod_384 .private_extern _add_mod_384 .p2align 5 _add_mod_384: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 pushq %rbx .cfi_adjust_cfa_offset 8 .cfi_offset %rbx,-24 pushq %r12 .cfi_adjust_cfa_offset 8 .cfi_offset %r12,-32 pushq %r13 .cfi_adjust_cfa_offset 8 .cfi_offset %r13,-40 pushq %r14 .cfi_adjust_cfa_offset 8 .cfi_offset %r14,-48 pushq %r15 .cfi_adjust_cfa_offset 8 .cfi_offset %r15,-56 subq $8,%rsp .cfi_adjust_cfa_offset 8 call __add_mod_384 movq 8(%rsp),%r15 .cfi_restore %r15 movq 16(%rsp),%r14 .cfi_restore %r14 movq 24(%rsp),%r13 .cfi_restore %r13 movq 32(%rsp),%r12 .cfi_restore %r12 movq 40(%rsp),%rbx .cfi_restore %rbx movq 48(%rsp),%rbp .cfi_restore %rbp leaq 56(%rsp),%rsp .cfi_adjust_cfa_offset -56 #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .p2align 5 __add_mod_384: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa #ifdef __SGX_LVI_HARDENING__ lfence #endif movq 0(%rsi),%r8 movq 8(%rsi),%r9 movq 16(%rsi),%r10 movq 24(%rsi),%r11 movq 32(%rsi),%r12 movq 40(%rsi),%r13 __add_mod_384_a_is_loaded: addq 0(%rdx),%r8 adcq 8(%rdx),%r9 adcq 16(%rdx),%r10 movq %r8,%r14 adcq 24(%rdx),%r11 movq %r9,%r15 adcq 32(%rdx),%r12 movq %r10,%rax adcq 40(%rdx),%r13 movq %r11,%rbx sbbq 
%rdx,%rdx subq 0(%rcx),%r8 sbbq 8(%rcx),%r9 movq %r12,%rbp sbbq 16(%rcx),%r10 sbbq 24(%rcx),%r11 sbbq 32(%rcx),%r12 movq %r13,%rsi sbbq 40(%rcx),%r13 sbbq $0,%rdx cmovcq %r14,%r8 cmovcq %r15,%r9 cmovcq %rax,%r10 movq %r8,0(%rdi) cmovcq %rbx,%r11 movq %r9,8(%rdi) cmovcq %rbp,%r12 movq %r10,16(%rdi) cmovcq %rsi,%r13 movq %r11,24(%rdi) movq %r12,32(%rdi) movq %r13,40(%rdi) #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .globl _add_mod_384x .private_extern _add_mod_384x .p2align 5 _add_mod_384x: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 pushq %rbx .cfi_adjust_cfa_offset 8 .cfi_offset %rbx,-24 pushq %r12 .cfi_adjust_cfa_offset 8 .cfi_offset %r12,-32 pushq %r13 .cfi_adjust_cfa_offset 8 .cfi_offset %r13,-40 pushq %r14 .cfi_adjust_cfa_offset 8 .cfi_offset %r14,-48 pushq %r15 .cfi_adjust_cfa_offset 8 .cfi_offset %r15,-56 subq $24,%rsp .cfi_adjust_cfa_offset 24 movq %rsi,0(%rsp) movq %rdx,8(%rsp) leaq 48(%rsi),%rsi leaq 48(%rdx),%rdx leaq 48(%rdi),%rdi call __add_mod_384 movq 0(%rsp),%rsi movq 8(%rsp),%rdx leaq -48(%rdi),%rdi call __add_mod_384 movq 24+0(%rsp),%r15 .cfi_restore %r15 movq 24+8(%rsp),%r14 .cfi_restore %r14 movq 24+16(%rsp),%r13 .cfi_restore %r13 movq 24+24(%rsp),%r12 .cfi_restore %r12 movq 24+32(%rsp),%rbx .cfi_restore %rbx movq 24+40(%rsp),%rbp .cfi_restore %rbp leaq 24+48(%rsp),%rsp .cfi_adjust_cfa_offset -24-8*6 #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .globl _rshift_mod_384 .private_extern _rshift_mod_384 .p2align 5 _rshift_mod_384: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 pushq %rbx .cfi_adjust_cfa_offset 8 .cfi_offset %rbx,-24 pushq %r12 .cfi_adjust_cfa_offset 8 .cfi_offset %r12,-32 pushq %r13 .cfi_adjust_cfa_offset 8 .cfi_offset %r13,-40 pushq %r14 .cfi_adjust_cfa_offset 8 .cfi_offset %r14,-48 pushq %r15 .cfi_adjust_cfa_offset 8 .cfi_offset %r15,-56 pushq %rdi .cfi_adjust_cfa_offset 8 #ifdef __SGX_LVI_HARDENING__ lfence #endif movq 0(%rsi),%r8 movq 8(%rsi),%r9 movq 16(%rsi),%r10 movq 24(%rsi),%r11 movq 32(%rsi),%r12 movq 40(%rsi),%r13 L$oop_rshift_mod_384: call __rshift_mod_384 decl %edx jnz L$oop_rshift_mod_384 movq %r8,0(%rdi) movq %r9,8(%rdi) movq %r10,16(%rdi) movq %r11,24(%rdi) movq %r12,32(%rdi) movq %r13,40(%rdi) movq 8(%rsp),%r15 .cfi_restore %r15 movq 16(%rsp),%r14 .cfi_restore %r14 movq 24(%rsp),%r13 .cfi_restore %r13 movq 32(%rsp),%r12 .cfi_restore %r12 movq 40(%rsp),%rbx .cfi_restore %rbx movq 48(%rsp),%rbp .cfi_restore %rbp leaq 56(%rsp),%rsp .cfi_adjust_cfa_offset -56 #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .p2align 5 __rshift_mod_384: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa movq $1,%rsi movq 0(%rcx),%r14 andq %r8,%rsi movq 8(%rcx),%r15 negq %rsi movq 16(%rcx),%rax andq %rsi,%r14 movq 24(%rcx),%rbx andq %rsi,%r15 movq 32(%rcx),%rbp andq %rsi,%rax andq %rsi,%rbx andq %rsi,%rbp andq 40(%rcx),%rsi addq %r8,%r14 adcq %r9,%r15 adcq %r10,%rax adcq %r11,%rbx adcq %r12,%rbp adcq %r13,%rsi sbbq %r13,%r13 shrq $1,%r14 movq %r15,%r8 shrq $1,%r15 movq %rax,%r9 shrq $1,%rax movq %rbx,%r10 shrq $1,%rbx movq %rbp,%r11 shrq $1,%rbp movq %rsi,%r12 shrq $1,%rsi shlq $63,%r8 shlq $63,%r9 orq %r14,%r8 shlq $63,%r10 orq %r15,%r9 shlq $63,%r11 orq %rax,%r10 shlq $63,%r12 orq %rbx,%r11 shlq $63,%r13 orq %rbp,%r12 orq %rsi,%r13 #ifdef __SGX_LVI_HARDENING__ popq %r14 lfence jmpq *%r14 ud2 #else 
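/*
 * Return convention used throughout these files: when
 * __SGX_LVI_HARDENING__ is defined, "ret" is replaced by popping the
 * return address into a register, issuing lfence so the loaded value
 * is architecturally resolved before use, and jumping to it
 * indirectly (the trailing ud2 stops straight-line speculation);
 * otherwise the plain two-byte ret below (0xf3,0xc3, "rep ret") is
 * emitted.
 */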
.byte 0xf3,0xc3 #endif .cfi_endproc .globl _div_by_2_mod_384 .private_extern _div_by_2_mod_384 .p2align 5 _div_by_2_mod_384: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 pushq %rbx .cfi_adjust_cfa_offset 8 .cfi_offset %rbx,-24 pushq %r12 .cfi_adjust_cfa_offset 8 .cfi_offset %r12,-32 pushq %r13 .cfi_adjust_cfa_offset 8 .cfi_offset %r13,-40 pushq %r14 .cfi_adjust_cfa_offset 8 .cfi_offset %r14,-48 pushq %r15 .cfi_adjust_cfa_offset 8 .cfi_offset %r15,-56 pushq %rdi .cfi_adjust_cfa_offset 8 #ifdef __SGX_LVI_HARDENING__ lfence #endif movq 0(%rsi),%r8 movq %rdx,%rcx movq 8(%rsi),%r9 movq 16(%rsi),%r10 movq 24(%rsi),%r11 movq 32(%rsi),%r12 movq 40(%rsi),%r13 call __rshift_mod_384 movq %r8,0(%rdi) movq %r9,8(%rdi) movq %r10,16(%rdi) movq %r11,24(%rdi) movq %r12,32(%rdi) movq %r13,40(%rdi) movq 8(%rsp),%r15 .cfi_restore %r15 movq 16(%rsp),%r14 .cfi_restore %r14 movq 24(%rsp),%r13 .cfi_restore %r13 movq 32(%rsp),%r12 .cfi_restore %r12 movq 40(%rsp),%rbx .cfi_restore %rbx movq 48(%rsp),%rbp .cfi_restore %rbp leaq 56(%rsp),%rsp .cfi_adjust_cfa_offset -56 #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .globl _lshift_mod_384 .private_extern _lshift_mod_384 .p2align 5 _lshift_mod_384: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 pushq %rbx .cfi_adjust_cfa_offset 8 .cfi_offset %rbx,-24 pushq %r12 .cfi_adjust_cfa_offset 8 .cfi_offset %r12,-32 pushq %r13 .cfi_adjust_cfa_offset 8 .cfi_offset %r13,-40 pushq %r14 .cfi_adjust_cfa_offset 8 .cfi_offset %r14,-48 pushq %r15 .cfi_adjust_cfa_offset 8 .cfi_offset %r15,-56 pushq %rdi .cfi_adjust_cfa_offset 8 #ifdef __SGX_LVI_HARDENING__ lfence #endif movq 0(%rsi),%r8 movq 8(%rsi),%r9 movq 16(%rsi),%r10 movq 24(%rsi),%r11 movq 32(%rsi),%r12 movq 40(%rsi),%r13 L$oop_lshift_mod_384: addq %r8,%r8 adcq %r9,%r9 adcq %r10,%r10 movq %r8,%r14 adcq %r11,%r11 movq %r9,%r15 adcq %r12,%r12 movq %r10,%rax adcq %r13,%r13 movq %r11,%rbx sbbq %rdi,%rdi subq 0(%rcx),%r8 sbbq 8(%rcx),%r9 movq %r12,%rbp sbbq 16(%rcx),%r10 sbbq 24(%rcx),%r11 sbbq 32(%rcx),%r12 movq %r13,%rsi sbbq 40(%rcx),%r13 sbbq $0,%rdi movq (%rsp),%rdi cmovcq %r14,%r8 cmovcq %r15,%r9 cmovcq %rax,%r10 cmovcq %rbx,%r11 cmovcq %rbp,%r12 cmovcq %rsi,%r13 decl %edx jnz L$oop_lshift_mod_384 movq %r8,0(%rdi) movq %r9,8(%rdi) movq %r10,16(%rdi) movq %r11,24(%rdi) movq %r12,32(%rdi) movq %r13,40(%rdi) movq 8(%rsp),%r15 .cfi_restore %r15 movq 16(%rsp),%r14 .cfi_restore %r14 movq 24(%rsp),%r13 .cfi_restore %r13 movq 32(%rsp),%r12 .cfi_restore %r12 movq 40(%rsp),%rbx .cfi_restore %rbx movq 48(%rsp),%rbp .cfi_restore %rbp leaq 56(%rsp),%rsp .cfi_adjust_cfa_offset -56 #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .p2align 5 __lshift_mod_384: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa addq %r8,%r8 adcq %r9,%r9 adcq %r10,%r10 movq %r8,%r14 adcq %r11,%r11 movq %r9,%r15 adcq %r12,%r12 movq %r10,%rax adcq %r13,%r13 movq %r11,%rbx sbbq %rdx,%rdx subq 0(%rcx),%r8 sbbq 8(%rcx),%r9 movq %r12,%rbp sbbq 16(%rcx),%r10 sbbq 24(%rcx),%r11 sbbq 32(%rcx),%r12 movq %r13,%rsi sbbq 40(%rcx),%r13 sbbq $0,%rdx cmovcq %r14,%r8 cmovcq %r15,%r9 cmovcq %rax,%r10 cmovcq %rbx,%r11 cmovcq %rbp,%r12 cmovcq %rsi,%r13 #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .globl _mul_by_3_mod_384 .private_extern _mul_by_3_mod_384 .p2align 5 _mul_by_3_mod_384: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa 
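/*
 * mul_by_3_mod_384(ret, a, p) computes 3*a mod p as 2*a + a:
 * __lshift_mod_384 doubles the operand held in registers, then
 * __add_mod_384_a_is_loaded adds the original |a| back in (its pointer
 * is pushed on the stack below and reloaded into %rdx). Reduction in
 * both helpers is a trial subtraction of p with cmov-restore on borrow.
 */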
pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 pushq %rbx .cfi_adjust_cfa_offset 8 .cfi_offset %rbx,-24 pushq %r12 .cfi_adjust_cfa_offset 8 .cfi_offset %r12,-32 pushq %r13 .cfi_adjust_cfa_offset 8 .cfi_offset %r13,-40 pushq %r14 .cfi_adjust_cfa_offset 8 .cfi_offset %r14,-48 pushq %r15 .cfi_adjust_cfa_offset 8 .cfi_offset %r15,-56 pushq %rsi .cfi_adjust_cfa_offset 8 #ifdef __SGX_LVI_HARDENING__ lfence #endif movq 0(%rsi),%r8 movq 8(%rsi),%r9 movq 16(%rsi),%r10 movq 24(%rsi),%r11 movq 32(%rsi),%r12 movq 40(%rsi),%r13 movq %rdx,%rcx call __lshift_mod_384 movq (%rsp),%rdx #ifdef __SGX_LVI_HARDENING__ lfence #endif call __add_mod_384_a_is_loaded movq 8(%rsp),%r15 .cfi_restore %r15 movq 16(%rsp),%r14 .cfi_restore %r14 movq 24(%rsp),%r13 .cfi_restore %r13 movq 32(%rsp),%r12 .cfi_restore %r12 movq 40(%rsp),%rbx .cfi_restore %rbx movq 48(%rsp),%rbp .cfi_restore %rbp leaq 56(%rsp),%rsp .cfi_adjust_cfa_offset -56 #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .globl _mul_by_8_mod_384 .private_extern _mul_by_8_mod_384 .p2align 5 _mul_by_8_mod_384: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 pushq %rbx .cfi_adjust_cfa_offset 8 .cfi_offset %rbx,-24 pushq %r12 .cfi_adjust_cfa_offset 8 .cfi_offset %r12,-32 pushq %r13 .cfi_adjust_cfa_offset 8 .cfi_offset %r13,-40 pushq %r14 .cfi_adjust_cfa_offset 8 .cfi_offset %r14,-48 pushq %r15 .cfi_adjust_cfa_offset 8 .cfi_offset %r15,-56 subq $8,%rsp .cfi_adjust_cfa_offset 8 #ifdef __SGX_LVI_HARDENING__ lfence #endif movq 0(%rsi),%r8 movq 8(%rsi),%r9 movq 16(%rsi),%r10 movq 24(%rsi),%r11 movq 32(%rsi),%r12 movq 40(%rsi),%r13 movq %rdx,%rcx call __lshift_mod_384 call __lshift_mod_384 call __lshift_mod_384 movq %r8,0(%rdi) movq %r9,8(%rdi) movq %r10,16(%rdi) movq %r11,24(%rdi) movq %r12,32(%rdi) movq %r13,40(%rdi) movq 8(%rsp),%r15 .cfi_restore %r15 movq 16(%rsp),%r14 .cfi_restore %r14 movq 24(%rsp),%r13 .cfi_restore %r13 movq 32(%rsp),%r12 .cfi_restore %r12 movq 40(%rsp),%rbx .cfi_restore %rbx movq 48(%rsp),%rbp .cfi_restore %rbp leaq 56(%rsp),%rsp .cfi_adjust_cfa_offset -56 #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .globl _mul_by_3_mod_384x .private_extern _mul_by_3_mod_384x .p2align 5 _mul_by_3_mod_384x: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 pushq %rbx .cfi_adjust_cfa_offset 8 .cfi_offset %rbx,-24 pushq %r12 .cfi_adjust_cfa_offset 8 .cfi_offset %r12,-32 pushq %r13 .cfi_adjust_cfa_offset 8 .cfi_offset %r13,-40 pushq %r14 .cfi_adjust_cfa_offset 8 .cfi_offset %r14,-48 pushq %r15 .cfi_adjust_cfa_offset 8 .cfi_offset %r15,-56 pushq %rsi .cfi_adjust_cfa_offset 8 #ifdef __SGX_LVI_HARDENING__ lfence #endif movq 0(%rsi),%r8 movq 8(%rsi),%r9 movq 16(%rsi),%r10 movq 24(%rsi),%r11 movq 32(%rsi),%r12 movq 40(%rsi),%r13 movq %rdx,%rcx call __lshift_mod_384 movq (%rsp),%rdx #ifdef __SGX_LVI_HARDENING__ lfence #endif call __add_mod_384_a_is_loaded movq (%rsp),%rsi leaq 48(%rdi),%rdi #ifdef __SGX_LVI_HARDENING__ lfence #endif movq 48(%rsi),%r8 movq 56(%rsi),%r9 movq 64(%rsi),%r10 movq 72(%rsi),%r11 movq 80(%rsi),%r12 movq 88(%rsi),%r13 call __lshift_mod_384 movq $48,%rdx addq (%rsp),%rdx #ifdef __SGX_LVI_HARDENING__ lfence #endif call __add_mod_384_a_is_loaded movq 8(%rsp),%r15 .cfi_restore %r15 movq 16(%rsp),%r14 .cfi_restore %r14 movq 24(%rsp),%r13 .cfi_restore %r13 movq 32(%rsp),%r12 .cfi_restore %r12 movq 40(%rsp),%rbx .cfi_restore %rbx movq 
48(%rsp),%rbp .cfi_restore %rbp leaq 56(%rsp),%rsp .cfi_adjust_cfa_offset -56 #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .globl _mul_by_8_mod_384x .private_extern _mul_by_8_mod_384x .p2align 5 _mul_by_8_mod_384x: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 pushq %rbx .cfi_adjust_cfa_offset 8 .cfi_offset %rbx,-24 pushq %r12 .cfi_adjust_cfa_offset 8 .cfi_offset %r12,-32 pushq %r13 .cfi_adjust_cfa_offset 8 .cfi_offset %r13,-40 pushq %r14 .cfi_adjust_cfa_offset 8 .cfi_offset %r14,-48 pushq %r15 .cfi_adjust_cfa_offset 8 .cfi_offset %r15,-56 pushq %rsi .cfi_adjust_cfa_offset 8 #ifdef __SGX_LVI_HARDENING__ lfence #endif movq 0(%rsi),%r8 movq 8(%rsi),%r9 movq 16(%rsi),%r10 movq 24(%rsi),%r11 movq 32(%rsi),%r12 movq 40(%rsi),%r13 movq %rdx,%rcx call __lshift_mod_384 call __lshift_mod_384 call __lshift_mod_384 movq (%rsp),%rsi movq %r8,0(%rdi) movq %r9,8(%rdi) movq %r10,16(%rdi) movq %r11,24(%rdi) movq %r12,32(%rdi) movq %r13,40(%rdi) #ifdef __SGX_LVI_HARDENING__ lfence #endif movq 48+0(%rsi),%r8 movq 48+8(%rsi),%r9 movq 48+16(%rsi),%r10 movq 48+24(%rsi),%r11 movq 48+32(%rsi),%r12 movq 48+40(%rsi),%r13 call __lshift_mod_384 call __lshift_mod_384 call __lshift_mod_384 movq %r8,48+0(%rdi) movq %r9,48+8(%rdi) movq %r10,48+16(%rdi) movq %r11,48+24(%rdi) movq %r12,48+32(%rdi) movq %r13,48+40(%rdi) movq 8(%rsp),%r15 .cfi_restore %r15 movq 16(%rsp),%r14 .cfi_restore %r14 movq 24(%rsp),%r13 .cfi_restore %r13 movq 32(%rsp),%r12 .cfi_restore %r12 movq 40(%rsp),%rbx .cfi_restore %rbx movq 48(%rsp),%rbp .cfi_restore %rbp leaq 56(%rsp),%rsp .cfi_adjust_cfa_offset -56 #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .globl _cneg_mod_384 .private_extern _cneg_mod_384 .p2align 5 _cneg_mod_384: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 pushq %rbx .cfi_adjust_cfa_offset 8 .cfi_offset %rbx,-24 pushq %r12 .cfi_adjust_cfa_offset 8 .cfi_offset %r12,-32 pushq %r13 .cfi_adjust_cfa_offset 8 .cfi_offset %r13,-40 pushq %r14 .cfi_adjust_cfa_offset 8 .cfi_offset %r14,-48 pushq %r15 .cfi_adjust_cfa_offset 8 .cfi_offset %r15,-56 pushq %rdx .cfi_adjust_cfa_offset 8 #ifdef __SGX_LVI_HARDENING__ lfence #endif movq 0(%rsi),%rdx movq 8(%rsi),%r9 movq 16(%rsi),%r10 movq %rdx,%r8 movq 24(%rsi),%r11 orq %r9,%rdx movq 32(%rsi),%r12 orq %r10,%rdx movq 40(%rsi),%r13 orq %r11,%rdx movq $-1,%rsi orq %r12,%rdx orq %r13,%rdx movq 0(%rcx),%r14 cmovnzq %rsi,%rdx movq 8(%rcx),%r15 movq 16(%rcx),%rax andq %rdx,%r14 movq 24(%rcx),%rbx andq %rdx,%r15 movq 32(%rcx),%rbp andq %rdx,%rax movq 40(%rcx),%rsi andq %rdx,%rbx movq 0(%rsp),%rcx andq %rdx,%rbp andq %rdx,%rsi subq %r8,%r14 sbbq %r9,%r15 sbbq %r10,%rax sbbq %r11,%rbx sbbq %r12,%rbp sbbq %r13,%rsi orq %rcx,%rcx cmovzq %r8,%r14 cmovzq %r9,%r15 cmovzq %r10,%rax movq %r14,0(%rdi) cmovzq %r11,%rbx movq %r15,8(%rdi) cmovzq %r12,%rbp movq %rax,16(%rdi) cmovzq %r13,%rsi movq %rbx,24(%rdi) movq %rbp,32(%rdi) movq %rsi,40(%rdi) movq 8(%rsp),%r15 .cfi_restore %r15 movq 16(%rsp),%r14 .cfi_restore %r14 movq 24(%rsp),%r13 .cfi_restore %r13 movq 32(%rsp),%r12 .cfi_restore %r12 movq 40(%rsp),%rbx .cfi_restore %rbx movq 48(%rsp),%rbp .cfi_restore %rbp leaq 56(%rsp),%rsp .cfi_adjust_cfa_offset -56 #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .globl _sub_mod_384 .private_extern _sub_mod_384 .p2align 5 _sub_mod_384: .cfi_startproc .byte 
0xf3,0x0f,0x1e,0xfa pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 pushq %rbx .cfi_adjust_cfa_offset 8 .cfi_offset %rbx,-24 pushq %r12 .cfi_adjust_cfa_offset 8 .cfi_offset %r12,-32 pushq %r13 .cfi_adjust_cfa_offset 8 .cfi_offset %r13,-40 pushq %r14 .cfi_adjust_cfa_offset 8 .cfi_offset %r14,-48 pushq %r15 .cfi_adjust_cfa_offset 8 .cfi_offset %r15,-56 subq $8,%rsp .cfi_adjust_cfa_offset 8 call __sub_mod_384 movq 8(%rsp),%r15 .cfi_restore %r15 movq 16(%rsp),%r14 .cfi_restore %r14 movq 24(%rsp),%r13 .cfi_restore %r13 movq 32(%rsp),%r12 .cfi_restore %r12 movq 40(%rsp),%rbx .cfi_restore %rbx movq 48(%rsp),%rbp .cfi_restore %rbp leaq 56(%rsp),%rsp .cfi_adjust_cfa_offset -56 #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .p2align 5 __sub_mod_384: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa #ifdef __SGX_LVI_HARDENING__ lfence #endif movq 0(%rsi),%r8 movq 8(%rsi),%r9 movq 16(%rsi),%r10 movq 24(%rsi),%r11 movq 32(%rsi),%r12 movq 40(%rsi),%r13 subq 0(%rdx),%r8 movq 0(%rcx),%r14 sbbq 8(%rdx),%r9 movq 8(%rcx),%r15 sbbq 16(%rdx),%r10 movq 16(%rcx),%rax sbbq 24(%rdx),%r11 movq 24(%rcx),%rbx sbbq 32(%rdx),%r12 movq 32(%rcx),%rbp sbbq 40(%rdx),%r13 movq 40(%rcx),%rsi sbbq %rdx,%rdx andq %rdx,%r14 andq %rdx,%r15 andq %rdx,%rax andq %rdx,%rbx andq %rdx,%rbp andq %rdx,%rsi addq %r14,%r8 adcq %r15,%r9 movq %r8,0(%rdi) adcq %rax,%r10 movq %r9,8(%rdi) adcq %rbx,%r11 movq %r10,16(%rdi) adcq %rbp,%r12 movq %r11,24(%rdi) adcq %rsi,%r13 movq %r12,32(%rdi) movq %r13,40(%rdi) #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .globl _sub_mod_384x .private_extern _sub_mod_384x .p2align 5 _sub_mod_384x: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 pushq %rbx .cfi_adjust_cfa_offset 8 .cfi_offset %rbx,-24 pushq %r12 .cfi_adjust_cfa_offset 8 .cfi_offset %r12,-32 pushq %r13 .cfi_adjust_cfa_offset 8 .cfi_offset %r13,-40 pushq %r14 .cfi_adjust_cfa_offset 8 .cfi_offset %r14,-48 pushq %r15 .cfi_adjust_cfa_offset 8 .cfi_offset %r15,-56 subq $24,%rsp .cfi_adjust_cfa_offset 24 movq %rsi,0(%rsp) movq %rdx,8(%rsp) leaq 48(%rsi),%rsi leaq 48(%rdx),%rdx leaq 48(%rdi),%rdi call __sub_mod_384 movq 0(%rsp),%rsi movq 8(%rsp),%rdx leaq -48(%rdi),%rdi call __sub_mod_384 movq 24+0(%rsp),%r15 .cfi_restore %r15 movq 24+8(%rsp),%r14 .cfi_restore %r14 movq 24+16(%rsp),%r13 .cfi_restore %r13 movq 24+24(%rsp),%r12 .cfi_restore %r12 movq 24+32(%rsp),%rbx .cfi_restore %rbx movq 24+40(%rsp),%rbp .cfi_restore %rbp leaq 24+48(%rsp),%rsp .cfi_adjust_cfa_offset -24-8*6 #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .globl _mul_by_1_plus_i_mod_384x .private_extern _mul_by_1_plus_i_mod_384x .p2align 5 _mul_by_1_plus_i_mod_384x: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 pushq %rbx .cfi_adjust_cfa_offset 8 .cfi_offset %rbx,-24 pushq %r12 .cfi_adjust_cfa_offset 8 .cfi_offset %r12,-32 pushq %r13 .cfi_adjust_cfa_offset 8 .cfi_offset %r13,-40 pushq %r14 .cfi_adjust_cfa_offset 8 .cfi_offset %r14,-48 pushq %r15 .cfi_adjust_cfa_offset 8 .cfi_offset %r15,-56 subq $56,%rsp .cfi_adjust_cfa_offset 56 #ifdef __SGX_LVI_HARDENING__ lfence #endif movq 0(%rsi),%r8 movq 8(%rsi),%r9 movq 16(%rsi),%r10 movq 24(%rsi),%r11 movq 32(%rsi),%r12 movq 40(%rsi),%r13 movq %r8,%r14 addq 48(%rsi),%r8 movq %r9,%r15 adcq 56(%rsi),%r9 movq %r10,%rax adcq 64(%rsi),%r10 movq %r11,%rbx adcq 72(%rsi),%r11 movq %r12,%rcx 
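/*
 * Multiplication by (1+i) in Fp2 = Fp[i]/(i^2+1):
 * (re + im*i)*(1+i) = (re - im) + (re + im)*i.
 * The adc chain here accumulates re+im (the imaginary output, parked on
 * the stack), while the sbb chain below forms re-im for the real output;
 * both are then reduced mod p branchlessly via masked add / cmov.
 */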
adcq 80(%rsi),%r12 movq %r13,%rbp adcq 88(%rsi),%r13 movq %rdi,48(%rsp) sbbq %rdi,%rdi subq 48(%rsi),%r14 sbbq 56(%rsi),%r15 sbbq 64(%rsi),%rax sbbq 72(%rsi),%rbx sbbq 80(%rsi),%rcx sbbq 88(%rsi),%rbp sbbq %rsi,%rsi movq %r8,0(%rsp) movq 0(%rdx),%r8 movq %r9,8(%rsp) movq 8(%rdx),%r9 movq %r10,16(%rsp) movq 16(%rdx),%r10 movq %r11,24(%rsp) movq 24(%rdx),%r11 movq %r12,32(%rsp) andq %rsi,%r8 movq 32(%rdx),%r12 movq %r13,40(%rsp) andq %rsi,%r9 movq 40(%rdx),%r13 andq %rsi,%r10 andq %rsi,%r11 andq %rsi,%r12 andq %rsi,%r13 movq 48(%rsp),%rsi addq %r8,%r14 movq 0(%rsp),%r8 adcq %r9,%r15 movq 8(%rsp),%r9 adcq %r10,%rax movq 16(%rsp),%r10 adcq %r11,%rbx movq 24(%rsp),%r11 adcq %r12,%rcx movq 32(%rsp),%r12 adcq %r13,%rbp movq 40(%rsp),%r13 movq %r14,0(%rsi) movq %r8,%r14 movq %r15,8(%rsi) movq %rax,16(%rsi) movq %r9,%r15 movq %rbx,24(%rsi) movq %rcx,32(%rsi) movq %r10,%rax movq %rbp,40(%rsi) subq 0(%rdx),%r8 movq %r11,%rbx sbbq 8(%rdx),%r9 sbbq 16(%rdx),%r10 movq %r12,%rcx sbbq 24(%rdx),%r11 sbbq 32(%rdx),%r12 movq %r13,%rbp sbbq 40(%rdx),%r13 sbbq $0,%rdi cmovcq %r14,%r8 cmovcq %r15,%r9 cmovcq %rax,%r10 movq %r8,48(%rsi) cmovcq %rbx,%r11 movq %r9,56(%rsi) cmovcq %rcx,%r12 movq %r10,64(%rsi) cmovcq %rbp,%r13 movq %r11,72(%rsi) movq %r12,80(%rsi) movq %r13,88(%rsi) movq 56+0(%rsp),%r15 .cfi_restore %r15 movq 56+8(%rsp),%r14 .cfi_restore %r14 movq 56+16(%rsp),%r13 .cfi_restore %r13 movq 56+24(%rsp),%r12 .cfi_restore %r12 movq 56+32(%rsp),%rbx .cfi_restore %rbx movq 56+40(%rsp),%rbp .cfi_restore %rbp leaq 56+48(%rsp),%rsp .cfi_adjust_cfa_offset -56-8*6 #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .globl _sgn0_pty_mod_384 .private_extern _sgn0_pty_mod_384 .p2align 5 _sgn0_pty_mod_384: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa #ifdef __SGX_LVI_HARDENING__ lfence #endif movq 0(%rdi),%r8 movq 8(%rdi),%r9 movq 16(%rdi),%r10 movq 24(%rdi),%r11 movq 32(%rdi),%rcx movq 40(%rdi),%rdx xorq %rax,%rax movq %r8,%rdi addq %r8,%r8 adcq %r9,%r9 adcq %r10,%r10 adcq %r11,%r11 adcq %rcx,%rcx adcq %rdx,%rdx adcq $0,%rax subq 0(%rsi),%r8 sbbq 8(%rsi),%r9 sbbq 16(%rsi),%r10 sbbq 24(%rsi),%r11 sbbq 32(%rsi),%rcx sbbq 40(%rsi),%rdx sbbq $0,%rax notq %rax andq $1,%rdi andq $2,%rax orq %rdi,%rax #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .globl _sgn0_pty_mod_384x .private_extern _sgn0_pty_mod_384x .p2align 5 _sgn0_pty_mod_384x: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 pushq %rbx .cfi_adjust_cfa_offset 8 .cfi_offset %rbx,-24 subq $8,%rsp .cfi_adjust_cfa_offset 8 #ifdef __SGX_LVI_HARDENING__ lfence #endif movq 48(%rdi),%r8 movq 56(%rdi),%r9 movq 64(%rdi),%r10 movq 72(%rdi),%r11 movq 80(%rdi),%rcx movq 88(%rdi),%rdx movq %r8,%rbx orq %r9,%r8 orq %r10,%r8 orq %r11,%r8 orq %rcx,%r8 orq %rdx,%r8 leaq 0(%rdi),%rax xorq %rdi,%rdi movq %rbx,%rbp addq %rbx,%rbx adcq %r9,%r9 adcq %r10,%r10 adcq %r11,%r11 adcq %rcx,%rcx adcq %rdx,%rdx adcq $0,%rdi subq 0(%rsi),%rbx sbbq 8(%rsi),%r9 sbbq 16(%rsi),%r10 sbbq 24(%rsi),%r11 sbbq 32(%rsi),%rcx sbbq 40(%rsi),%rdx sbbq $0,%rdi movq %r8,0(%rsp) notq %rdi andq $1,%rbp andq $2,%rdi orq %rbp,%rdi movq 0(%rax),%r8 movq 8(%rax),%r9 movq 16(%rax),%r10 movq 24(%rax),%r11 movq 32(%rax),%rcx movq 40(%rax),%rdx movq %r8,%rbx orq %r9,%r8 orq %r10,%r8 orq %r11,%r8 orq %rcx,%r8 orq %rdx,%r8 xorq %rax,%rax movq %rbx,%rbp addq %rbx,%rbx adcq %r9,%r9 adcq %r10,%r10 adcq %r11,%r11 adcq %rcx,%rcx adcq %rdx,%rdx adcq $0,%rax subq 0(%rsi),%rbx 
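/*
 * sgn0/parity extraction: bit 0 is the value's parity (least significant
 * limb & 1), bit 1 is the "sign", i.e. whether the value exceeds
 * (p-1)/2, tested branchlessly by computing 2*x - p and keeping the
 * final borrow. In this Fp2 variant the cmov selection at the end takes
 * the sign from the imaginary component whenever it is non-zero, and
 * falls back to the imaginary parity when the real component is zero.
 */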
sbbq 8(%rsi),%r9 sbbq 16(%rsi),%r10 sbbq 24(%rsi),%r11 sbbq 32(%rsi),%rcx sbbq 40(%rsi),%rdx sbbq $0,%rax movq 0(%rsp),%rbx notq %rax testq %r8,%r8 cmovzq %rdi,%rbp testq %rbx,%rbx cmovnzq %rdi,%rax andq $1,%rbp andq $2,%rax orq %rbp,%rax movq 8(%rsp),%rbx .cfi_restore %rbx movq 16(%rsp),%rbp .cfi_restore %rbp leaq 24(%rsp),%rsp .cfi_adjust_cfa_offset -24 #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .globl _vec_select_32 .private_extern _vec_select_32 .p2align 5 _vec_select_32: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa movd %ecx,%xmm5 pxor %xmm4,%xmm4 pshufd $0,%xmm5,%xmm5 #ifdef __SGX_LVI_HARDENING__ lfence #endif movdqu (%rsi),%xmm0 leaq 16(%rsi),%rsi pcmpeqd %xmm4,%xmm5 movdqu (%rdx),%xmm1 leaq 16(%rdx),%rdx pcmpeqd %xmm5,%xmm4 leaq 16(%rdi),%rdi pand %xmm4,%xmm0 movdqu 0+16-16(%rsi),%xmm2 pand %xmm5,%xmm1 movdqu 0+16-16(%rdx),%xmm3 por %xmm1,%xmm0 movdqu %xmm0,0-16(%rdi) pand %xmm4,%xmm2 pand %xmm5,%xmm3 por %xmm3,%xmm2 movdqu %xmm2,16-16(%rdi) #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .globl _vec_select_48 .private_extern _vec_select_48 .p2align 5 _vec_select_48: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa movd %ecx,%xmm5 pxor %xmm4,%xmm4 pshufd $0,%xmm5,%xmm5 #ifdef __SGX_LVI_HARDENING__ lfence #endif movdqu (%rsi),%xmm0 leaq 24(%rsi),%rsi pcmpeqd %xmm4,%xmm5 movdqu (%rdx),%xmm1 leaq 24(%rdx),%rdx pcmpeqd %xmm5,%xmm4 leaq 24(%rdi),%rdi pand %xmm4,%xmm0 movdqu 0+16-24(%rsi),%xmm2 pand %xmm5,%xmm1 movdqu 0+16-24(%rdx),%xmm3 por %xmm1,%xmm0 movdqu %xmm0,0-24(%rdi) pand %xmm4,%xmm2 movdqu 16+16-24(%rsi),%xmm0 pand %xmm5,%xmm3 movdqu 16+16-24(%rdx),%xmm1 por %xmm3,%xmm2 movdqu %xmm2,16-24(%rdi) pand %xmm4,%xmm0 pand %xmm5,%xmm1 por %xmm1,%xmm0 movdqu %xmm0,32-24(%rdi) #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .globl _vec_select_96 .private_extern _vec_select_96 .p2align 5 _vec_select_96: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa movd %ecx,%xmm5 pxor %xmm4,%xmm4 pshufd $0,%xmm5,%xmm5 #ifdef __SGX_LVI_HARDENING__ lfence #endif movdqu (%rsi),%xmm0 leaq 48(%rsi),%rsi pcmpeqd %xmm4,%xmm5 movdqu (%rdx),%xmm1 leaq 48(%rdx),%rdx pcmpeqd %xmm5,%xmm4 leaq 48(%rdi),%rdi pand %xmm4,%xmm0 movdqu 0+16-48(%rsi),%xmm2 pand %xmm5,%xmm1 movdqu 0+16-48(%rdx),%xmm3 por %xmm1,%xmm0 movdqu %xmm0,0-48(%rdi) pand %xmm4,%xmm2 movdqu 16+16-48(%rsi),%xmm0 pand %xmm5,%xmm3 movdqu 16+16-48(%rdx),%xmm1 por %xmm3,%xmm2 movdqu %xmm2,16-48(%rdi) pand %xmm4,%xmm0 movdqu 32+16-48(%rsi),%xmm2 pand %xmm5,%xmm1 movdqu 32+16-48(%rdx),%xmm3 por %xmm1,%xmm0 movdqu %xmm0,32-48(%rdi) pand %xmm4,%xmm2 movdqu 48+16-48(%rsi),%xmm0 pand %xmm5,%xmm3 movdqu 48+16-48(%rdx),%xmm1 por %xmm3,%xmm2 movdqu %xmm2,48-48(%rdi) pand %xmm4,%xmm0 movdqu 64+16-48(%rsi),%xmm2 pand %xmm5,%xmm1 movdqu 64+16-48(%rdx),%xmm3 por %xmm1,%xmm0 movdqu %xmm0,64-48(%rdi) pand %xmm4,%xmm2 pand %xmm5,%xmm3 por %xmm3,%xmm2 movdqu %xmm2,80-48(%rdi) #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .globl _vec_select_192 .private_extern _vec_select_192 .p2align 5 _vec_select_192: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa movd %ecx,%xmm5 pxor %xmm4,%xmm4 pshufd $0,%xmm5,%xmm5 #ifdef __SGX_LVI_HARDENING__ lfence #endif movdqu (%rsi),%xmm0 leaq 96(%rsi),%rsi pcmpeqd %xmm4,%xmm5 movdqu (%rdx),%xmm1 leaq 96(%rdx),%rdx pcmpeqd %xmm5,%xmm4 leaq 96(%rdi),%rdi pand %xmm4,%xmm0 movdqu 0+16-96(%rsi),%xmm2 pand %xmm5,%xmm1 movdqu 0+16-96(%rdx),%xmm3 por 
%xmm1,%xmm0 movdqu %xmm0,0-96(%rdi) pand %xmm4,%xmm2 movdqu 16+16-96(%rsi),%xmm0 pand %xmm5,%xmm3 movdqu 16+16-96(%rdx),%xmm1 por %xmm3,%xmm2 movdqu %xmm2,16-96(%rdi) pand %xmm4,%xmm0 movdqu 32+16-96(%rsi),%xmm2 pand %xmm5,%xmm1 movdqu 32+16-96(%rdx),%xmm3 por %xmm1,%xmm0 movdqu %xmm0,32-96(%rdi) pand %xmm4,%xmm2 movdqu 48+16-96(%rsi),%xmm0 pand %xmm5,%xmm3 movdqu 48+16-96(%rdx),%xmm1 por %xmm3,%xmm2 movdqu %xmm2,48-96(%rdi) pand %xmm4,%xmm0 movdqu 64+16-96(%rsi),%xmm2 pand %xmm5,%xmm1 movdqu 64+16-96(%rdx),%xmm3 por %xmm1,%xmm0 movdqu %xmm0,64-96(%rdi) pand %xmm4,%xmm2 movdqu 80+16-96(%rsi),%xmm0 pand %xmm5,%xmm3 movdqu 80+16-96(%rdx),%xmm1 por %xmm3,%xmm2 movdqu %xmm2,80-96(%rdi) pand %xmm4,%xmm0 movdqu 96+16-96(%rsi),%xmm2 pand %xmm5,%xmm1 movdqu 96+16-96(%rdx),%xmm3 por %xmm1,%xmm0 movdqu %xmm0,96-96(%rdi) pand %xmm4,%xmm2 movdqu 112+16-96(%rsi),%xmm0 pand %xmm5,%xmm3 movdqu 112+16-96(%rdx),%xmm1 por %xmm3,%xmm2 movdqu %xmm2,112-96(%rdi) pand %xmm4,%xmm0 movdqu 128+16-96(%rsi),%xmm2 pand %xmm5,%xmm1 movdqu 128+16-96(%rdx),%xmm3 por %xmm1,%xmm0 movdqu %xmm0,128-96(%rdi) pand %xmm4,%xmm2 movdqu 144+16-96(%rsi),%xmm0 pand %xmm5,%xmm3 movdqu 144+16-96(%rdx),%xmm1 por %xmm3,%xmm2 movdqu %xmm2,144-96(%rdi) pand %xmm4,%xmm0 movdqu 160+16-96(%rsi),%xmm2 pand %xmm5,%xmm1 movdqu 160+16-96(%rdx),%xmm3 por %xmm1,%xmm0 movdqu %xmm0,160-96(%rdi) pand %xmm4,%xmm2 pand %xmm5,%xmm3 por %xmm3,%xmm2 movdqu %xmm2,176-96(%rdi) #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .globl _vec_select_144 .private_extern _vec_select_144 .p2align 5 _vec_select_144: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa movd %ecx,%xmm5 pxor %xmm4,%xmm4 pshufd $0,%xmm5,%xmm5 #ifdef __SGX_LVI_HARDENING__ lfence #endif movdqu (%rsi),%xmm0 leaq 72(%rsi),%rsi pcmpeqd %xmm4,%xmm5 movdqu (%rdx),%xmm1 leaq 72(%rdx),%rdx pcmpeqd %xmm5,%xmm4 leaq 72(%rdi),%rdi pand %xmm4,%xmm0 movdqu 0+16-72(%rsi),%xmm2 pand %xmm5,%xmm1 movdqu 0+16-72(%rdx),%xmm3 por %xmm1,%xmm0 movdqu %xmm0,0-72(%rdi) pand %xmm4,%xmm2 movdqu 16+16-72(%rsi),%xmm0 pand %xmm5,%xmm3 movdqu 16+16-72(%rdx),%xmm1 por %xmm3,%xmm2 movdqu %xmm2,16-72(%rdi) pand %xmm4,%xmm0 movdqu 32+16-72(%rsi),%xmm2 pand %xmm5,%xmm1 movdqu 32+16-72(%rdx),%xmm3 por %xmm1,%xmm0 movdqu %xmm0,32-72(%rdi) pand %xmm4,%xmm2 movdqu 48+16-72(%rsi),%xmm0 pand %xmm5,%xmm3 movdqu 48+16-72(%rdx),%xmm1 por %xmm3,%xmm2 movdqu %xmm2,48-72(%rdi) pand %xmm4,%xmm0 movdqu 64+16-72(%rsi),%xmm2 pand %xmm5,%xmm1 movdqu 64+16-72(%rdx),%xmm3 por %xmm1,%xmm0 movdqu %xmm0,64-72(%rdi) pand %xmm4,%xmm2 movdqu 80+16-72(%rsi),%xmm0 pand %xmm5,%xmm3 movdqu 80+16-72(%rdx),%xmm1 por %xmm3,%xmm2 movdqu %xmm2,80-72(%rdi) pand %xmm4,%xmm0 movdqu 96+16-72(%rsi),%xmm2 pand %xmm5,%xmm1 movdqu 96+16-72(%rdx),%xmm3 por %xmm1,%xmm0 movdqu %xmm0,96-72(%rdi) pand %xmm4,%xmm2 movdqu 112+16-72(%rsi),%xmm0 pand %xmm5,%xmm3 movdqu 112+16-72(%rdx),%xmm1 por %xmm3,%xmm2 movdqu %xmm2,112-72(%rdi) pand %xmm4,%xmm0 pand %xmm5,%xmm1 por %xmm1,%xmm0 movdqu %xmm0,128-72(%rdi) #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .globl _vec_select_288 .private_extern _vec_select_288 .p2align 5 _vec_select_288: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa movd %ecx,%xmm5 pxor %xmm4,%xmm4 pshufd $0,%xmm5,%xmm5 #ifdef __SGX_LVI_HARDENING__ lfence #endif movdqu (%rsi),%xmm0 leaq 144(%rsi),%rsi pcmpeqd %xmm4,%xmm5 movdqu (%rdx),%xmm1 leaq 144(%rdx),%rdx pcmpeqd %xmm5,%xmm4 leaq 144(%rdi),%rdi pand %xmm4,%xmm0 movdqu 0+16-144(%rsi),%xmm2 pand %xmm5,%xmm1 movdqu 
0+16-144(%rdx),%xmm3 por %xmm1,%xmm0 movdqu %xmm0,0-144(%rdi) pand %xmm4,%xmm2 movdqu 16+16-144(%rsi),%xmm0 pand %xmm5,%xmm3 movdqu 16+16-144(%rdx),%xmm1 por %xmm3,%xmm2 movdqu %xmm2,16-144(%rdi) pand %xmm4,%xmm0 movdqu 32+16-144(%rsi),%xmm2 pand %xmm5,%xmm1 movdqu 32+16-144(%rdx),%xmm3 por %xmm1,%xmm0 movdqu %xmm0,32-144(%rdi) pand %xmm4,%xmm2 movdqu 48+16-144(%rsi),%xmm0 pand %xmm5,%xmm3 movdqu 48+16-144(%rdx),%xmm1 por %xmm3,%xmm2 movdqu %xmm2,48-144(%rdi) pand %xmm4,%xmm0 movdqu 64+16-144(%rsi),%xmm2 pand %xmm5,%xmm1 movdqu 64+16-144(%rdx),%xmm3 por %xmm1,%xmm0 movdqu %xmm0,64-144(%rdi) pand %xmm4,%xmm2 movdqu 80+16-144(%rsi),%xmm0 pand %xmm5,%xmm3 movdqu 80+16-144(%rdx),%xmm1 por %xmm3,%xmm2 movdqu %xmm2,80-144(%rdi) pand %xmm4,%xmm0 movdqu 96+16-144(%rsi),%xmm2 pand %xmm5,%xmm1 movdqu 96+16-144(%rdx),%xmm3 por %xmm1,%xmm0 movdqu %xmm0,96-144(%rdi) pand %xmm4,%xmm2 movdqu 112+16-144(%rsi),%xmm0 pand %xmm5,%xmm3 movdqu 112+16-144(%rdx),%xmm1 por %xmm3,%xmm2 movdqu %xmm2,112-144(%rdi) pand %xmm4,%xmm0 movdqu 128+16-144(%rsi),%xmm2 pand %xmm5,%xmm1 movdqu 128+16-144(%rdx),%xmm3 por %xmm1,%xmm0 movdqu %xmm0,128-144(%rdi) pand %xmm4,%xmm2 movdqu 144+16-144(%rsi),%xmm0 pand %xmm5,%xmm3 movdqu 144+16-144(%rdx),%xmm1 por %xmm3,%xmm2 movdqu %xmm2,144-144(%rdi) pand %xmm4,%xmm0 movdqu 160+16-144(%rsi),%xmm2 pand %xmm5,%xmm1 movdqu 160+16-144(%rdx),%xmm3 por %xmm1,%xmm0 movdqu %xmm0,160-144(%rdi) pand %xmm4,%xmm2 movdqu 176+16-144(%rsi),%xmm0 pand %xmm5,%xmm3 movdqu 176+16-144(%rdx),%xmm1 por %xmm3,%xmm2 movdqu %xmm2,176-144(%rdi) pand %xmm4,%xmm0 movdqu 192+16-144(%rsi),%xmm2 pand %xmm5,%xmm1 movdqu 192+16-144(%rdx),%xmm3 por %xmm1,%xmm0 movdqu %xmm0,192-144(%rdi) pand %xmm4,%xmm2 movdqu 208+16-144(%rsi),%xmm0 pand %xmm5,%xmm3 movdqu 208+16-144(%rdx),%xmm1 por %xmm3,%xmm2 movdqu %xmm2,208-144(%rdi) pand %xmm4,%xmm0 movdqu 224+16-144(%rsi),%xmm2 pand %xmm5,%xmm1 movdqu 224+16-144(%rdx),%xmm3 por %xmm1,%xmm0 movdqu %xmm0,224-144(%rdi) pand %xmm4,%xmm2 movdqu 240+16-144(%rsi),%xmm0 pand %xmm5,%xmm3 movdqu 240+16-144(%rdx),%xmm1 por %xmm3,%xmm2 movdqu %xmm2,240-144(%rdi) pand %xmm4,%xmm0 movdqu 256+16-144(%rsi),%xmm2 pand %xmm5,%xmm1 movdqu 256+16-144(%rdx),%xmm3 por %xmm1,%xmm0 movdqu %xmm0,256-144(%rdi) pand %xmm4,%xmm2 pand %xmm5,%xmm3 por %xmm3,%xmm2 movdqu %xmm2,272-144(%rdi) #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .globl _vec_prefetch .private_extern _vec_prefetch .p2align 5 _vec_prefetch: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa leaq -1(%rdi,%rsi,1),%rsi movq $64,%rax xorq %r8,%r8 #ifdef __SGX_LVI_HARDENING__ lfence #endif prefetchnta (%rdi) leaq (%rdi,%rax,1),%rdi cmpq %rsi,%rdi cmovaq %rsi,%rdi cmovaq %r8,%rax prefetchnta (%rdi) leaq (%rdi,%rax,1),%rdi cmpq %rsi,%rdi cmovaq %rsi,%rdi cmovaq %r8,%rax prefetchnta (%rdi) leaq (%rdi,%rax,1),%rdi cmpq %rsi,%rdi cmovaq %rsi,%rdi cmovaq %r8,%rax prefetchnta (%rdi) leaq (%rdi,%rax,1),%rdi cmpq %rsi,%rdi cmovaq %rsi,%rdi cmovaq %r8,%rax prefetchnta (%rdi) leaq (%rdi,%rax,1),%rdi cmpq %rsi,%rdi cmovaq %rsi,%rdi cmovaq %r8,%rax prefetchnta (%rdi) leaq (%rdi,%rax,1),%rdi cmpq %rsi,%rdi cmovaq %rsi,%rdi prefetchnta (%rdi) #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .globl _vec_is_zero_16x .private_extern _vec_is_zero_16x .p2align 5 _vec_is_zero_16x: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa shrl $4,%esi #ifdef __SGX_LVI_HARDENING__ lfence #endif movdqu (%rdi),%xmm0 leaq 16(%rdi),%rdi L$oop_is_zero: decl %esi jz L$oop_is_zero_done 
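/*
 * Constant-time zero test for a 16*n-byte vector: OR all 16-byte chunks
 * together, fold the 128-bit accumulator to 64 bits (pshufd/por), move
 * it to %rax (the .byte sequence below encodes movq %xmm0,%rax) and map
 * zero/non-zero to 1/0 with cmov. The loop trip count depends only on
 * the public length, never on the data.
 */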
movdqu (%rdi),%xmm1 leaq 16(%rdi),%rdi por %xmm1,%xmm0 jmp L$oop_is_zero L$oop_is_zero_done: pshufd $0x4e,%xmm0,%xmm1 por %xmm1,%xmm0 .byte 102,72,15,126,192 incl %esi testq %rax,%rax cmovnzl %esi,%eax xorl $1,%eax #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .globl _vec_is_equal_16x .private_extern _vec_is_equal_16x .p2align 5 _vec_is_equal_16x: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa shrl $4,%edx #ifdef __SGX_LVI_HARDENING__ lfence #endif movdqu (%rdi),%xmm0 movdqu (%rsi),%xmm1 subq %rdi,%rsi leaq 16(%rdi),%rdi pxor %xmm1,%xmm0 L$oop_is_equal: decl %edx jz L$oop_is_equal_done movdqu (%rdi),%xmm1 movdqu (%rdi,%rsi,1),%xmm2 leaq 16(%rdi),%rdi pxor %xmm2,%xmm1 por %xmm1,%xmm0 jmp L$oop_is_equal L$oop_is_equal_done: pshufd $0x4e,%xmm0,%xmm1 por %xmm1,%xmm0 .byte 102,72,15,126,192 incl %edx testq %rax,%rax cmovnzl %edx,%eax xorl $1,%eax #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc ================================================ FILE: build/mach-o/add_mod_384x384-x86_64.s ================================================ .text .globl _add_mod_384x384 .private_extern _add_mod_384x384 .p2align 5 _add_mod_384x384: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 pushq %rbx .cfi_adjust_cfa_offset 8 .cfi_offset %rbx,-24 pushq %r12 .cfi_adjust_cfa_offset 8 .cfi_offset %r12,-32 pushq %r13 .cfi_adjust_cfa_offset 8 .cfi_offset %r13,-40 pushq %r14 .cfi_adjust_cfa_offset 8 .cfi_offset %r14,-48 pushq %r15 .cfi_adjust_cfa_offset 8 .cfi_offset %r15,-56 subq $8,%rsp .cfi_adjust_cfa_offset 8 #ifdef __SGX_LVI_HARDENING__ lfence #endif movq 0(%rsi),%r8 movq 8(%rsi),%r9 movq 16(%rsi),%r10 movq 24(%rsi),%r11 movq 32(%rsi),%r12 movq 40(%rsi),%r13 movq 48(%rsi),%r14 addq 0(%rdx),%r8 movq 56(%rsi),%r15 adcq 8(%rdx),%r9 movq 64(%rsi),%rax adcq 16(%rdx),%r10 movq 72(%rsi),%rbx adcq 24(%rdx),%r11 movq 80(%rsi),%rbp adcq 32(%rdx),%r12 movq 88(%rsi),%rsi adcq 40(%rdx),%r13 movq %r8,0(%rdi) adcq 48(%rdx),%r14 movq %r9,8(%rdi) adcq 56(%rdx),%r15 movq %r10,16(%rdi) adcq 64(%rdx),%rax movq %r12,32(%rdi) movq %r14,%r8 adcq 72(%rdx),%rbx movq %r11,24(%rdi) movq %r15,%r9 adcq 80(%rdx),%rbp movq %r13,40(%rdi) movq %rax,%r10 adcq 88(%rdx),%rsi movq %rbx,%r11 sbbq %rdx,%rdx subq 0(%rcx),%r14 sbbq 8(%rcx),%r15 movq %rbp,%r12 sbbq 16(%rcx),%rax sbbq 24(%rcx),%rbx sbbq 32(%rcx),%rbp movq %rsi,%r13 sbbq 40(%rcx),%rsi sbbq $0,%rdx cmovcq %r8,%r14 cmovcq %r9,%r15 cmovcq %r10,%rax movq %r14,48(%rdi) cmovcq %r11,%rbx movq %r15,56(%rdi) cmovcq %r12,%rbp movq %rax,64(%rdi) cmovcq %r13,%rsi movq %rbx,72(%rdi) movq %rbp,80(%rdi) movq %rsi,88(%rdi) movq 8(%rsp),%r15 .cfi_restore %r15 movq 16(%rsp),%r14 .cfi_restore %r14 movq 24(%rsp),%r13 .cfi_restore %r13 movq 32(%rsp),%r12 .cfi_restore %r12 movq 40(%rsp),%rbx .cfi_restore %rbx movq 48(%rsp),%rbp .cfi_restore %rbp leaq 56(%rsp),%rsp .cfi_adjust_cfa_offset -56 #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .globl _sub_mod_384x384 .private_extern _sub_mod_384x384 .p2align 5 _sub_mod_384x384: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 pushq %rbx .cfi_adjust_cfa_offset 8 .cfi_offset %rbx,-24 pushq %r12 .cfi_adjust_cfa_offset 8 .cfi_offset %r12,-32 pushq %r13 .cfi_adjust_cfa_offset 8 .cfi_offset %r13,-40 pushq %r14 .cfi_adjust_cfa_offset 8 .cfi_offset %r14,-48 pushq %r15 .cfi_adjust_cfa_offset 8 .cfi_offset %r15,-56 subq 
$8,%rsp .cfi_adjust_cfa_offset 8 #ifdef __SGX_LVI_HARDENING__ lfence #endif movq 0(%rsi),%r8 movq 8(%rsi),%r9 movq 16(%rsi),%r10 movq 24(%rsi),%r11 movq 32(%rsi),%r12 movq 40(%rsi),%r13 movq 48(%rsi),%r14 subq 0(%rdx),%r8 movq 56(%rsi),%r15 sbbq 8(%rdx),%r9 movq 64(%rsi),%rax sbbq 16(%rdx),%r10 movq 72(%rsi),%rbx sbbq 24(%rdx),%r11 movq 80(%rsi),%rbp sbbq 32(%rdx),%r12 movq 88(%rsi),%rsi sbbq 40(%rdx),%r13 movq %r8,0(%rdi) sbbq 48(%rdx),%r14 movq 0(%rcx),%r8 movq %r9,8(%rdi) sbbq 56(%rdx),%r15 movq 8(%rcx),%r9 movq %r10,16(%rdi) sbbq 64(%rdx),%rax movq 16(%rcx),%r10 movq %r11,24(%rdi) sbbq 72(%rdx),%rbx movq 24(%rcx),%r11 movq %r12,32(%rdi) sbbq 80(%rdx),%rbp movq 32(%rcx),%r12 movq %r13,40(%rdi) sbbq 88(%rdx),%rsi movq 40(%rcx),%r13 sbbq %rdx,%rdx andq %rdx,%r8 andq %rdx,%r9 andq %rdx,%r10 andq %rdx,%r11 andq %rdx,%r12 andq %rdx,%r13 addq %r8,%r14 adcq %r9,%r15 movq %r14,48(%rdi) adcq %r10,%rax movq %r15,56(%rdi) adcq %r11,%rbx movq %rax,64(%rdi) adcq %r12,%rbp movq %rbx,72(%rdi) adcq %r13,%rsi movq %rbp,80(%rdi) movq %rsi,88(%rdi) movq 8(%rsp),%r15 .cfi_restore %r15 movq 16(%rsp),%r14 .cfi_restore %r14 movq 24(%rsp),%r13 .cfi_restore %r13 movq 32(%rsp),%r12 .cfi_restore %r12 movq 40(%rsp),%rbx .cfi_restore %rbx movq 48(%rsp),%rbp .cfi_restore %rbp leaq 56(%rsp),%rsp .cfi_adjust_cfa_offset -56 #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc ================================================ FILE: build/mach-o/ct_inverse_mod_256-armv8.S ================================================ .text .globl _ct_inverse_mod_256 .private_extern _ct_inverse_mod_256 .align 5 _ct_inverse_mod_256: hint #25 stp x29, x30, [sp,#-10*__SIZEOF_POINTER__]! add x29, sp, #0 stp x19, x20, [sp,#2*__SIZEOF_POINTER__] stp x21, x22, [sp,#4*__SIZEOF_POINTER__] stp x23, x24, [sp,#6*__SIZEOF_POINTER__] stp x25, x26, [sp,#8*__SIZEOF_POINTER__] sub sp, sp, #1040 ldp x4, x5, [x1,#8*0] ldp x6, x7, [x1,#8*2] #ifdef __CHERI_PURE_CAPABILITY__ add x1,sp,#16+511 alignd c1,c1,#9 scbnds c1,c1,#512 #else add x1, sp, #16+511 // find closest 512-byte-aligned spot and x1, x1, #-512 // in the frame... 
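// Constant-time inversion via extended binary GCD: the 1040-byte frame
// holds two half-frames |a|b|u|v| that the code flip-flops between by
// XOR-ing the pointer with #256. Each outer pass derives 31 steps' worth
// of transition factors |f0|g0|f1|g1| from 64-bit approximations of |a|
// and |b| (__ab_approximation_31_256), shrinks |a|,|b| by 2^31
// (__smul_256_n_shift_by_31) and grows |u|,|v| (__smul_256x63); a final
// 47-step pass (31 + 512%31, see below) brings the total to 512 steps,
// leaving, in effect, the inverse scaled by 2^512 as a 512-bit result.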
#endif str x0, [sp] // offload out_ptr ldp x8, x9, [x2,#8*0] ldp x10, x11, [x2,#8*2] stp x4, x5, [x1,#8*0] // copy input to |a| stp x6, x7, [x1,#8*2] stp x8, x9, [x1,#8*4] // copy modulus to |b| stp x10, x11, [x1,#8*6] ////////////////////////////////////////// first iteration bl Lab_approximation_31_256_loaded eor x0, x1, #256 // pointer to dst |a|b|u|v| #ifdef __CHERI_PURE_CAPABILITY__ scvalue c0,c1,x0 #endif bl __smul_256_n_shift_by_31 str x12,[x0,#8*8] // initialize |u| with |f0| mov x12, x14 // |f1| mov x13, x15 // |g1| add x0,x0,#8*4 bl __smul_256_n_shift_by_31 str x12, [x0,#8*10] // initialize |v| with |f1| ////////////////////////////////////////// second iteration eor x1, x1, #256 // flip-flop src |a|b|u|v| #ifdef __CHERI_PURE_CAPABILITY__ scvalue c1,c0,x1 #endif bl __ab_approximation_31_256 eor x0, x1, #256 // pointer to dst |a|b|u|v| #ifdef __CHERI_PURE_CAPABILITY__ scvalue c0,c1,x0 #endif bl __smul_256_n_shift_by_31 mov x16, x12 // corrected |f0| mov x17, x13 // corrected |g0| mov x12, x14 // |f1| mov x13, x15 // |g1| add x0,x0,#8*4 bl __smul_256_n_shift_by_31 ldr x8, [x1,#8*8] // |u| ldr x9, [x1,#8*14] // |v| madd x4, x16, x8, xzr // |u|*|f0| madd x4, x17, x9, x4 // |v|*|g0| asr x5, x4, #63 // sign extension stp x4, x5, [x0,#8*4] stp x5, x5, [x0,#8*6] madd x4, x12, x8, xzr // |u|*|f1| madd x4, x13, x9, x4 // |v|*|g1| asr x5, x4, #63 // sign extension stp x4, x5, [x0,#8*10] stp x5, x5, [x0,#8*12] eor x1, x1, #256 // flip-flop src |a|b|u|v| #ifdef __CHERI_PURE_CAPABILITY__ scvalue c1,c0,x1 #endif bl __ab_approximation_31_256 eor x0, x1, #256 // pointer to dst |a|b|u|v| #ifdef __CHERI_PURE_CAPABILITY__ scvalue c0,c1,x0 #endif bl __smul_256_n_shift_by_31 mov x16, x12 // corrected |f0| mov x17, x13 // corrected |g0| mov x12, x14 // |f1| mov x13, x15 // |g1| add x0,x0,#8*4 bl __smul_256_n_shift_by_31 add x0,x0,#8*4 bl __smul_256x63 mov x16, x12 // corrected |f1| mov x17, x13 // corrected |g1| add x0,x0,#8*6 bl __smul_256x63 eor x1, x1, #256 // flip-flop src |a|b|u|v| #ifdef __CHERI_PURE_CAPABILITY__ scvalue c1,c0,x1 #endif bl __ab_approximation_31_256 eor x0, x1, #256 // pointer to dst |a|b|u|v| #ifdef __CHERI_PURE_CAPABILITY__ scvalue c0,c1,x0 #endif bl __smul_256_n_shift_by_31 mov x16, x12 // corrected |f0| mov x17, x13 // corrected |g0| mov x12, x14 // |f1| mov x13, x15 // |g1| add x0,x0,#8*4 bl __smul_256_n_shift_by_31 add x0,x0,#8*4 bl __smul_256x63 mov x16, x12 // corrected |f1| mov x17, x13 // corrected |g1| add x0,x0,#8*6 bl __smul_256x63 eor x1, x1, #256 // flip-flop src |a|b|u|v| #ifdef __CHERI_PURE_CAPABILITY__ scvalue c1,c0,x1 #endif bl __ab_approximation_31_256 eor x0, x1, #256 // pointer to dst |a|b|u|v| #ifdef __CHERI_PURE_CAPABILITY__ scvalue c0,c1,x0 #endif bl __smul_256_n_shift_by_31 mov x16, x12 // corrected |f0| mov x17, x13 // corrected |g0| mov x12, x14 // |f1| mov x13, x15 // |g1| add x0,x0,#8*4 bl __smul_256_n_shift_by_31 add x0,x0,#8*4 bl __smul_256x63 mov x16, x12 // corrected |f1| mov x17, x13 // corrected |g1| add x0,x0,#8*6 bl __smul_256x63 eor x1, x1, #256 // flip-flop src |a|b|u|v| #ifdef __CHERI_PURE_CAPABILITY__ scvalue c1,c0,x1 #endif bl __ab_approximation_31_256 eor x0, x1, #256 // pointer to dst |a|b|u|v| #ifdef __CHERI_PURE_CAPABILITY__ scvalue c0,c1,x0 #endif bl __smul_256_n_shift_by_31 mov x16, x12 // corrected |f0| mov x17, x13 // corrected |g0| mov x12, x14 // |f1| mov x13, x15 // |g1| add x0,x0,#8*4 bl __smul_256_n_shift_by_31 add x0,x0,#8*4 bl __smul_256x63 mov x16, x12 // corrected |f1| mov x17, x13 // corrected |g1| add x0,x0,#8*6 bl 
__smul_256x63 eor x1, x1, #256 // flip-flop src |a|b|u|v| #ifdef __CHERI_PURE_CAPABILITY__ scvalue c1,c0,x1 #endif bl __ab_approximation_31_256 eor x0, x1, #256 // pointer to dst |a|b|u|v| #ifdef __CHERI_PURE_CAPABILITY__ scvalue c0,c1,x0 #endif bl __smul_256_n_shift_by_31 mov x16, x12 // corrected |f0| mov x17, x13 // corrected |g0| mov x12, x14 // |f1| mov x13, x15 // |g1| add x0,x0,#8*4 bl __smul_256_n_shift_by_31 add x0,x0,#8*4 bl __smul_256x63 mov x16, x12 // corrected |f1| mov x17, x13 // corrected |g1| add x0,x0,#8*6 bl __smul_256x63 eor x1, x1, #256 // flip-flop src |a|b|u|v| #ifdef __CHERI_PURE_CAPABILITY__ scvalue c1,c0,x1 #endif bl __ab_approximation_31_256 eor x0, x1, #256 // pointer to dst |a|b|u|v| #ifdef __CHERI_PURE_CAPABILITY__ scvalue c0,c1,x0 #endif bl __smul_256_n_shift_by_31 mov x16, x12 // corrected |f0| mov x17, x13 // corrected |g0| mov x12, x14 // |f1| mov x13, x15 // |g1| add x0,x0,#8*4 bl __smul_256_n_shift_by_31 add x0,x0,#8*4 bl __smul_256x63 asr x24, x24, #63 str x24, [x0,#8*4] mov x16, x12 // corrected |f1| mov x17, x13 // corrected |g1| add x0,x0,#8*6 bl __smul_256x63 asr x24, x24, #63 // sign extension stp x24, x24, [x0,#8*4] stp x24, x24, [x0,#8*6] eor x1, x1, #256 // flip-flop src |a|b|u|v| #ifdef __CHERI_PURE_CAPABILITY__ scvalue c1,c0,x1 #endif bl __ab_approximation_31_256 eor x0, x1, #256 // pointer to dst |a|b|u|v| #ifdef __CHERI_PURE_CAPABILITY__ scvalue c0,c1,x0 #endif bl __smul_256_n_shift_by_31 mov x16, x12 // corrected |f0| mov x17, x13 // corrected |g0| mov x12, x14 // |f1| mov x13, x15 // |g1| add x0,x0,#8*4 bl __smul_256_n_shift_by_31 add x0,x0,#8*4 bl __smul_256x63 adc x22, x22, x23 str x22, [x0,#8*4] mov x16, x12 // corrected |f1| mov x17, x13 // corrected |g1| add x0,x0,#8*6 bl __smul_256x63 bl __smul_512x63_tail eor x1, x1, #256 // flip-flop src |a|b|u|v| #ifdef __CHERI_PURE_CAPABILITY__ scvalue c1,c0,x1 #endif bl __ab_approximation_31_256 eor x0, x1, #256 // pointer to dst |a|b|u|v| #ifdef __CHERI_PURE_CAPABILITY__ scvalue c0,c1,x0 #endif bl __smul_256_n_shift_by_31 mov x16, x12 // corrected |f0| mov x17, x13 // corrected |g0| mov x12, x14 // |f1| mov x13, x15 // |g1| add x0,x0,#8*4 bl __smul_256_n_shift_by_31 add x0,x0,#8*4 bl __smul_256x63 adc x22, x22, x23 str x22, [x0,#8*4] mov x16, x12 // corrected |f1| mov x17, x13 // corrected |g1| add x0,x0,#8*6 bl __smul_256x63 bl __smul_512x63_tail eor x1, x1, #256 // flip-flop src |a|b|u|v| #ifdef __CHERI_PURE_CAPABILITY__ scvalue c1,c0,x1 #endif bl __ab_approximation_31_256 eor x0, x1, #256 // pointer to dst |a|b|u|v| #ifdef __CHERI_PURE_CAPABILITY__ scvalue c0,c1,x0 #endif bl __smul_256_n_shift_by_31 mov x16, x12 // corrected |f0| mov x17, x13 // corrected |g0| mov x12, x14 // |f1| mov x13, x15 // |g1| add x0,x0,#8*4 bl __smul_256_n_shift_by_31 add x0,x0,#8*4 bl __smul_256x63 adc x22, x22, x23 str x22, [x0,#8*4] mov x16, x12 // corrected |f1| mov x17, x13 // corrected |g1| add x0,x0,#8*6 bl __smul_256x63 bl __smul_512x63_tail eor x1, x1, #256 // flip-flop src |a|b|u|v| #ifdef __CHERI_PURE_CAPABILITY__ scvalue c1,c0,x1 #endif bl __ab_approximation_31_256 eor x0, x1, #256 // pointer to dst |a|b|u|v| #ifdef __CHERI_PURE_CAPABILITY__ scvalue c0,c1,x0 #endif bl __smul_256_n_shift_by_31 mov x16, x12 // corrected |f0| mov x17, x13 // corrected |g0| mov x12, x14 // |f1| mov x13, x15 // |g1| add x0,x0,#8*4 bl __smul_256_n_shift_by_31 add x0,x0,#8*4 bl __smul_256x63 adc x22, x22, x23 str x22, [x0,#8*4] mov x16, x12 // corrected |f1| mov x17, x13 // corrected |g1| add x0,x0,#8*6 bl __smul_256x63 bl 
__smul_512x63_tail eor x1, x1, #256 // flip-flop src |a|b|u|v| #ifdef __CHERI_PURE_CAPABILITY__ scvalue c1,c0,x1 #endif bl __ab_approximation_31_256 eor x0, x1, #256 // pointer to dst |a|b|u|v| #ifdef __CHERI_PURE_CAPABILITY__ scvalue c0,c1,x0 #endif bl __smul_256_n_shift_by_31 mov x16, x12 // corrected |f0| mov x17, x13 // corrected |g0| mov x12, x14 // |f1| mov x13, x15 // |g1| add x0,x0,#8*4 bl __smul_256_n_shift_by_31 add x0,x0,#8*4 bl __smul_256x63 adc x22, x22, x23 str x22, [x0,#8*4] mov x16, x12 // corrected |f1| mov x17, x13 // corrected |g1| add x0,x0,#8*6 bl __smul_256x63 bl __smul_512x63_tail eor x1, x1, #256 // flip-flop src |a|b|u|v| #ifdef __CHERI_PURE_CAPABILITY__ scvalue c1,c0,x1 #endif bl __ab_approximation_31_256 eor x0, x1, #256 // pointer to dst |a|b|u|v| #ifdef __CHERI_PURE_CAPABILITY__ scvalue c0,c1,x0 #endif bl __smul_256_n_shift_by_31 mov x16, x12 // corrected |f0| mov x17, x13 // corrected |g0| mov x12, x14 // |f1| mov x13, x15 // |g1| add x0,x0,#8*4 bl __smul_256_n_shift_by_31 add x0,x0,#8*4 bl __smul_256x63 adc x22, x22, x23 str x22, [x0,#8*4] mov x16, x12 // corrected |f1| mov x17, x13 // corrected |g1| add x0,x0,#8*6 bl __smul_256x63 bl __smul_512x63_tail eor x1, x1, #256 // flip-flop src |a|b|u|v| #ifdef __CHERI_PURE_CAPABILITY__ scvalue c1,c0,x1 #endif bl __ab_approximation_31_256 eor x0, x1, #256 // pointer to dst |a|b|u|v| #ifdef __CHERI_PURE_CAPABILITY__ scvalue c0,c1,x0 #endif bl __smul_256_n_shift_by_31 mov x16, x12 // corrected |f0| mov x17, x13 // corrected |g0| mov x12, x14 // |f1| mov x13, x15 // |g1| add x0,x0,#8*4 bl __smul_256_n_shift_by_31 add x0,x0,#8*4 bl __smul_256x63 adc x22, x22, x23 str x22, [x0,#8*4] mov x16, x12 // corrected |f1| mov x17, x13 // corrected |g1| add x0,x0,#8*6 bl __smul_256x63 bl __smul_512x63_tail ////////////////////////////////////////// two[!] 
last iterations eor x1, x1, #256 // flip-flop src |a|b|u|v| #ifdef __CHERI_PURE_CAPABILITY__ scvalue c1,c0,x1 #endif mov x2, #47 // 31 + 512 % 31 //bl __ab_approximation_62_256 // |a| and |b| are exact, ldr x7, [x1,#8*0] // just load ldr x11, [x1,#8*4] bl __inner_loop_62_256 mov x16, x14 mov x17, x15 ldr x0, [sp] // original out_ptr bl __smul_256x63 bl __smul_512x63_tail ldr x30, [x29,#__SIZEOF_POINTER__] smulh x20, x7, x17 // figure out top-most limb ldp x8, x9, [x3,#8*0] adc x23, x23, x25 ldp x10, x11, [x3,#8*2] add x20, x20, x23 // x20 is 1, 0 or -1 asr x19, x20, #63 // sign as mask and x23, x8, x19 // add mod<<256 conditionally and x24, x9, x19 adds x4, x4, x23 and x25, x10, x19 adcs x5, x5, x24 and x26, x11, x19 adcs x6, x6, x25 adcs x7, x22, x26 adc x20, x20, xzr // x20 is 1, 0 or -1 neg x19, x20 orr x20, x20, x19 // excess bit or sign as mask asr x19, x19, #63 // excess bit as mask and x8, x8, x20 // mask |mod| and x9, x9, x20 and x10, x10, x20 and x11, x11, x20 eor x8, x8, x19 // conditionally negate |mod| eor x9, x9, x19 adds x8, x8, x19, lsr#63 eor x10, x10, x19 adcs x9, x9, xzr eor x11, x11, x19 adcs x10, x10, xzr adc x11, x11, xzr adds x4, x4, x8 // final adjustment for |mod|<<256 adcs x5, x5, x9 adcs x6, x6, x10 stp x4, x5, [x0,#8*4] adc x7, x7, x11 stp x6, x7, [x0,#8*6] add sp, sp, #1040 ldp x19, x20, [x29,#2*__SIZEOF_POINTER__] ldp x21, x22, [x29,#4*__SIZEOF_POINTER__] ldp x23, x24, [x29,#6*__SIZEOF_POINTER__] ldp x25, x26, [x29,#8*__SIZEOF_POINTER__] ldr x29, [sp],#10*__SIZEOF_POINTER__ hint #29 ret //////////////////////////////////////////////////////////////////////// .align 5 __smul_256x63: ldp x4, x5, [x1,#8*0+64] // load |u| (or |v|) asr x14, x16, #63 // |f_|'s sign as mask (or |g_|'s) ldp x6, x7, [x1,#8*2+64] eor x16, x16, x14 // conditionally negate |f_| (or |g_|) ldr x22, [x1,#8*4+64] eor x4, x4, x14 // conditionally negate |u| (or |v|) sub x16, x16, x14 eor x5, x5, x14 adds x4, x4, x14, lsr#63 eor x6, x6, x14 adcs x5, x5, xzr eor x7, x7, x14 adcs x6, x6, xzr eor x22, x22, x14 umulh x19, x4, x16 adcs x7, x7, xzr umulh x20, x5, x16 adcs x22, x22, xzr umulh x21, x6, x16 mul x4, x4, x16 cmp x16, #0 mul x5, x5, x16 csel x22, x22, xzr, ne mul x6, x6, x16 adds x5, x5, x19 mul x24, x7, x16 adcs x6, x6, x20 adcs x24, x24, x21 adc x26, xzr, xzr ldp x8, x9, [x1,#8*0+112] // load |u| (or |v|) asr x14, x17, #63 // |f_|'s sign as mask (or |g_|'s) ldp x10, x11, [x1,#8*2+112] eor x17, x17, x14 // conditionally negate |f_| (or |g_|) ldr x23, [x1,#8*4+112] eor x8, x8, x14 // conditionally negate |u| (or |v|) sub x17, x17, x14 eor x9, x9, x14 adds x8, x8, x14, lsr#63 eor x10, x10, x14 adcs x9, x9, xzr eor x11, x11, x14 adcs x10, x10, xzr eor x23, x23, x14 umulh x19, x8, x17 adcs x11, x11, xzr umulh x20, x9, x17 adcs x23, x23, xzr umulh x21, x10, x17 adc x15, xzr, xzr // used in __smul_512x63_tail mul x8, x8, x17 cmp x17, #0 mul x9, x9, x17 csel x23, x23, xzr, ne mul x10, x10, x17 adds x9, x9, x19 mul x25, x11, x17 adcs x10, x10, x20 adcs x25, x25, x21 adc x26, x26, xzr adds x4, x4, x8 adcs x5, x5, x9 adcs x6, x6, x10 stp x4, x5, [x0,#8*0] adcs x24, x24, x25 stp x6, x24, [x0,#8*2] ret .align 5 __smul_512x63_tail: umulh x24, x7, x16 ldr x5, [x1,#8*19] // load rest of |v| adc x26, x26, xzr ldp x6, x7, [x1,#8*20] and x22, x22, x16 umulh x11, x11, x17 // resume |v|*|g1| chain sub x24, x24, x22 // tie up |u|*|f1| chain asr x25, x24, #63 eor x5, x5, x14 // conditionally negate rest of |v| eor x6, x6, x14 adds x5, x5, x15 eor x7, x7, x14 adcs x6, x6, xzr umulh x19, x23, x17 adc x7, x7, xzr 
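// __smul_256x63 and this tail multiply the signed factors by |u|/|v|
// using only unsigned multiplies: the factor's sign (asr #63) becomes a
// mask, both factor and operand are conditionally negated (eor plus
// add-of-borrow), and umulh/mul then accumulate the magnitudes.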
umulh x20, x5, x17 add x11, x11, x26 umulh x21, x6, x17 mul x4, x23, x17 mul x5, x5, x17 adds x4, x4, x11 mul x6, x6, x17 adcs x5, x5, x19 mul x22, x7, x17 adcs x6, x6, x20 adcs x22, x22, x21 adc x23, xzr, xzr // used in the final step adds x4, x4, x24 adcs x5, x5, x25 adcs x6, x6, x25 stp x4, x5, [x0,#8*4] adcs x22, x22, x25 // carry is used in the final step stp x6, x22, [x0,#8*6] ret .align 5 __smul_256_n_shift_by_31: ldp x4, x5, [x1,#8*0+0] // load |a| (or |b|) asr x24, x12, #63 // |f0|'s sign as mask (or |g0|'s) ldp x6, x7, [x1,#8*2+0] eor x25, x12, x24 // conditionally negate |f0| (or |g0|) eor x4, x4, x24 // conditionally negate |a| (or |b|) sub x25, x25, x24 eor x5, x5, x24 adds x4, x4, x24, lsr#63 eor x6, x6, x24 adcs x5, x5, xzr eor x7, x7, x24 umulh x19, x4, x25 adcs x6, x6, xzr umulh x20, x5, x25 adc x7, x7, xzr umulh x21, x6, x25 and x24, x24, x25 umulh x22, x7, x25 neg x24, x24 mul x4, x4, x25 mul x5, x5, x25 mul x6, x6, x25 adds x5, x5, x19 mul x7, x7, x25 adcs x6, x6, x20 adcs x7, x7, x21 adc x22, x22, x24 ldp x8, x9, [x1,#8*0+32] // load |a| (or |b|) asr x24, x13, #63 // |f0|'s sign as mask (or |g0|'s) ldp x10, x11, [x1,#8*2+32] eor x25, x13, x24 // conditionally negate |f0| (or |g0|) eor x8, x8, x24 // conditionally negate |a| (or |b|) sub x25, x25, x24 eor x9, x9, x24 adds x8, x8, x24, lsr#63 eor x10, x10, x24 adcs x9, x9, xzr eor x11, x11, x24 umulh x19, x8, x25 adcs x10, x10, xzr umulh x20, x9, x25 adc x11, x11, xzr umulh x21, x10, x25 and x24, x24, x25 umulh x23, x11, x25 neg x24, x24 mul x8, x8, x25 mul x9, x9, x25 mul x10, x10, x25 adds x9, x9, x19 mul x11, x11, x25 adcs x10, x10, x20 adcs x11, x11, x21 adc x23, x23, x24 adds x4, x4, x8 adcs x5, x5, x9 adcs x6, x6, x10 adcs x7, x7, x11 adc x8, x22, x23 extr x4, x5, x4, #31 extr x5, x6, x5, #31 extr x6, x7, x6, #31 asr x23, x8, #63 // result's sign as mask extr x7, x8, x7, #31 eor x4, x4, x23 // ensure the result is positive eor x5, x5, x23 adds x4, x4, x23, lsr#63 eor x6, x6, x23 adcs x5, x5, xzr eor x7, x7, x23 adcs x6, x6, xzr stp x4, x5, [x0,#8*0] adc x7, x7, xzr stp x6, x7, [x0,#8*2] eor x12, x12, x23 // adjust |f/g| accordingly eor x13, x13, x23 sub x12, x12, x23 sub x13, x13, x23 ret .align 4 __ab_approximation_31_256: ldp x6, x7, [x1,#8*2] ldp x10, x11, [x1,#8*6] ldp x4, x5, [x1,#8*0] ldp x8, x9, [x1,#8*4] Lab_approximation_31_256_loaded: orr x19, x7, x11 // check top-most limbs, ... cmp x19, #0 csel x7, x7, x6, ne csel x11, x11, x10, ne csel x6, x6, x5, ne orr x19, x7, x11 // and ones before top-most, ... csel x10, x10, x9, ne cmp x19, #0 csel x7, x7, x6, ne csel x11, x11, x10, ne csel x6, x6, x4, ne orr x19, x7, x11 // and one more, ... 
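// ... the csel cascade keeps narrowing down to the top-most non-zero
// limb pair; the 64-bit approximation is then its top 33 bits (aligned
// left with clz/lslv) concatenated with the exact low 31 bits of |a| and
// |b| (bfxil below). The low bits being exact keeps the 31 parity-driven
// inner steps consistent; any sign slip is repaired by the
// "adjust |f/g|" fix-up in __smul_256_n_shift_by_31.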
csel x10, x10, x8, ne clz x19, x19 cmp x19, #64 csel x19, x19, xzr, ne csel x7, x7, x6, ne csel x11, x11, x10, ne neg x20, x19 lslv x7, x7, x19 // align high limbs to the left lslv x11, x11, x19 lsrv x6, x6, x20 lsrv x10, x10, x20 and x6, x6, x20, asr#6 and x10, x10, x20, asr#6 orr x7, x7, x6 orr x11, x11, x10 bfxil x7, x4, #0, #31 bfxil x11, x8, #0, #31 b __inner_loop_31_256 ret .align 4 __inner_loop_31_256: mov x2, #31 mov x13, #0x7FFFFFFF80000000 // |f0|=1, |g0|=0 mov x15, #0x800000007FFFFFFF // |f1|=0, |g1|=1 mov x23,#0x7FFFFFFF7FFFFFFF Loop_31_256: sbfx x22, x7, #0, #1 // if |a_| is odd, then we'll be subtracting sub x2, x2, #1 and x19, x11, x22 sub x20, x11, x7 // |b_|-|a_| subs x21, x7, x19 // |a_|-|b_| (or |a_|-0 if |a_| was even) mov x19, x15 csel x11, x11, x7, hs // |b_| = |a_| csel x7, x21, x20, hs // borrow means |a_|<|b_|, replace with |b_|-|a_| csel x15, x15, x13, hs // exchange |fg0| and |fg1| csel x13, x13, x19, hs lsr x7, x7, #1 and x19, x15, x22 and x20, x23, x22 sub x13, x13, x19 // |f0|-=|f1| (or |f0-=0| if |a_| was even) add x15, x15, x15 // |f1|<<=1 add x13, x13, x20 sub x15, x15, x23 cbnz x2, Loop_31_256 mov x23, #0x7FFFFFFF ubfx x12, x13, #0, #32 ubfx x13, x13, #32, #32 ubfx x14, x15, #0, #32 ubfx x15, x15, #32, #32 sub x12, x12, x23 // remove bias sub x13, x13, x23 sub x14, x14, x23 sub x15, x15, x23 ret .align 4 __inner_loop_62_256: mov x12, #1 // |f0|=1 mov x13, #0 // |g0|=0 mov x14, #0 // |f1|=0 mov x15, #1 // |g1|=1 Loop_62_256: sbfx x22, x7, #0, #1 // if |a_| is odd, then we'll be subtracting sub x2, x2, #1 and x19, x11, x22 sub x20, x11, x7 // |b_|-|a_| subs x21, x7, x19 // |a_|-|b_| (or |a_|-0 if |a_| was even) mov x19, x12 csel x11, x11, x7, hs // |b_| = |a_| csel x7, x21, x20, hs // borrow means |a_|<|b_|, replace with |b_|-|a_| mov x20, x13 csel x12, x12, x14, hs // exchange |f0| and |f1| csel x14, x14, x19, hs csel x13, x13, x15, hs // exchange |g0| and |g1| csel x15, x15, x20, hs lsr x7, x7, #1 and x19, x14, x22 and x20, x15, x22 add x14, x14, x14 // |f1|<<=1 add x15, x15, x15 // |g1|<<=1 sub x12, x12, x19 // |f0|-=|f1| (or |f0-=0| if |a_| was even) sub x13, x13, x20 // |g0|-=|g1| (or |g0-=0| ...) 
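// One branchless binary-GCD step: when |a_| is odd, replace it by
// ||a_|-|b_|| (swapping the (f,g) pairs when |a_|<|b_|), then halve it;
// |f1|,|g1| double every step, so after k steps (f0,g0,f1,g1) form the
// 2^k-scaled transition matrix taking the original (a,b) to the current
// pair.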
cbnz x2, Loop_62_256 ret ================================================ FILE: build/mach-o/ct_inverse_mod_256-x86_64.s ================================================ .text .globl _ct_inverse_mod_256 .private_extern _ct_inverse_mod_256 .p2align 5 _ct_inverse_mod_256: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 pushq %rbx .cfi_adjust_cfa_offset 8 .cfi_offset %rbx,-24 pushq %r12 .cfi_adjust_cfa_offset 8 .cfi_offset %r12,-32 pushq %r13 .cfi_adjust_cfa_offset 8 .cfi_offset %r13,-40 pushq %r14 .cfi_adjust_cfa_offset 8 .cfi_offset %r14,-48 pushq %r15 .cfi_adjust_cfa_offset 8 .cfi_offset %r15,-56 subq $1072,%rsp .cfi_adjust_cfa_offset 1072 leaq 48+511(%rsp),%rax andq $-512,%rax movq %rdi,32(%rsp) movq %rcx,40(%rsp) #ifdef __SGX_LVI_HARDENING__ lfence #endif movq 0(%rsi),%r8 movq 8(%rsi),%r9 movq 16(%rsi),%r10 movq 24(%rsi),%r11 movq 0(%rdx),%r12 movq 8(%rdx),%r13 movq 16(%rdx),%r14 movq 24(%rdx),%r15 movq %r8,0(%rax) movq %r9,8(%rax) movq %r10,16(%rax) movq %r11,24(%rax) movq %r12,32(%rax) movq %r13,40(%rax) movq %r14,48(%rax) movq %r15,56(%rax) movq %rax,%rsi movl $31,%edx call __ab_approximation_31_256 movq %r12,16(%rsp) movq %r13,24(%rsp) movq $256,%rdi xorq %rsi,%rdi call __smulq_256_n_shift_by_31 movq %rdx,64(%rdi) movq 16(%rsp),%rdx movq 24(%rsp),%rcx leaq 32(%rdi),%rdi call __smulq_256_n_shift_by_31 movq %rdx,72(%rdi) xorq $256,%rsi movl $31,%edx call __ab_approximation_31_256 movq %r12,16(%rsp) movq %r13,24(%rsp) movq $256,%rdi xorq %rsi,%rdi call __smulq_256_n_shift_by_31 movq %rdx,0(%rsp) movq %rcx,8(%rsp) movq 16(%rsp),%rdx movq 24(%rsp),%rcx leaq 32(%rdi),%rdi call __smulq_256_n_shift_by_31 movq 64(%rsi),%r8 movq 104(%rsi),%r12 movq %r8,%r9 imulq 0(%rsp),%r8 movq %r12,%r13 imulq 8(%rsp),%r12 addq %r12,%r8 movq %r8,32(%rdi) sarq $63,%r8 movq %r8,40(%rdi) movq %r8,48(%rdi) movq %r8,56(%rdi) movq %r8,64(%rdi) leaq 64(%rsi),%rsi imulq %rdx,%r9 imulq %rcx,%r13 addq %r13,%r9 movq %r9,72(%rdi) sarq $63,%r9 movq %r9,80(%rdi) movq %r9,88(%rdi) movq %r9,96(%rdi) movq %r9,104(%rdi) xorq $256+64,%rsi movl $31,%edx call __ab_approximation_31_256 movq %r12,16(%rsp) movq %r13,24(%rsp) movq $256,%rdi xorq %rsi,%rdi call __smulq_256_n_shift_by_31 movq %rdx,0(%rsp) movq %rcx,8(%rsp) movq 16(%rsp),%rdx movq 24(%rsp),%rcx leaq 32(%rdi),%rdi call __smulq_256_n_shift_by_31 movq %rdx,16(%rsp) movq %rcx,24(%rsp) movq 0(%rsp),%rdx movq 8(%rsp),%rcx leaq 64(%rsi),%rsi leaq 32(%rdi),%rdi call __smulq_256x63 movq 16(%rsp),%rdx movq 24(%rsp),%rcx leaq 40(%rdi),%rdi call __smulq_256x63 xorq $256+64,%rsi movl $31,%edx call __ab_approximation_31_256 movq %r12,16(%rsp) movq %r13,24(%rsp) movq $256,%rdi xorq %rsi,%rdi call __smulq_256_n_shift_by_31 movq %rdx,0(%rsp) movq %rcx,8(%rsp) movq 16(%rsp),%rdx movq 24(%rsp),%rcx leaq 32(%rdi),%rdi call __smulq_256_n_shift_by_31 movq %rdx,16(%rsp) movq %rcx,24(%rsp) movq 0(%rsp),%rdx movq 8(%rsp),%rcx leaq 64(%rsi),%rsi leaq 32(%rdi),%rdi call __smulq_256x63 movq 16(%rsp),%rdx movq 24(%rsp),%rcx leaq 40(%rdi),%rdi call __smulq_256x63 xorq $256+64,%rsi movl $31,%edx call __ab_approximation_31_256 movq %r12,16(%rsp) movq %r13,24(%rsp) movq $256,%rdi xorq %rsi,%rdi call __smulq_256_n_shift_by_31 movq %rdx,0(%rsp) movq %rcx,8(%rsp) movq 16(%rsp),%rdx movq 24(%rsp),%rcx leaq 32(%rdi),%rdi call __smulq_256_n_shift_by_31 movq %rdx,16(%rsp) movq %rcx,24(%rsp) movq 0(%rsp),%rdx movq 8(%rsp),%rcx leaq 64(%rsi),%rsi leaq 32(%rdi),%rdi call __smulq_256x63 movq 16(%rsp),%rdx movq 24(%rsp),%rcx leaq 40(%rdi),%rdi call __smulq_256x63 
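/*
 * Shape of each outer iteration below: xorq $256+64 flips between the
 * two scratch half-frames, __ab_approximation_31_256 yields the 31-step
 * factors (f0,g0 in %rdx,%rcx; f1,g1 in %r12,%r13), two
 * __smulq_256_n_shift_by_31 calls update |a|,|b| (divided by 2^31), and
 * two further calls update |u|,|v|.
 */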
xorq $256+64,%rsi movl $31,%edx call __ab_approximation_31_256 movq %r12,16(%rsp) movq %r13,24(%rsp) movq $256,%rdi xorq %rsi,%rdi call __smulq_256_n_shift_by_31 movq %rdx,0(%rsp) movq %rcx,8(%rsp) movq 16(%rsp),%rdx movq 24(%rsp),%rcx leaq 32(%rdi),%rdi call __smulq_256_n_shift_by_31 movq %rdx,16(%rsp) movq %rcx,24(%rsp) movq 0(%rsp),%rdx movq 8(%rsp),%rcx leaq 64(%rsi),%rsi leaq 32(%rdi),%rdi call __smulq_256x63 movq 16(%rsp),%rdx movq 24(%rsp),%rcx leaq 40(%rdi),%rdi call __smulq_256x63 xorq $256+64,%rsi movl $31,%edx call __ab_approximation_31_256 movq %r12,16(%rsp) movq %r13,24(%rsp) movq $256,%rdi xorq %rsi,%rdi call __smulq_256_n_shift_by_31 movq %rdx,0(%rsp) movq %rcx,8(%rsp) movq 16(%rsp),%rdx movq 24(%rsp),%rcx leaq 32(%rdi),%rdi call __smulq_256_n_shift_by_31 movq %rdx,16(%rsp) movq %rcx,24(%rsp) movq 0(%rsp),%rdx movq 8(%rsp),%rcx leaq 64(%rsi),%rsi leaq 32(%rdi),%rdi call __smulq_256x63 movq 16(%rsp),%rdx movq 24(%rsp),%rcx leaq 40(%rdi),%rdi call __smulq_256x63 xorq $256+64,%rsi movl $31,%edx call __ab_approximation_31_256 movq %r12,16(%rsp) movq %r13,24(%rsp) movq $256,%rdi xorq %rsi,%rdi call __smulq_256_n_shift_by_31 movq %rdx,0(%rsp) movq %rcx,8(%rsp) movq 16(%rsp),%rdx movq 24(%rsp),%rcx leaq 32(%rdi),%rdi call __smulq_256_n_shift_by_31 movq %rdx,16(%rsp) movq %rcx,24(%rsp) movq 0(%rsp),%rdx movq 8(%rsp),%rcx leaq 64(%rsi),%rsi leaq 32(%rdi),%rdi call __smulq_256x63 movq 16(%rsp),%rdx movq 24(%rsp),%rcx leaq 40(%rdi),%rdi call __smulq_256x63 xorq $256+64,%rsi movl $31,%edx call __ab_approximation_31_256 movq %r12,16(%rsp) movq %r13,24(%rsp) movq $256,%rdi xorq %rsi,%rdi call __smulq_256_n_shift_by_31 movq %rdx,0(%rsp) movq %rcx,8(%rsp) movq 16(%rsp),%rdx movq 24(%rsp),%rcx leaq 32(%rdi),%rdi call __smulq_256_n_shift_by_31 movq %rdx,16(%rsp) movq %rcx,24(%rsp) movq 0(%rsp),%rdx movq 8(%rsp),%rcx leaq 64(%rsi),%rsi leaq 32(%rdi),%rdi call __smulq_256x63 movq 16(%rsp),%rdx movq 24(%rsp),%rcx leaq 40(%rdi),%rdi call __smulq_256x63 sarq $63,%rbp movq %rbp,40(%rdi) movq %rbp,48(%rdi) movq %rbp,56(%rdi) xorq $256+64,%rsi movl $31,%edx call __ab_approximation_31_256 movq %r12,16(%rsp) movq %r13,24(%rsp) movq $256,%rdi xorq %rsi,%rdi call __smulq_256_n_shift_by_31 movq %rdx,0(%rsp) movq %rcx,8(%rsp) movq 16(%rsp),%rdx movq 24(%rsp),%rcx leaq 32(%rdi),%rdi call __smulq_256_n_shift_by_31 movq %rdx,16(%rsp) movq %rcx,24(%rsp) movq 0(%rsp),%rdx movq 8(%rsp),%rcx leaq 64(%rsi),%rsi leaq 32(%rdi),%rdi call __smulq_256x63 movq 16(%rsp),%rdx movq 24(%rsp),%rcx leaq 40(%rdi),%rdi call __smulq_512x63 xorq $256+64,%rsi movl $31,%edx call __ab_approximation_31_256 movq %r12,16(%rsp) movq %r13,24(%rsp) movq $256,%rdi xorq %rsi,%rdi call __smulq_256_n_shift_by_31 movq %rdx,0(%rsp) movq %rcx,8(%rsp) movq 16(%rsp),%rdx movq 24(%rsp),%rcx leaq 32(%rdi),%rdi call __smulq_256_n_shift_by_31 movq %rdx,16(%rsp) movq %rcx,24(%rsp) movq 0(%rsp),%rdx movq 8(%rsp),%rcx leaq 64(%rsi),%rsi leaq 32(%rdi),%rdi call __smulq_256x63 movq 16(%rsp),%rdx movq 24(%rsp),%rcx leaq 40(%rdi),%rdi call __smulq_512x63 xorq $256+64,%rsi movl $31,%edx call __ab_approximation_31_256 movq %r12,16(%rsp) movq %r13,24(%rsp) movq $256,%rdi xorq %rsi,%rdi call __smulq_256_n_shift_by_31 movq %rdx,0(%rsp) movq %rcx,8(%rsp) movq 16(%rsp),%rdx movq 24(%rsp),%rcx leaq 32(%rdi),%rdi call __smulq_256_n_shift_by_31 movq %rdx,16(%rsp) movq %rcx,24(%rsp) movq 0(%rsp),%rdx movq 8(%rsp),%rcx leaq 64(%rsi),%rsi leaq 32(%rdi),%rdi call __smulq_256x63 movq 16(%rsp),%rdx movq 24(%rsp),%rcx leaq 40(%rdi),%rdi call __smulq_512x63 xorq $256+64,%rsi 
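/*
 * In these later iterations |u|,|v| have outgrown 256 bits, so the
 * second update of each iteration goes through __smulq_512x63 rather
 * than __smulq_256x63, folding in the sign-extension limb as well.
 */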
movl $31,%edx call __ab_approximation_31_256 movq %r12,16(%rsp) movq %r13,24(%rsp) movq $256,%rdi xorq %rsi,%rdi call __smulq_256_n_shift_by_31 movq %rdx,0(%rsp) movq %rcx,8(%rsp) movq 16(%rsp),%rdx movq 24(%rsp),%rcx leaq 32(%rdi),%rdi call __smulq_256_n_shift_by_31 movq %rdx,16(%rsp) movq %rcx,24(%rsp) movq 0(%rsp),%rdx movq 8(%rsp),%rcx leaq 64(%rsi),%rsi leaq 32(%rdi),%rdi call __smulq_256x63 movq 16(%rsp),%rdx movq 24(%rsp),%rcx leaq 40(%rdi),%rdi call __smulq_512x63 xorq $256+64,%rsi movl $31,%edx call __ab_approximation_31_256 movq %r12,16(%rsp) movq %r13,24(%rsp) movq $256,%rdi xorq %rsi,%rdi call __smulq_256_n_shift_by_31 movq %rdx,0(%rsp) movq %rcx,8(%rsp) movq 16(%rsp),%rdx movq 24(%rsp),%rcx leaq 32(%rdi),%rdi call __smulq_256_n_shift_by_31 movq %rdx,16(%rsp) movq %rcx,24(%rsp) movq 0(%rsp),%rdx movq 8(%rsp),%rcx leaq 64(%rsi),%rsi leaq 32(%rdi),%rdi call __smulq_256x63 movq 16(%rsp),%rdx movq 24(%rsp),%rcx leaq 40(%rdi),%rdi call __smulq_512x63 xorq $256+64,%rsi movl $31,%edx call __ab_approximation_31_256 movq %r12,16(%rsp) movq %r13,24(%rsp) movq $256,%rdi xorq %rsi,%rdi call __smulq_256_n_shift_by_31 movq %rdx,0(%rsp) movq %rcx,8(%rsp) movq 16(%rsp),%rdx movq 24(%rsp),%rcx leaq 32(%rdi),%rdi call __smulq_256_n_shift_by_31 movq %rdx,16(%rsp) movq %rcx,24(%rsp) movq 0(%rsp),%rdx movq 8(%rsp),%rcx leaq 64(%rsi),%rsi leaq 32(%rdi),%rdi call __smulq_256x63 movq 16(%rsp),%rdx movq 24(%rsp),%rcx leaq 40(%rdi),%rdi call __smulq_512x63 xorq $256+64,%rsi movl $47,%edx movq 0(%rsi),%r8 movq 32(%rsi),%r10 call __inner_loop_62_256 leaq 64(%rsi),%rsi movq %r12,%rdx movq %r13,%rcx movq 32(%rsp),%rdi call __smulq_512x63 adcq %rbp,%rdx movq 40(%rsp),%rsi movq %rdx,%rax sarq $63,%rdx movq %rdx,%r8 movq %rdx,%r9 #ifdef __SGX_LVI_HARDENING__ lfence #endif andq 0(%rsi),%r8 movq %rdx,%r10 andq 8(%rsi),%r9 andq 16(%rsi),%r10 andq 24(%rsi),%rdx addq %r8,%r12 adcq %r9,%r13 adcq %r10,%r14 adcq %rdx,%r15 adcq $0,%rax movq %rax,%rdx negq %rax orq %rax,%rdx sarq $63,%rax movq %rdx,%r8 movq %rdx,%r9 andq 0(%rsi),%r8 movq %rdx,%r10 andq 8(%rsi),%r9 andq 16(%rsi),%r10 andq 24(%rsi),%rdx xorq %rax,%r8 xorq %rcx,%rcx xorq %rax,%r9 subq %rax,%rcx xorq %rax,%r10 xorq %rax,%rdx addq %rcx,%r8 adcq $0,%r9 adcq $0,%r10 adcq $0,%rdx addq %r8,%r12 adcq %r9,%r13 adcq %r10,%r14 adcq %rdx,%r15 movq %r12,32(%rdi) movq %r13,40(%rdi) movq %r14,48(%rdi) movq %r15,56(%rdi) leaq 1072(%rsp),%r8 movq 0(%r8),%r15 .cfi_restore %r15 movq 8(%r8),%r14 .cfi_restore %r14 movq 16(%r8),%r13 .cfi_restore %r13 movq 24(%r8),%r12 .cfi_restore %r12 movq 32(%r8),%rbx .cfi_restore %rbx movq 40(%r8),%rbp .cfi_restore %rbp leaq 48(%r8),%rsp .cfi_adjust_cfa_offset -1072-8*6 #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .p2align 5 __smulq_512x63: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa movq 0(%rsi),%r8 movq 8(%rsi),%r9 movq 16(%rsi),%r10 movq 24(%rsi),%r11 movq 32(%rsi),%rbp movq %rdx,%rbx sarq $63,%rdx xorq %rax,%rax subq %rdx,%rax xorq %rdx,%rbx addq %rax,%rbx xorq %rdx,%r8 xorq %rdx,%r9 xorq %rdx,%r10 xorq %rdx,%r11 xorq %rdx,%rbp addq %r8,%rax adcq $0,%r9 adcq $0,%r10 adcq $0,%r11 adcq $0,%rbp mulq %rbx movq %rax,0(%rdi) movq %r9,%rax movq %rdx,%r9 mulq %rbx addq %rax,%r9 movq %r10,%rax adcq $0,%rdx movq %r9,8(%rdi) movq %rdx,%r10 mulq %rbx addq %rax,%r10 movq %r11,%rax adcq $0,%rdx movq %r10,16(%rdi) movq %rdx,%r11 andq %rbx,%rbp negq %rbp mulq %rbx addq %rax,%r11 adcq %rdx,%rbp movq %r11,24(%rdi) movq 40(%rsi),%r8 movq 48(%rsi),%r9 movq 56(%rsi),%r10 movq 64(%rsi),%r11 movq 
72(%rsi),%r12 movq 80(%rsi),%r13 movq 88(%rsi),%r14 movq 96(%rsi),%r15 movq %rcx,%rdx sarq $63,%rdx xorq %rax,%rax subq %rdx,%rax xorq %rdx,%rcx addq %rax,%rcx xorq %rdx,%r8 xorq %rdx,%r9 xorq %rdx,%r10 xorq %rdx,%r11 xorq %rdx,%r12 xorq %rdx,%r13 xorq %rdx,%r14 xorq %rdx,%r15 addq %r8,%rax adcq $0,%r9 adcq $0,%r10 adcq $0,%r11 adcq $0,%r12 adcq $0,%r13 adcq $0,%r14 adcq $0,%r15 mulq %rcx movq %rax,%r8 movq %r9,%rax movq %rdx,%r9 mulq %rcx addq %rax,%r9 movq %r10,%rax adcq $0,%rdx movq %rdx,%r10 mulq %rcx addq %rax,%r10 movq %r11,%rax adcq $0,%rdx movq %rdx,%r11 mulq %rcx addq %rax,%r11 movq %r12,%rax adcq $0,%rdx movq %rdx,%r12 mulq %rcx addq %rax,%r12 movq %r13,%rax adcq $0,%rdx movq %rdx,%r13 mulq %rcx addq %rax,%r13 movq %r14,%rax adcq $0,%rdx movq %rdx,%r14 mulq %rcx addq %rax,%r14 movq %r15,%rax adcq $0,%rdx movq %rdx,%r15 imulq %rcx addq %rax,%r15 adcq $0,%rdx movq %rbp,%rbx sarq $63,%rbp addq 0(%rdi),%r8 adcq 8(%rdi),%r9 adcq 16(%rdi),%r10 adcq 24(%rdi),%r11 adcq %rbx,%r12 adcq %rbp,%r13 adcq %rbp,%r14 adcq %rbp,%r15 movq %r8,0(%rdi) movq %r9,8(%rdi) movq %r10,16(%rdi) movq %r11,24(%rdi) movq %r12,32(%rdi) movq %r13,40(%rdi) movq %r14,48(%rdi) movq %r15,56(%rdi) #ifdef __SGX_LVI_HARDENING__ popq %r8 lfence jmpq *%r8 ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .p2align 5 __smulq_256x63: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa movq 0+0(%rsi),%r8 movq 0+8(%rsi),%r9 movq 0+16(%rsi),%r10 movq 0+24(%rsi),%r11 movq 0+32(%rsi),%rbp movq %rdx,%rbx sarq $63,%rdx xorq %rax,%rax subq %rdx,%rax xorq %rdx,%rbx addq %rax,%rbx xorq %rdx,%r8 xorq %rdx,%r9 xorq %rdx,%r10 xorq %rdx,%r11 xorq %rdx,%rbp addq %r8,%rax adcq $0,%r9 adcq $0,%r10 adcq $0,%r11 adcq $0,%rbp mulq %rbx movq %rax,%r8 movq %r9,%rax movq %rdx,%r9 mulq %rbx addq %rax,%r9 movq %r10,%rax adcq $0,%rdx movq %rdx,%r10 mulq %rbx addq %rax,%r10 movq %r11,%rax adcq $0,%rdx movq %rdx,%r11 andq %rbx,%rbp negq %rbp mulq %rbx addq %rax,%r11 adcq %rdx,%rbp movq %rcx,%rdx movq 40+0(%rsi),%r12 movq 40+8(%rsi),%r13 movq 40+16(%rsi),%r14 movq 40+24(%rsi),%r15 movq 40+32(%rsi),%rcx movq %rdx,%rbx sarq $63,%rdx xorq %rax,%rax subq %rdx,%rax xorq %rdx,%rbx addq %rax,%rbx xorq %rdx,%r12 xorq %rdx,%r13 xorq %rdx,%r14 xorq %rdx,%r15 xorq %rdx,%rcx addq %r12,%rax adcq $0,%r13 adcq $0,%r14 adcq $0,%r15 adcq $0,%rcx mulq %rbx movq %rax,%r12 movq %r13,%rax movq %rdx,%r13 mulq %rbx addq %rax,%r13 movq %r14,%rax adcq $0,%rdx movq %rdx,%r14 mulq %rbx addq %rax,%r14 movq %r15,%rax adcq $0,%rdx movq %rdx,%r15 andq %rbx,%rcx negq %rcx mulq %rbx addq %rax,%r15 adcq %rdx,%rcx addq %r12,%r8 adcq %r13,%r9 adcq %r14,%r10 adcq %r15,%r11 adcq %rcx,%rbp movq %r8,0(%rdi) movq %r9,8(%rdi) movq %r10,16(%rdi) movq %r11,24(%rdi) movq %rbp,32(%rdi) #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .p2align 5 __smulq_256_n_shift_by_31: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa movq %rdx,0(%rdi) movq %rcx,8(%rdi) movq %rdx,%rbp movq 0+0(%rsi),%r8 movq 0+8(%rsi),%r9 movq 0+16(%rsi),%r10 movq 0+24(%rsi),%r11 movq %rbp,%rbx sarq $63,%rbp xorq %rax,%rax subq %rbp,%rax xorq %rbp,%rbx addq %rax,%rbx xorq %rbp,%r8 xorq %rbp,%r9 xorq %rbp,%r10 xorq %rbp,%r11 addq %r8,%rax adcq $0,%r9 adcq $0,%r10 adcq $0,%r11 mulq %rbx movq %rax,%r8 movq %r9,%rax andq %rbx,%rbp negq %rbp movq %rdx,%r9 mulq %rbx addq %rax,%r9 movq %r10,%rax adcq $0,%rdx movq %rdx,%r10 mulq %rbx addq %rax,%r10 movq %r11,%rax adcq $0,%rdx movq %rdx,%r11 mulq %rbx addq %rax,%r11 adcq %rdx,%rbp movq 32+0(%rsi),%r12 movq 32+8(%rsi),%r13 movq 32+16(%rsi),%r14 movq 
	32+24(%rsi),%r15
	movq	%rcx,%rbx
	sarq	$63,%rcx
	xorq	%rax,%rax
	subq	%rcx,%rax

	xorq	%rcx,%rbx
	addq	%rax,%rbx

	xorq	%rcx,%r12
	xorq	%rcx,%r13
	xorq	%rcx,%r14
	xorq	%rcx,%r15
	addq	%r12,%rax
	adcq	$0,%r13
	adcq	$0,%r14
	adcq	$0,%r15

	mulq	%rbx
	movq	%rax,%r12
	movq	%r13,%rax
	andq	%rbx,%rcx
	negq	%rcx
	movq	%rdx,%r13
	mulq	%rbx
	addq	%rax,%r13
	movq	%r14,%rax
	adcq	$0,%rdx
	movq	%rdx,%r14
	mulq	%rbx
	addq	%rax,%r14
	movq	%r15,%rax
	adcq	$0,%rdx
	movq	%rdx,%r15
	mulq	%rbx
	addq	%rax,%r15
	adcq	%rdx,%rcx

	addq	%r12,%r8
	adcq	%r13,%r9
	adcq	%r14,%r10
	adcq	%r15,%r11
	adcq	%rcx,%rbp

	movq	0(%rdi),%rdx
	movq	8(%rdi),%rcx

	shrdq	$31,%r9,%r8
	shrdq	$31,%r10,%r9
	shrdq	$31,%r11,%r10
	shrdq	$31,%rbp,%r11

	sarq	$63,%rbp
	xorq	%rax,%rax
	subq	%rbp,%rax

	xorq	%rbp,%r8
	xorq	%rbp,%r9
	xorq	%rbp,%r10
	xorq	%rbp,%r11
	addq	%rax,%r8
	adcq	$0,%r9
	adcq	$0,%r10
	adcq	$0,%r11

	movq	%r8,0(%rdi)
	movq	%r9,8(%rdi)
	movq	%r10,16(%rdi)
	movq	%r11,24(%rdi)

	xorq	%rbp,%rdx
	xorq	%rbp,%rcx
	addq	%rax,%rdx
	addq	%rax,%rcx

#ifdef	__SGX_LVI_HARDENING__
	popq	%r8
	lfence
	jmpq	*%r8
	ud2
#else
	.byte	0xf3,0xc3
#endif
.cfi_endproc

.p2align	5
__ab_approximation_31_256:
.cfi_startproc
	.byte	0xf3,0x0f,0x1e,0xfa

	movq	24(%rsi),%r9
	movq	56(%rsi),%r11
	movq	16(%rsi),%rbx
	movq	48(%rsi),%rbp
	movq	8(%rsi),%r8
	movq	40(%rsi),%r10

	movq	%r9,%rax
	orq	%r11,%rax
	cmovzq	%rbx,%r9
	cmovzq	%rbp,%r11
	cmovzq	%r8,%rbx
	movq	0(%rsi),%r8
	cmovzq	%r10,%rbp
	movq	32(%rsi),%r10

	movq	%r9,%rax
	orq	%r11,%rax
	cmovzq	%rbx,%r9
	cmovzq	%rbp,%r11
	cmovzq	%r8,%rbx
	cmovzq	%r10,%rbp

	movq	%r9,%rax
	orq	%r11,%rax
	bsrq	%rax,%rcx
	leaq	1(%rcx),%rcx
	cmovzq	%r8,%r9
	cmovzq	%r10,%r11
	cmovzq	%rax,%rcx
	negq	%rcx

	shldq	%cl,%rbx,%r9
	shldq	%cl,%rbp,%r11

	movl	$0x7FFFFFFF,%eax
	andq	%rax,%r8
	andq	%rax,%r10
	notq	%rax
	andq	%rax,%r9
	andq	%rax,%r11
	orq	%r9,%r8
	orq	%r11,%r10

	jmp	__inner_loop_31_256

#ifdef	__SGX_LVI_HARDENING__
	popq	%rdx
	lfence
	jmpq	*%rdx
	ud2
#else
	.byte	0xf3,0xc3
#endif
.cfi_endproc

.p2align	5
__inner_loop_31_256:
.cfi_startproc
	.byte	0xf3,0x0f,0x1e,0xfa

	movq	$0x7FFFFFFF80000000,%rcx
	movq	$0x800000007FFFFFFF,%r13
	movq	$0x7FFFFFFF7FFFFFFF,%r15

L$oop_31_256:
	cmpq	%r10,%r8
	movq	%r8,%rax
	movq	%r10,%rbx
	movq	%rcx,%rbp
	movq	%r13,%r14
	cmovbq	%r10,%r8
	cmovbq	%rax,%r10
	cmovbq	%r13,%rcx
	cmovbq	%rbp,%r13

	subq	%r10,%r8
	subq	%r13,%rcx
	addq	%r15,%rcx

	testq	$1,%rax
	cmovzq	%rax,%r8
	cmovzq	%rbx,%r10
	cmovzq	%rbp,%rcx
	cmovzq	%r14,%r13

	shrq	$1,%r8
	addq	%r13,%r13
	subq	%r15,%r13
	subl	$1,%edx
	jnz	L$oop_31_256

	shrq	$32,%r15
	movl	%ecx,%edx
	movl	%r13d,%r12d
	shrq	$32,%rcx
	shrq	$32,%r13
	subq	%r15,%rdx
	subq	%r15,%rcx
	subq	%r15,%r12
	subq	%r15,%r13

#ifdef	__SGX_LVI_HARDENING__
	popq	%r8
	lfence
	jmpq	*%r8
	ud2
#else
	.byte	0xf3,0xc3
#endif
.cfi_endproc

.p2align	5
__inner_loop_62_256:
.cfi_startproc
	.byte	0xf3,0x0f,0x1e,0xfa

	movl	%edx,%r15d
	movq	$1,%rdx
	xorq	%rcx,%rcx
	xorq	%r12,%r12
	movq	%rdx,%r13
	movq	%rdx,%r14

L$oop_62_256:
	xorq	%rax,%rax
	testq	%r14,%r8
	movq	%r10,%rbx
	cmovnzq	%r10,%rax
	subq	%r8,%rbx
	movq	%r8,%rbp
	subq	%rax,%r8
	cmovcq	%rbx,%r8
	cmovcq	%rbp,%r10
	movq	%rdx,%rax
	cmovcq	%r12,%rdx
	cmovcq	%rax,%r12
	movq	%rcx,%rbx
	cmovcq	%r13,%rcx
	cmovcq	%rbx,%r13
	xorq	%rax,%rax
	xorq	%rbx,%rbx
	shrq	$1,%r8
	testq	%r14,%rbp
	cmovnzq	%r12,%rax
	cmovnzq	%r13,%rbx
	addq	%r12,%r12
	addq	%r13,%r13
	subq	%rax,%rdx
	subq	%rbx,%rcx
	subl	$1,%r15d
	jnz	L$oop_62_256

#ifdef	__SGX_LVI_HARDENING__
	popq	%r8
	lfence
	jmpq	*%r8
	ud2
#else
	.byte	0xf3,0xc3
#endif
.cfi_endproc

================================================
FILE: build/mach-o/ct_inverse_mod_384-armv8.S
================================================
.text

.globl	_ct_inverse_mod_384
.private_extern	_ct_inverse_mod_384

.align	5
_ct_inverse_mod_384:
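// Editorial note: constant-time inversion modulo a 384-bit odd modulus.
// |a| and |b| start out as the input and the modulus and lose 62 bits per
// outer iteration, while |u| and |v| accumulate the matching Bezout
// coefficients; the layout and the flip-flopping between two halves of the
// scratch frame match the x86_64 version, whose commentary is referenced
// at the end of this function.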
	hint	#25
	stp	x29, x30, [sp,#-16*__SIZEOF_POINTER__]!
	add	x29, sp, #0
	stp	x19, x20, [sp,#2*__SIZEOF_POINTER__]
	stp	x21, x22, [sp,#4*__SIZEOF_POINTER__]
	stp	x23, x24, [sp,#6*__SIZEOF_POINTER__]
	stp	x25, x26, [sp,#8*__SIZEOF_POINTER__]
	stp	x27, x28, [sp,#10*__SIZEOF_POINTER__]
	sub	sp, sp, #1056

	ldp	x22, x4, [x1,#8*0]
	ldp	x5, x6, [x1,#8*2]
	ldp	x7, x8, [x1,#8*4]

#ifdef	__CHERI_PURE_CAPABILITY__
	add	x1,sp,#32+511
	alignd	c1,c1,#9
	scbnds	c1,c1,#512
#else
	add	x1, sp, #32+511	// find closest 512-byte-aligned spot
	and	x1, x1, #-512	// in the frame...
#endif
	stp	x0, x3, [sp]	// offload out_ptr, nx_ptr

	ldp	x9, x10, [x2,#8*0]
	ldp	x11, x12, [x2,#8*2]
	ldp	x13, x14, [x2,#8*4]

	stp	x22, x4, [x1,#8*0]	// copy input to |a|
	stp	x5, x6, [x1,#8*2]
	stp	x7, x8, [x1,#8*4]
	stp	x9, x10, [x1,#8*6]	// copy modulus to |b|
	stp	x11, x12, [x1,#8*8]
	stp	x13, x14, [x1,#8*10]

////////////////////////////////////////// first iteration
	mov	x2, #62
	bl	Lab_approximation_62_loaded

	eor	x0, x1, #256		// pointer to dst |a|b|u|v|
#ifdef	__CHERI_PURE_CAPABILITY__
	scvalue	c0,c1,x0
#endif
	bl	__smul_384_n_shift_by_62
	str	x15,[x0,#8*12]		// initialize |u| with |f0|

	mov	x15, x17		// |f1|
	mov	x16, x19		// |g1|
	add	x0,x0,#8*6
	bl	__smul_384_n_shift_by_62
	str	x15, [x0,#8*14]		// initialize |v| with |f1|

////////////////////////////////////////// second iteration
	eor	x1, x1, #256		// flip-flop src |a|b|u|v|
#ifdef	__CHERI_PURE_CAPABILITY__
	scvalue	c1,c0,x1
#endif
	mov	x2, #62
	bl	__ab_approximation_62

	eor	x0, x1, #256		// pointer to dst |a|b|u|v|
#ifdef	__CHERI_PURE_CAPABILITY__
	scvalue	c0,c1,x0
#endif
	bl	__smul_384_n_shift_by_62
	mov	x20, x15		// corrected |f0|
	mov	x21, x16		// corrected |g0|

	mov	x15, x17		// |f1|
	mov	x16, x19		// |g1|
	add	x0,x0,#8*6
	bl	__smul_384_n_shift_by_62

	ldr	x7, [x1,#8*12]		// |u|
	ldr	x8, [x1,#8*20]		// |v|
	mul	x3, x20, x7		// |u|*|f0|
	smulh	x4, x20, x7
	mul	x5, x21, x8		// |v|*|g0|
	smulh	x6, x21, x8
	adds	x3, x3, x5
	adc	x4, x4, x6
	stp	x3, x4, [x0,#8*6]
	asr	x5, x4, #63		// sign extension
	stp	x5, x5, [x0,#8*8]
	stp	x5, x5, [x0,#8*10]

	mul	x3, x15, x7		// |u|*|f1|
	smulh	x4, x15, x7
	mul	x5, x16, x8		// |v|*|g1|
	smulh	x6, x16, x8
	adds	x3, x3, x5
	adc	x4, x4, x6
	stp	x3, x4, [x0,#8*14]
	asr	x5, x4, #63		// sign extension
	stp	x5, x5, [x0,#8*16]
	stp	x5, x5, [x0,#8*18]

	eor	x1, x1, #256		// flip-flop src |a|b|u|v|
#ifdef	__CHERI_PURE_CAPABILITY__
	scvalue	c1,c0,x1
#endif
	mov	x2, #62
	bl	__ab_approximation_62

	eor	x0, x1, #256		// pointer to dst |a|b|u|v|
#ifdef	__CHERI_PURE_CAPABILITY__
	scvalue	c0,c1,x0
#endif
	bl	__smul_384_n_shift_by_62
	mov	x20, x15		// corrected |f0|
	mov	x21, x16		// corrected |g0|

	mov	x15, x17		// |f1|
	mov	x16, x19		// |g1|
	add	x0,x0,#8*6
	bl	__smul_384_n_shift_by_62

	add	x0,x0,#8*6
	bl	__smul_384x63

	mov	x20, x15		// corrected |f1|
	mov	x21, x16		// corrected |g1|
	add	x0,x0,#8*8
	bl	__smul_384x63

	eor	x1, x1, #256		// flip-flop src |a|b|u|v|
#ifdef	__CHERI_PURE_CAPABILITY__
	scvalue	c1,c0,x1
#endif
	mov	x2, #62
	bl	__ab_approximation_62

	eor	x0, x1, #256		// pointer to dst |a|b|u|v|
#ifdef	__CHERI_PURE_CAPABILITY__
	scvalue	c0,c1,x0
#endif
	bl	__smul_384_n_shift_by_62
	mov	x20, x15		// corrected |f0|
	mov	x21, x16		// corrected |g0|

	mov	x15, x17		// |f1|
	mov	x16, x19		// |g1|
	add	x0,x0,#8*6
	bl	__smul_384_n_shift_by_62

	add	x0,x0,#8*6
	bl	__smul_384x63

	mov	x20, x15		// corrected |f1|
	mov	x21, x16		// corrected |g1|
	add	x0,x0,#8*8
	bl	__smul_384x63

	eor	x1, x1, #256		// flip-flop src |a|b|u|v|
#ifdef	__CHERI_PURE_CAPABILITY__
	scvalue	c1,c0,x1
#endif
	mov	x2, #62
	bl	__ab_approximation_62

	eor	x0, x1, #256		// pointer to dst |a|b|u|v|
#ifdef	__CHERI_PURE_CAPABILITY__
	scvalue	c0,c1,x0
#endif
	bl	__smul_384_n_shift_by_62
	mov	x20, x15		// corrected |f0|
	mov	x21, x16		// corrected |g0|

	mov	x15, x17		// |f1|
	mov	x16, x19		// |g1|
	add	x0,x0,#8*6
	bl	__smul_384_n_shift_by_62

	add	x0,x0,#8*6
	bl	__smul_384x63

	mov	x20, x15		// corrected |f1|
	mov	x21, x16		// corrected |g1|
	add	x0,x0,#8*8
	bl	__smul_384x63

	eor	x1, x1, #256		// flip-flop src |a|b|u|v|
#ifdef	__CHERI_PURE_CAPABILITY__
	scvalue	c1,c0,x1
#endif
	mov	x2, #62
	bl	__ab_approximation_62

	eor	x0, x1, #256		// pointer to dst |a|b|u|v|
#ifdef	__CHERI_PURE_CAPABILITY__
	scvalue	c0,c1,x0
#endif
	bl	__smul_384_n_shift_by_62
	mov	x20, x15		// corrected |f0|
	mov	x21, x16		// corrected |g0|

	mov	x15, x17		// |f1|
	mov	x16, x19		// |g1|
	add	x0,x0,#8*6
	bl	__smul_384_n_shift_by_62

	add	x0,x0,#8*6
	bl	__smul_384x63
	asr	x27, x27, #63
	str	x27, [x0,#8*6]

	mov	x20, x15		// corrected |f1|
	mov	x21, x16		// corrected |g1|
	add	x0,x0,#8*8
	bl	__smul_384x63
	asr	x27, x27, #63		// sign extension
	stp	x27, x27, [x0,#8*6]
	stp	x27, x27, [x0,#8*8]
	stp	x27, x27, [x0,#8*10]

	eor	x1, x1, #256		// flip-flop src |a|b|u|v|
#ifdef	__CHERI_PURE_CAPABILITY__
	scvalue	c1,c0,x1
#endif
	mov	x2, #62
	bl	__ab_approximation_62

	eor	x0, x1, #256		// pointer to dst |a|b|u|v|
#ifdef	__CHERI_PURE_CAPABILITY__
	scvalue	c0,c1,x0
#endif
	bl	__smul_384_n_shift_by_62
	mov	x20, x15		// corrected |f0|
	mov	x21, x16		// corrected |g0|

	mov	x15, x17		// |f1|
	mov	x16, x19		// |g1|
	add	x0,x0,#8*6
	bl	__smul_384_n_shift_by_62

	add	x0,x0,#8*6
	bl	__smul_384x63
	adc	x25, x25, x26
	str	x25, [x0,#8*6]

	mov	x20, x15		// corrected |f1|
	mov	x21, x16		// corrected |g1|
	add	x0,x0,#8*8
	bl	__smul_384x63
	bl	__smul_768x63_tail

	eor	x1, x1, #256		// flip-flop src |a|b|u|v|
#ifdef	__CHERI_PURE_CAPABILITY__
	scvalue	c1,c0,x1
#endif
	mov	x2, #62
	bl	__ab_approximation_62

	eor	x0, x1, #256		// pointer to dst |a|b|u|v|
#ifdef	__CHERI_PURE_CAPABILITY__
	scvalue	c0,c1,x0
#endif
	bl	__smul_384_n_shift_by_62
	mov	x20, x15		// corrected |f0|
	mov	x21, x16		// corrected |g0|

	mov	x15, x17		// |f1|
	mov	x16, x19		// |g1|
	add	x0,x0,#8*6
	bl	__smul_384_n_shift_by_62

	add	x0,x0,#8*6
	bl	__smul_384x63
	adc	x25, x25, x26
	str	x25, [x0,#8*6]

	mov	x20, x15		// corrected |f1|
	mov	x21, x16		// corrected |g1|
	add	x0,x0,#8*8
	bl	__smul_384x63
	bl	__smul_768x63_tail

	eor	x1, x1, #256		// flip-flop src |a|b|u|v|
#ifdef	__CHERI_PURE_CAPABILITY__
	scvalue	c1,c0,x1
#endif
	mov	x2, #62
	bl	__ab_approximation_62

	eor	x0, x1, #256		// pointer to dst |a|b|u|v|
#ifdef	__CHERI_PURE_CAPABILITY__
	scvalue	c0,c1,x0
#endif
	bl	__smul_384_n_shift_by_62
	mov	x20, x15		// corrected |f0|
	mov	x21, x16		// corrected |g0|

	mov	x15, x17		// |f1|
	mov	x16, x19		// |g1|
	add	x0,x0,#8*6
	bl	__smul_384_n_shift_by_62

	add	x0,x0,#8*6
	bl	__smul_384x63
	adc	x25, x25, x26
	str	x25, [x0,#8*6]

	mov	x20, x15		// corrected |f1|
	mov	x21, x16		// corrected |g1|
	add	x0,x0,#8*8
	bl	__smul_384x63
	bl	__smul_768x63_tail

	eor	x1, x1, #256		// flip-flop src |a|b|u|v|
#ifdef	__CHERI_PURE_CAPABILITY__
	scvalue	c1,c0,x1
#endif
	mov	x2, #62
	bl	__ab_approximation_62

	eor	x0, x1, #256		// pointer to dst |a|b|u|v|
#ifdef	__CHERI_PURE_CAPABILITY__
	scvalue	c0,c1,x0
#endif
	bl	__smul_384_n_shift_by_62
	mov	x20, x15		// corrected |f0|
	mov	x21, x16		// corrected |g0|

	mov	x15, x17		// |f1|
	mov	x16, x19		// |g1|
	add	x0,x0,#8*6
	bl	__smul_384_n_shift_by_62

	add	x0,x0,#8*6
	bl	__smul_384x63
	adc	x25, x25, x26
	str	x25, [x0,#8*6]

	mov	x20, x15		// corrected |f1|
	mov	x21, x16		// corrected |g1|
	add	x0,x0,#8*8
	bl	__smul_384x63
	bl	__smul_768x63_tail

	eor	x1, x1, #256		// flip-flop src |a|b|u|v|
#ifdef	__CHERI_PURE_CAPABILITY__
	scvalue	c1,c0,x1
#endif
	mov	x2, #62
	bl	__ab_approximation_62

	eor	x0, x1, #256		// pointer to dst |a|b|u|v|
#ifdef	__CHERI_PURE_CAPABILITY__
	scvalue	c0,c1,x0
#endif
	bl	__smul_384_n_shift_by_62
	mov	x20, x15		// corrected |f0|
	mov	x21, x16		// corrected |g0|

	mov	x15, x17		// |f1|
	mov	x16, x19		// |g1|
	add	x0,x0,#8*6
	bl	__smul_384_n_shift_by_62

	add	x0,x0,#8*6
	bl	__smul_384x63
	adc	x25, x25, x26
	str	x25, [x0,#8*6]

	mov	x20, x15		// corrected |f1|
	mov	x21, x16		// corrected |g1|
	add	x0,x0,#8*8
	bl	__smul_384x63
	bl	__smul_768x63_tail

////////////////////////////////////////// iteration before last
	eor	x1, x1, #256		// flip-flop src |a|b|u|v|
#ifdef	__CHERI_PURE_CAPABILITY__
	scvalue	c1,c0,x1
#endif
	mov	x2, #62
	//bl	__ab_approximation_62	// |a| and |b| are exact,
	ldp	x3, x8, [x1,#8*0]	// just load
	ldp	x9, x14, [x1,#8*6]
	bl	__inner_loop_62

	eor	x0, x1, #256		// pointer to dst |a|b|u|v|
#ifdef	__CHERI_PURE_CAPABILITY__
	scvalue	c0,c1,x0
#endif
	str	x3, [x0,#8*0]
	str	x9, [x0,#8*6]

	mov	x20, x15		// exact |f0|
	mov	x21, x16		// exact |g0|
	mov	x15, x17
	mov	x16, x19
	add	x0,x0,#8*12
	bl	__smul_384x63
	adc	x25, x25, x26
	str	x25, [x0,#8*6]

	mov	x20, x15		// exact |f1|
	mov	x21, x16		// exact |g1|
	add	x0,x0,#8*8
	bl	__smul_384x63
	bl	__smul_768x63_tail

////////////////////////////////////////// last iteration
	eor	x1, x1, #256		// flip-flop src |a|b|u|v|
#ifdef	__CHERI_PURE_CAPABILITY__
	scvalue	c1,c0,x1
#endif
	mov	x2, #24			// 768 % 62
	//bl	__ab_approximation_62	// |a| and |b| are exact,
	ldr	x3, [x1,#8*0]		// just load
	eor	x8, x8, x8
	ldr	x9, [x1,#8*6]
	eor	x14, x14, x14
	bl	__inner_loop_62

	mov	x20, x17
	mov	x21, x19
	ldp	x0, x15, [sp]		// original out_ptr and n_ptr
	bl	__smul_384x63
	bl	__smul_768x63_tail
	ldr	x30, [x29,#__SIZEOF_POINTER__]

	smulh	x23, x8, x21		// figure out top-most limb
	adc	x26, x26, x28

	ldp	x9, x10, [x15,#8*0]	// load |mod|
	add	x23, x23, x26		// x23 is 1, 0 or -1
	ldp	x11, x12, [x15,#8*2]
	asr	x22, x23, #63		// sign as mask
	ldp	x13, x14, [x15,#8*4]

	and	x26, x9, x22		// add mod<<384 conditionally
	and	x27, x10, x22
	adds	x3, x3, x26
	and	x28, x11, x22
	adcs	x4, x4, x27
	and	x2, x12, x22
	adcs	x5, x5, x28
	and	x26, x13, x22
	adcs	x6, x6, x2
	and	x27, x14, x22
	adcs	x7, x7, x26
	adcs	x8, x25, x27
	adc	x23, x23, xzr		// x23 is 1, 0 or -1

	neg	x22, x23
	orr	x23, x23, x22		// excess bit or sign as mask
	asr	x22, x22, #63		// excess bit as mask

	and	x9, x9, x23		// mask |mod|
	and	x10, x10, x23
	and	x11, x11, x23
	and	x12, x12, x23
	and	x13, x13, x23
	and	x14, x14, x23

	eor	x9, x9, x22		// conditionally negate |mod|
	eor	x10, x10, x22
	adds	x9, x9, x22, lsr#63
	eor	x11, x11, x22
	adcs	x10, x10, xzr
	eor	x12, x12, x22
	adcs	x11, x11, xzr
	eor	x13, x13, x22
	adcs	x12, x12, xzr
	eor	x14, x14, x22
	adcs	x13, x13, xzr
	adc	x14, x14, xzr

	adds	x3, x3, x9		// final adjustment for |mod|<<384
	adcs	x4, x4, x10
	adcs	x5, x5, x11
	adcs	x6, x6, x12
	stp	x3, x4, [x0,#8*6]
	adcs	x7, x7, x13
	stp	x5, x6, [x0,#8*8]
	adc	x8, x8, x14
	stp	x7, x8, [x0,#8*10]

	add	sp, sp, #1056
	ldp	x19, x20, [x29,#2*__SIZEOF_POINTER__]
	ldp	x21, x22, [x29,#4*__SIZEOF_POINTER__]
	ldp	x23, x24, [x29,#6*__SIZEOF_POINTER__]
	ldp	x25, x26, [x29,#8*__SIZEOF_POINTER__]
	ldp	x27, x28, [x29,#10*__SIZEOF_POINTER__]
	ldr	x29, [sp],#16*__SIZEOF_POINTER__
	hint	#29
	ret

////////////////////////////////////////////////////////////////////////
// see corresponding commentary in ctx_inverse_mod_384-x86_64...
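The helper routines below supply the building blocks the unrolled iterations
above call into.  As an editorial aid (not part of the repository), here is a
rough C sketch of the recurrence that __inner_loop_62 batches 62 bits at a
time; the branches are for readability only, whereas the assembly realizes
every one of them with masks and conditional selects, and it tracks two-limb
approximations of |a| and |b| rather than the single words used here:

    #include <stdint.h>
    #include <stddef.h>

    typedef struct { int64_t f0, g0, f1, g1; } factors62;

    /* hypothetical illustration of __inner_loop_62's recurrence */
    static factors62 inner_loop_62(uint64_t a_, uint64_t b_, size_t n)
    {
        factors62 t = { 1, 0, 0, 1 };   /* |f0|=1, |g0|=0, |f1|=0, |g1|=1 */

        while (n--) {
            uint64_t odd = a_ & 1;
            uint64_t a_orig = a_;
            int64_t f0 = t.f0, g0 = t.g0;

            if (odd)                    /* if |a_| is odd, subtract |b_|  */
                a_ -= b_;
            if (odd && a_orig < b_) {   /* borrow: |a_|<|b_|, swap roles  */
                a_ = 0 - a_;            /* i.e. |b_| - |a_orig|           */
                b_ = a_orig;
                t.f0 = t.f1; t.f1 = f0; /* exchange |f0| and |f1|         */
                t.g0 = t.g1; t.g1 = g0; /* exchange |g0| and |g1|         */
            }
            a_ >>= 1;                   /* |a_| is even by now            */
            if (odd) {                  /* |f0| -= |f1|, |g0| -= |g1|     */
                t.f0 -= t.f1;
                t.g0 -= t.g1;
            }
            t.f1 <<= 1;                 /* compensate the halving of |a_| */
            t.g1 <<= 1;
        }
        return t;
    }

The factors returned this way are then applied wholesale by the callers:
__smul_384_n_shift_by_62 folds |f0|,|g0| (and |f1|,|g1|) into |a| and |b|
while shifting out the 62 processed bits, and __smul_384x63 together with
__smul_768x63_tail does the same for the ever-growing |u| and |v|.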
.align	5
__smul_384x63:
	ldp	x3, x4, [x1,#8*0+96]	// load |u| (or |v|)
	asr	x17, x20, #63		// |f_|'s sign as mask (or |g_|'s)
	ldp	x5, x6, [x1,#8*2+96]
	eor	x20, x20, x17		// conditionally negate |f_| (or |g_|)
	ldp	x7, x8, [x1,#8*4+96]
	eor	x3, x3, x17		// conditionally negate |u| (or |v|)
	ldr	x25, [x1,#8*6+96]

	sub	x20, x20, x17
	eor	x4, x4, x17
	adds	x3, x3, x17, lsr#63
	eor	x5, x5, x17
	adcs	x4, x4, xzr
	eor	x6, x6, x17
	adcs	x5, x5, xzr
	eor	x7, x7, x17
	adcs	x6, x6, xzr
	umulh	x22, x3, x20
	eor	x8, x8, x17
	umulh	x23, x4, x20
	adcs	x7, x7, xzr
	umulh	x24, x5, x20
	eor	x25, x25, x17
	mul	x3, x3, x20
	adcs	x8, x8, xzr
	mul	x4, x4, x20
	adcs	x25, x25, xzr
	cmp	x20, #0
	mul	x5, x5, x20
	csel	x25, x25, xzr, ne
	adds	x4, x4, x22
	umulh	x22, x6, x20
	adcs	x5, x5, x23
	umulh	x23, x7, x20
	mul	x6, x6, x20
	mul	x7, x7, x20
	adcs	x6, x6, x24
	mul	x27,x8, x20
	adcs	x7, x7, x22
	adcs	x27,x27,x23
	adc	x2, xzr, xzr

	ldp	x9, x10, [x1,#8*0+160]	// load |u| (or |v|)
	asr	x17, x21, #63		// |f_|'s sign as mask (or |g_|'s)
	ldp	x11, x12, [x1,#8*2+160]
	eor	x21, x21, x17		// conditionally negate |f_| (or |g_|)
	ldp	x13, x14, [x1,#8*4+160]
	eor	x9, x9, x17		// conditionally negate |u| (or |v|)
	ldr	x26, [x1,#8*6+160]

	sub	x21, x21, x17
	eor	x10, x10, x17
	adds	x9, x9, x17, lsr#63
	eor	x11, x11, x17
	adcs	x10, x10, xzr
	eor	x12, x12, x17
	adcs	x11, x11, xzr
	eor	x13, x13, x17
	adcs	x12, x12, xzr
	umulh	x22, x9, x21
	eor	x14, x14, x17
	umulh	x23, x10, x21
	adcs	x13, x13, xzr
	umulh	x24, x11, x21
	eor	x26, x26, x17
	mul	x9, x9, x21
	adcs	x14, x14, xzr
	mul	x10, x10, x21
	adcs	x26, x26, xzr
	adc	x19, xzr, xzr		// used in __smul_768x63_tail
	cmp	x21, #0
	mul	x11, x11, x21
	csel	x26, x26, xzr, ne
	adds	x10, x10, x22
	umulh	x22, x12, x21
	adcs	x11, x11, x23
	umulh	x23, x13, x21
	mul	x12, x12, x21
	mul	x13, x13, x21
	adcs	x12, x12, x24
	mul	x28,x14, x21
	adcs	x13, x13, x22
	adcs	x28,x28,x23
	adc	x2, x2, xzr

	adds	x3, x3, x9
	adcs	x4, x4, x10
	adcs	x5, x5, x11
	adcs	x6, x6, x12
	stp	x3, x4, [x0,#8*0]
	adcs	x7, x7, x13
	stp	x5, x6, [x0,#8*2]
	adcs	x27, x27, x28
	stp	x7, x27, [x0,#8*4]

	ret

.align	5
__smul_768x63_tail:
	umulh	x27, x8, x20
	ldr	x4, [x1,#8*27]// load rest of |v|
	adc	x2, x2, xzr
	ldp	x5, x6, [x1,#8*28]
	and	x25, x25, x20
	ldp	x7, x8, [x1,#8*30]

	sub	x27, x27, x25		// tie up |u|*|f1| chain
	umulh	x14, x14, x21		// resume |v|*|g1| chain

	eor	x4, x4, x17		// conditionally negate rest of |v|
	eor	x5, x5, x17
	eor	x6, x6, x17
	adds	x4, x4, x19
	eor	x7, x7, x17
	adcs	x5, x5, xzr
	eor	x8, x8, x17
	adcs	x6, x6, xzr
	umulh	x22, x26, x21
	adcs	x7, x7, xzr
	umulh	x23, x4, x21
	adc	x8, x8, xzr
	umulh	x24, x5, x21
	add	x14, x14, x2
	umulh	x25, x6, x21
	asr	x28, x27, #63
	umulh	x2, x7, x21
	mul	x3, x26, x21
	mul	x4, x4, x21
	mul	x5, x5, x21
	adds	x3, x3, x14
	mul	x6, x6, x21
	adcs	x4, x4, x22
	mul	x7, x7, x21
	adcs	x5, x5, x23
	mul	x22, x8, x21
	adcs	x6, x6, x24
	adcs	x7, x7, x25
	adcs	x25, x22, x2
	adc	x26, xzr, xzr		// used in the final step

	adds	x3, x3, x27
	adcs	x4, x4, x28
	adcs	x5, x5, x28
	adcs	x6, x6, x28
	stp	x3, x4, [x0,#8*6]
	adcs	x7, x7, x28
	stp	x5, x6, [x0,#8*8]
	adcs	x25, x25, x28		// carry is used in the final step
	stp	x7, x25, [x0,#8*10]

	ret

.align	5
__smul_384_n_shift_by_62:
	ldp	x3, x4, [x1,#8*0+0]	// load |a| (or |b|)
	asr	x28, x15, #63		// |f0|'s sign as mask (or |g0|'s)
	ldp	x5, x6, [x1,#8*2+0]
	eor	x2, x15, x28		// conditionally negate |f0| (or |g0|)
	ldp	x7, x8, [x1,#8*4+0]
	eor	x3, x3, x28		// conditionally negate |a| (or |b|)

	sub	x2, x2, x28
	eor	x4, x4, x28
	adds	x3, x3, x28, lsr#63
	eor	x5, x5, x28
	adcs	x4, x4, xzr
	eor	x6, x6, x28
	adcs	x5, x5, xzr
	eor	x7, x7, x28
	umulh	x22, x3, x2
	adcs	x6, x6, xzr
	umulh	x23, x4, x2
	eor	x8, x8, x28
	mul	x3, x3, x2
	adcs	x7, x7, xzr
	mul	x4, x4, x2
	adc	x8, x8, xzr
	umulh	x24, x5, x2
	and	x28, x28, x2
	umulh	x25, x6, x2
	adds	x4, x4, x22
	mul	x5, x5, x2
	umulh	x22, x7, x2
	neg	x28, x28
	mul	x6, x6, x2
	adcs	x5, x5, x23
	umulh	x23, x8, x2
	mul	x7, x7, x2
	adcs	x6, x6, x24
	mul	x8, x8, x2
	adcs	x7, x7, x25
	adcs	x8, x8, x22
	adc	x27, x23, x28

	ldp	x9, x10, [x1,#8*0+48]	// load |a| (or |b|)
	asr	x28, x16, #63		// |f0|'s sign as mask (or |g0|'s)
	ldp	x11, x12, [x1,#8*2+48]
	eor	x2, x16, x28		// conditionally negate |f0| (or |g0|)
	ldp	x13, x14, [x1,#8*4+48]
	eor	x9, x9, x28		// conditionally negate |a| (or |b|)

	sub	x2, x2, x28
	eor	x10, x10, x28
	adds	x9, x9, x28, lsr#63
	eor	x11, x11, x28
	adcs	x10, x10, xzr
	eor	x12, x12, x28
	adcs	x11, x11, xzr
	eor	x13, x13, x28
	umulh	x22, x9, x2
	adcs	x12, x12, xzr
	umulh	x23, x10, x2
	eor	x14, x14, x28
	mul	x9, x9, x2
	adcs	x13, x13, xzr
	mul	x10, x10, x2
	adc	x14, x14, xzr
	umulh	x24, x11, x2
	and	x28, x28, x2
	umulh	x25, x12, x2
	adds	x10, x10, x22
	mul	x11, x11, x2
	umulh	x22, x13, x2
	neg	x28, x28
	mul	x12, x12, x2
	adcs	x11, x11, x23
	umulh	x23, x14, x2
	mul	x13, x13, x2
	adcs	x12, x12, x24
	mul	x14, x14, x2
	adcs	x13, x13, x25
	adcs	x14, x14, x22
	adc	x28, x23, x28

	adds	x3, x3, x9
	adcs	x4, x4, x10
	adcs	x5, x5, x11
	adcs	x6, x6, x12
	adcs	x7, x7, x13
	adcs	x8, x8, x14
	adc	x9, x27, x28

	extr	x3, x4, x3, #62
	extr	x4, x5, x4, #62
	extr	x5, x6, x5, #62
	asr	x28, x9, #63
	extr	x6, x7, x6, #62
	extr	x7, x8, x7, #62
	extr	x8, x9, x8, #62

	eor	x3, x3, x28
	eor	x4, x4, x28
	adds	x3, x3, x28, lsr#63
	eor	x5, x5, x28
	adcs	x4, x4, xzr
	eor	x6, x6, x28
	adcs	x5, x5, xzr
	eor	x7, x7, x28
	adcs	x6, x6, xzr
	eor	x8, x8, x28
	stp	x3, x4, [x0,#8*0]
	adcs	x7, x7, xzr
	stp	x5, x6, [x0,#8*2]
	adc	x8, x8, xzr
	stp	x7, x8, [x0,#8*4]

	eor	x15, x15, x28
	eor	x16, x16, x28
	sub	x15, x15, x28
	sub	x16, x16, x28

	ret

.align	4
__ab_approximation_62:
	ldp	x7, x8, [x1,#8*4]
	ldp	x13, x14, [x1,#8*10]
	ldp	x5, x6, [x1,#8*2]
	ldp	x11, x12, [x1,#8*8]

Lab_approximation_62_loaded:
	orr	x22, x8, x14	// check top-most limbs, ...
	cmp	x22, #0
	csel	x8, x8, x7, ne
	csel	x14, x14, x13, ne
	csel	x7, x7, x6, ne
	orr	x22, x8, x14	// ... ones before top-most, ...
	csel	x13, x13, x12, ne

	ldp	x3, x4, [x1,#8*0]
	ldp	x9, x10, [x1,#8*6]

	cmp	x22, #0
	csel	x8, x8, x7, ne
	csel	x14, x14, x13, ne
	csel	x7, x7, x5, ne
	orr	x22, x8, x14	// ... and ones before that ...
	csel	x13, x13, x11, ne

	cmp	x22, #0
	csel	x8, x8, x7, ne
	csel	x14, x14, x13, ne
	csel	x7, x7, x4, ne
	orr	x22, x8, x14
	csel	x13, x13, x10, ne

	clz	x22, x22
	cmp	x22, #64
	csel	x22, x22, xzr, ne
	csel	x8, x8, x7, ne
	csel	x14, x14, x13, ne
	neg	x23, x22

	lslv	x8, x8, x22	// align high limbs to the left
	lslv	x14, x14, x22
	lsrv	x7, x7, x23
	lsrv	x13, x13, x23
	and	x7, x7, x23, asr#6
	and	x13, x13, x23, asr#6
	orr	x8, x8, x7
	orr	x14, x14, x13

	b	__inner_loop_62
	ret

.align	4
__inner_loop_62:
	mov	x15, #1		// |f0|=1
	mov	x16, #0		// |g0|=0
	mov	x17, #0		// |f1|=0
	mov	x19, #1		// |g1|=1

Loop_62:
	sbfx	x28, x3, #0, #1	// if |a_| is odd, then we'll be subtracting
	sub	x2, x2, #1
	subs	x24, x9, x3	// |b_|-|a_|
	and	x22, x9, x28
	sbc	x25, x14, x8
	and	x23, x14, x28
	subs	x26, x3, x22	// |a_|-|b_| (or |a_|-0 if |a_| was even)
	mov	x22, x15
	sbcs	x27, x8, x23
	mov	x23, x16
	csel	x9, x9, x3, hs	// |b_| = |a_|
	csel	x14, x14, x8, hs
	csel	x3, x26, x24, hs	// borrow means |a_|<|b_|, replace with |b_|-|a_|
	csel	x8, x27, x25, hs
	csel	x15, x15, x17, hs	// exchange |f0| and |f1|
	csel	x17, x17, x22, hs
	csel	x16, x16, x19, hs	// exchange |g0| and |g1|
	csel	x19, x19, x23, hs
	extr	x3, x8, x3, #1
	lsr	x8, x8, #1
	and	x22, x17, x28
	and	x23, x19, x28
	add	x17, x17, x17	// |f1|<<=1
	add	x19, x19, x19	// |g1|<<=1
	sub	x15, x15, x22	// |f0|-=|f1| (or |f0-=0| if |a_| was even)
	sub	x16, x16, x23	// |g0|-=|g1| (or |g0-=0| ...)
	cbnz	x2, Loop_62

	ret

================================================
FILE: build/mach-o/ct_is_square_mod_384-armv8.S
================================================
.text

.globl	_ct_is_square_mod_384
.private_extern	_ct_is_square_mod_384

.align	5
_ct_is_square_mod_384:
	hint	#25
	stp	x29, x30, [sp,#-16*__SIZEOF_POINTER__]!
	add	x29, sp, #0
	stp	x19, x20, [sp,#2*__SIZEOF_POINTER__]
	stp	x21, x22, [sp,#4*__SIZEOF_POINTER__]
	stp	x23, x24, [sp,#6*__SIZEOF_POINTER__]
	stp	x25, x26, [sp,#8*__SIZEOF_POINTER__]
	stp	x27, x28, [sp,#10*__SIZEOF_POINTER__]
	sub	sp, sp, #512

	ldp	x3, x4, [x0,#8*0]	// load input
	ldp	x5, x6, [x0,#8*2]
	ldp	x7, x8, [x0,#8*4]

	add	x0, sp, #255	// find closest 256-byte-aligned spot
	and	x0, x0, #-256	// in the frame...
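// Editorial note: from here on the structure mirrors the inversion above,
// but instead of Bezout coefficients the loop accumulates the Legendre
// symbol in |L| (x2): 24 outer iterations of 30-bit approximations plus a
// 48-bit tail cover all 768 bits, and the |L| adjustments implement the
// quadratic-reciprocity and |2/b| rules visible in the comments below.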
#ifdef	__CHERI_PURE_CAPABILITY__
	scvalue	c0,csp,x0
#endif

	ldp	x9, x10, [x1,#8*0]	// load modulus
	ldp	x11, x12, [x1,#8*2]
	ldp	x13, x14, [x1,#8*4]

	stp	x3, x4, [x0,#8*6]	// copy input to |a|
	stp	x5, x6, [x0,#8*8]
	stp	x7, x8, [x0,#8*10]
	stp	x9, x10, [x0,#8*0]	// copy modulus to |b|
	stp	x11, x12, [x0,#8*2]
	stp	x13, x14, [x0,#8*4]

	eor	x2, x2, x2	// init the Legendre symbol
	mov	x15, #24	// 24 is 768/30-1
	b	Loop_is_square

.align	4
Loop_is_square:
	bl	__ab_approximation_30
	sub	x15, x15, #1

	eor	x1, x0, #128	// pointer to dst |b|
#ifdef	__CHERI_PURE_CAPABILITY__
	scvalue	c1,csp,x1
#endif
	bl	__smul_384_n_shift_by_30

	mov	x19, x16	// |f0|
	mov	x20, x17	// |g0|
	add	x1,x1,#8*6
	bl	__smul_384_n_shift_by_30

	ldp	x9, x10, [x1,#-8*6]
	eor	x0, x0, #128	// flip-flop src |a|b|
#ifdef	__CHERI_PURE_CAPABILITY__
	scvalue	c0,csp,x0
#endif
	and	x27, x27, x9	// if |a| was negative,
	add	x2, x2, x27, lsr#1	// adjust |L|

	cbnz	x15, Loop_is_square

////////////////////////////////////////// last iteration
	//bl	__ab_approximation_30	// |a| and |b| are exact,
	//ldr	x8, [x0,#8*6]		// and loaded
	//ldr	x14, [x0,#8*0]
	mov	x15, #48	// 48 is 768%30 + 30
	bl	__inner_loop_48
	ldr	x30, [x29,#__SIZEOF_POINTER__]

	and	x0, x2, #1
	eor	x0, x0, #1

	add	sp, sp, #512
	ldp	x19, x20, [x29,#2*__SIZEOF_POINTER__]
	ldp	x21, x22, [x29,#4*__SIZEOF_POINTER__]
	ldp	x23, x24, [x29,#6*__SIZEOF_POINTER__]
	ldp	x25, x26, [x29,#8*__SIZEOF_POINTER__]
	ldp	x27, x28, [x29,#10*__SIZEOF_POINTER__]
	ldr	x29, [sp],#16*__SIZEOF_POINTER__
	hint	#29
	ret

.align	5
__smul_384_n_shift_by_30:
	ldp	x3, x4, [x0,#8*0+0]	// load |b| (or |a|)
	asr	x27, x20, #63		// |g1|'s sign as mask (or |f1|'s)
	ldp	x5, x6, [x0,#8*2+0]
	eor	x20, x20, x27		// conditionally negate |g1| (or |f1|)
	ldp	x7, x8, [x0,#8*4+0]
	eor	x3, x3, x27		// conditionally negate |b| (or |a|)

	sub	x20, x20, x27
	eor	x4, x4, x27
	adds	x3, x3, x27, lsr#63
	eor	x5, x5, x27
	adcs	x4, x4, xzr
	eor	x6, x6, x27
	adcs	x5, x5, xzr
	eor	x7, x7, x27
	umulh	x21, x3, x20
	adcs	x6, x6, xzr
	umulh	x22, x4, x20
	eor	x8, x8, x27
	umulh	x23, x5, x20
	adcs	x7, x7, xzr
	umulh	x24, x6, x20
	adc	x8, x8, xzr
	umulh	x25, x7, x20
	and	x28, x20, x27
	umulh	x26, x8, x20
	neg	x28, x28
	mul	x3, x3, x20
	mul	x4, x4, x20
	mul	x5, x5, x20
	adds	x4, x4, x21
	mul	x6, x6, x20
	adcs	x5, x5, x22
	mul	x7, x7, x20
	adcs	x6, x6, x23
	mul	x8, x8, x20
	adcs	x7, x7, x24
	adcs	x8, x8 ,x25
	adc	x26, x26, x28

	ldp	x9, x10, [x0,#8*0+48]	// load |b| (or |a|)
	asr	x27, x19, #63		// |g1|'s sign as mask (or |f1|'s)
	ldp	x11, x12, [x0,#8*2+48]
	eor	x19, x19, x27		// conditionally negate |g1| (or |f1|)
	ldp	x13, x14, [x0,#8*4+48]
	eor	x9, x9, x27		// conditionally negate |b| (or |a|)

	sub	x19, x19, x27
	eor	x10, x10, x27
	adds	x9, x9, x27, lsr#63
	eor	x11, x11, x27
	adcs	x10, x10, xzr
	eor	x12, x12, x27
	adcs	x11, x11, xzr
	eor	x13, x13, x27
	umulh	x21, x9, x19
	adcs	x12, x12, xzr
	umulh	x22, x10, x19
	eor	x14, x14, x27
	umulh	x23, x11, x19
	adcs	x13, x13, xzr
	umulh	x24, x12, x19
	adc	x14, x14, xzr
	umulh	x25, x13, x19
	and	x28, x19, x27
	umulh	x27, x14, x19
	neg	x28, x28
	mul	x9, x9, x19
	mul	x10, x10, x19
	mul	x11, x11, x19
	adds	x10, x10, x21
	mul	x12, x12, x19
	adcs	x11, x11, x22
	mul	x13, x13, x19
	adcs	x12, x12, x23
	mul	x14, x14, x19
	adcs	x13, x13, x24
	adcs	x14, x14 ,x25
	adc	x27, x27, x28

	adds	x3, x3, x9
	adcs	x4, x4, x10
	adcs	x5, x5, x11
	adcs	x6, x6, x12
	adcs	x7, x7, x13
	adcs	x8, x8, x14
	adc	x9, x26, x27

	extr	x3, x4, x3, #30
	extr	x4, x5, x4, #30
	extr	x5, x6, x5, #30
	asr	x27, x9, #63
	extr	x6, x7, x6, #30
	extr	x7, x8, x7, #30
	extr	x8, x9, x8, #30

	eor	x3, x3, x27
	eor	x4, x4, x27
	adds	x3, x3, x27, lsr#63
	eor	x5, x5, x27
	adcs	x4, x4, xzr
	eor	x6, x6, x27
	adcs	x5, x5, xzr
	eor	x7, x7, x27
	adcs	x6, x6, xzr
	eor	x8, x8, x27
	stp	x3, x4, [x1,#8*0]
	adcs	x7, x7, xzr
	stp	x5, x6, [x1,#8*2]
	adc	x8, x8, xzr
	stp	x7, x8, [x1,#8*4]

	ret

.align	4
__ab_approximation_30:
	ldp	x13, x14, [x0,#8*4]	// |a| is still in registers
	ldp	x11, x12, [x0,#8*2]

	orr	x21, x8, x14	// check top-most limbs, ...
	cmp	x21, #0
	csel	x8, x8, x7, ne
	csel	x14, x14, x13, ne
	csel	x7, x7, x6, ne
	orr	x21, x8, x14	// ... ones before top-most, ...
	csel	x13, x13, x12, ne

	cmp	x21, #0
	csel	x8, x8, x7, ne
	csel	x14, x14, x13, ne
	csel	x7, x7, x5, ne
	orr	x21, x8, x14	// ... and ones before that ...
	csel	x13, x13, x11, ne

	cmp	x21, #0
	csel	x8, x8, x7, ne
	csel	x14, x14, x13, ne
	csel	x7, x7, x4, ne
	orr	x21, x8, x14	// and one more, ...
	csel	x13, x13, x10, ne

	cmp	x21, #0
	csel	x8, x8, x7, ne
	csel	x14, x14, x13, ne
	csel	x7, x7, x3, ne
	orr	x21, x8, x14
	csel	x13, x13, x9, ne

	clz	x21, x21
	cmp	x21, #64
	csel	x21, x21, xzr, ne
	csel	x8, x8, x7, ne
	csel	x14, x14, x13, ne
	neg	x22, x21

	lslv	x8, x8, x21	// align high limbs to the left
	lslv	x14, x14, x21
	lsrv	x7, x7, x22
	lsrv	x13, x13, x22
	and	x7, x7, x22, asr#6
	and	x13, x13, x22, asr#6
	orr	x8, x8, x7
	orr	x14, x14, x13

	bfxil	x8, x3, #0, #32
	bfxil	x14, x9, #0, #32

	b	__inner_loop_30
	ret

.align	4
__inner_loop_30:
	mov	x28, #30
	mov	x17, #0x7FFFFFFF80000000	// |f0|=1, |g0|=0
	mov	x20, #0x800000007FFFFFFF	// |f1|=0, |g1|=1
	mov	x27,#0x7FFFFFFF7FFFFFFF

Loop_30:
	sbfx	x24, x8, #0, #1	// if |a_| is odd, then we'll be subtracting
	and	x25, x8, x14
	sub	x28, x28, #1
	and	x21, x14, x24

	sub	x22, x14, x8	// |b_|-|a_|
	subs	x23, x8, x21	// |a_|-|b_| (or |a_|-0 if |a_| was even)
	add	x25, x2, x25, lsr#1	// L + (a_ & b_) >> 1
	mov	x21, x20
	csel	x14, x14, x8, hs	// |b_| = |a_|
	csel	x8, x23, x22, hs	// borrow means |a_|<|b_|, replace with |b_|-|a_|
	csel	x20, x20, x17, hs	// exchange |fg0| and |fg1|
	csel	x17, x17, x21, hs
	csel	x2, x2, x25, hs
	lsr	x8, x8, #1
	and	x21, x20, x24
	and	x22, x27, x24
	add	x23, x14, #2
	sub	x17, x17, x21	// |f0|-=|f1| (or |f0-=0| if |a_| was even)
	add	x20, x20, x20	// |f1|<<=1
	add	x2, x2, x23, lsr#2	// "negate" |L| if |b|%8 is 3 or 5
	add	x17, x17, x22
	sub	x20, x20, x27

	cbnz	x28, Loop_30

	mov	x27, #0x7FFFFFFF
	ubfx	x16, x17, #0, #32
	ubfx	x17, x17, #32, #32
	ubfx	x19, x20, #0, #32
	ubfx	x20, x20, #32, #32
	sub	x16, x16, x27	// remove the bias
	sub	x17, x17, x27
	sub	x19, x19, x27
	sub	x20, x20, x27

	ret

.align	4
__inner_loop_48:
Loop_48:
	sbfx	x24, x3, #0, #1	// if |a_| is odd, then we'll be subtracting
	and	x25, x3, x9
	sub	x15, x15, #1
	and	x21, x9, x24
	sub	x22, x9, x3	// |b_|-|a_|
	subs	x23, x3, x21	// |a_|-|b_| (or |a_|-0 if |a_| was even)
	add	x25, x2, x25, lsr#1
	csel	x9, x9, x3, hs	// |b_| = |a_|
	csel	x3, x23, x22, hs	// borrow means |a_|<|b_|, replace with |b_|-|a_|
	csel	x2, x2, x25, hs
	add	x23, x9, #2
	lsr	x3, x3, #1
	add	x2, x2, x23, lsr#2	// "negate" |L| if |b|%8 is 3 or 5

	cbnz	x15, Loop_48

	ret

================================================
FILE: build/mach-o/ct_is_square_mod_384-x86_64.s
================================================
.text

.globl	_ct_is_square_mod_384
.private_extern	_ct_is_square_mod_384
.p2align	5
_ct_is_square_mod_384:
.cfi_startproc
	.byte	0xf3,0x0f,0x1e,0xfa

	pushq	%rbp
.cfi_adjust_cfa_offset	8
.cfi_offset	%rbp,-16
	pushq	%rbx
.cfi_adjust_cfa_offset	8
.cfi_offset	%rbx,-24
	pushq	%r12
.cfi_adjust_cfa_offset	8
.cfi_offset	%r12,-32
	pushq	%r13
.cfi_adjust_cfa_offset	8
.cfi_offset	%r13,-40
	pushq	%r14
.cfi_adjust_cfa_offset	8
.cfi_offset	%r14,-48
	pushq	%r15
.cfi_adjust_cfa_offset	8
.cfi_offset	%r15,-56
	subq	$536,%rsp
.cfi_adjust_cfa_offset	536

	leaq	24+255(%rsp),%rax
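/* round the scratch pointer down to a 256-byte boundary inside the frame,
 * the same "closest 256-byte-aligned spot" trick as in the armv8 version */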
andq $-256,%rax #ifdef __SGX_LVI_HARDENING__ lfence #endif movq 0(%rdi),%r8 movq 8(%rdi),%r9 movq 16(%rdi),%r10 movq 24(%rdi),%r11 movq 32(%rdi),%r12 movq 40(%rdi),%r13 movq 0(%rsi),%r14 movq 8(%rsi),%r15 movq 16(%rsi),%rbx movq 24(%rsi),%rcx movq 32(%rsi),%rdx movq 40(%rsi),%rdi movq %rax,%rsi movq %r8,0(%rax) movq %r9,8(%rax) movq %r10,16(%rax) movq %r11,24(%rax) movq %r12,32(%rax) movq %r13,40(%rax) movq %r14,48(%rax) movq %r15,56(%rax) movq %rbx,64(%rax) movq %rcx,72(%rax) movq %rdx,80(%rax) movq %rdi,88(%rax) xorq %rbp,%rbp movl $24,%ecx jmp L$oop_is_square .p2align 5 L$oop_is_square: movl %ecx,16(%rsp) call __ab_approximation_30 movq %rax,0(%rsp) movq %rbx,8(%rsp) movq $128+48,%rdi xorq %rsi,%rdi call __smulq_384_n_shift_by_30 movq 0(%rsp),%rdx movq 8(%rsp),%rcx leaq -48(%rdi),%rdi call __smulq_384_n_shift_by_30 movl 16(%rsp),%ecx xorq $128,%rsi andq 48(%rdi),%r14 shrq $1,%r14 addq %r14,%rbp subl $1,%ecx jnz L$oop_is_square movq 48(%rsi),%r9 call __inner_loop_48 movq $1,%rax andq %rbp,%rax xorq $1,%rax leaq 536(%rsp),%r8 movq 0(%r8),%r15 .cfi_restore %r15 movq 8(%r8),%r14 .cfi_restore %r14 movq 16(%r8),%r13 .cfi_restore %r13 movq 24(%r8),%r12 .cfi_restore %r12 movq 32(%r8),%rbx .cfi_restore %rbx movq 40(%r8),%rbp .cfi_restore %rbp leaq 48(%r8),%rsp .cfi_adjust_cfa_offset -536-8*6 #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .p2align 5 __smulq_384_n_shift_by_30: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa movq 0(%rsi),%r8 movq 8(%rsi),%r9 movq 16(%rsi),%r10 movq 24(%rsi),%r11 movq 32(%rsi),%r12 movq 40(%rsi),%r13 movq %rdx,%rbx sarq $63,%rdx xorq %rax,%rax subq %rdx,%rax xorq %rdx,%rbx addq %rax,%rbx xorq %rdx,%r8 xorq %rdx,%r9 xorq %rdx,%r10 xorq %rdx,%r11 xorq %rdx,%r12 xorq %rdx,%r13 addq %r8,%rax adcq $0,%r9 adcq $0,%r10 adcq $0,%r11 adcq $0,%r12 adcq $0,%r13 movq %rdx,%r14 andq %rbx,%r14 mulq %rbx movq %rax,%r8 movq %r9,%rax movq %rdx,%r9 mulq %rbx addq %rax,%r9 movq %r10,%rax adcq $0,%rdx movq %rdx,%r10 mulq %rbx addq %rax,%r10 movq %r11,%rax adcq $0,%rdx movq %rdx,%r11 mulq %rbx addq %rax,%r11 movq %r12,%rax adcq $0,%rdx movq %rdx,%r12 mulq %rbx addq %rax,%r12 movq %r13,%rax adcq $0,%rdx movq %rdx,%r13 negq %r14 mulq %rbx addq %rax,%r13 adcq %rdx,%r14 leaq 48(%rsi),%rsi movq %rcx,%rdx movq %r8,0(%rdi) movq %r9,8(%rdi) movq %r10,16(%rdi) movq %r11,24(%rdi) movq %r12,32(%rdi) movq %r13,40(%rdi) movq 0(%rsi),%r8 movq 8(%rsi),%r9 movq 16(%rsi),%r10 movq 24(%rsi),%r11 movq 32(%rsi),%r12 movq 40(%rsi),%r13 movq %rdx,%rbx sarq $63,%rdx xorq %rax,%rax subq %rdx,%rax xorq %rdx,%rbx addq %rax,%rbx xorq %rdx,%r8 xorq %rdx,%r9 xorq %rdx,%r10 xorq %rdx,%r11 xorq %rdx,%r12 xorq %rdx,%r13 addq %r8,%rax adcq $0,%r9 adcq $0,%r10 adcq $0,%r11 adcq $0,%r12 adcq $0,%r13 movq %rdx,%r15 andq %rbx,%r15 mulq %rbx movq %rax,%r8 movq %r9,%rax movq %rdx,%r9 mulq %rbx addq %rax,%r9 movq %r10,%rax adcq $0,%rdx movq %rdx,%r10 mulq %rbx addq %rax,%r10 movq %r11,%rax adcq $0,%rdx movq %rdx,%r11 mulq %rbx addq %rax,%r11 movq %r12,%rax adcq $0,%rdx movq %rdx,%r12 mulq %rbx addq %rax,%r12 movq %r13,%rax adcq $0,%rdx movq %rdx,%r13 negq %r15 mulq %rbx addq %rax,%r13 adcq %rdx,%r15 leaq -48(%rsi),%rsi addq 0(%rdi),%r8 adcq 8(%rdi),%r9 adcq 16(%rdi),%r10 adcq 24(%rdi),%r11 adcq 32(%rdi),%r12 adcq 40(%rdi),%r13 adcq %r15,%r14 shrdq $30,%r9,%r8 shrdq $30,%r10,%r9 shrdq $30,%r11,%r10 shrdq $30,%r12,%r11 shrdq $30,%r13,%r12 shrdq $30,%r14,%r13 sarq $63,%r14 xorq %rbx,%rbx subq %r14,%rbx xorq %r14,%r8 xorq %r14,%r9 xorq %r14,%r10 xorq %r14,%r11 xorq %r14,%r12 xorq 
%r14,%r13 addq %rbx,%r8 adcq $0,%r9 adcq $0,%r10 adcq $0,%r11 adcq $0,%r12 adcq $0,%r13 movq %r8,0(%rdi) movq %r9,8(%rdi) movq %r10,16(%rdi) movq %r11,24(%rdi) movq %r12,32(%rdi) movq %r13,40(%rdi) #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .p2align 5 __ab_approximation_30: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa movq 88(%rsi),%rbx movq 80(%rsi),%r15 movq 72(%rsi),%r14 movq %r13,%rax orq %rbx,%rax cmovzq %r12,%r13 cmovzq %r15,%rbx cmovzq %r11,%r12 movq 64(%rsi),%r11 cmovzq %r14,%r15 movq %r13,%rax orq %rbx,%rax cmovzq %r12,%r13 cmovzq %r15,%rbx cmovzq %r10,%r12 movq 56(%rsi),%r10 cmovzq %r11,%r15 movq %r13,%rax orq %rbx,%rax cmovzq %r12,%r13 cmovzq %r15,%rbx cmovzq %r9,%r12 movq 48(%rsi),%r9 cmovzq %r10,%r15 movq %r13,%rax orq %rbx,%rax cmovzq %r12,%r13 cmovzq %r15,%rbx cmovzq %r8,%r12 cmovzq %r9,%r15 movq %r13,%rax orq %rbx,%rax bsrq %rax,%rcx leaq 1(%rcx),%rcx cmovzq %r8,%r13 cmovzq %r9,%rbx cmovzq %rax,%rcx negq %rcx shldq %cl,%r12,%r13 shldq %cl,%r15,%rbx movq $0xFFFFFFFF00000000,%rax movl %r8d,%r8d movl %r9d,%r9d andq %rax,%r13 andq %rax,%rbx orq %r13,%r8 orq %rbx,%r9 jmp __inner_loop_30 #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .p2align 5 __inner_loop_30: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa movq $0x7FFFFFFF80000000,%rbx movq $0x800000007FFFFFFF,%rcx leaq -1(%rbx),%r15 movl $30,%edi L$oop_30: movq %r8,%rax andq %r9,%rax shrq $1,%rax cmpq %r9,%r8 movq %r8,%r10 movq %r9,%r11 leaq (%rax,%rbp,1),%rax movq %rbx,%r12 movq %rcx,%r13 movq %rbp,%r14 cmovbq %r9,%r8 cmovbq %r10,%r9 cmovbq %rcx,%rbx cmovbq %r12,%rcx cmovbq %rax,%rbp subq %r9,%r8 subq %rcx,%rbx addq %r15,%rbx testq $1,%r10 cmovzq %r10,%r8 cmovzq %r11,%r9 cmovzq %r12,%rbx cmovzq %r13,%rcx cmovzq %r14,%rbp leaq 2(%r9),%rax shrq $1,%r8 shrq $2,%rax addq %rcx,%rcx leaq (%rax,%rbp,1),%rbp subq %r15,%rcx subl $1,%edi jnz L$oop_30 shrq $32,%r15 movl %ebx,%eax shrq $32,%rbx movl %ecx,%edx shrq $32,%rcx subq %r15,%rax subq %r15,%rbx subq %r15,%rdx subq %r15,%rcx #ifdef __SGX_LVI_HARDENING__ popq %r8 lfence jmpq *%r8 ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .p2align 5 __inner_loop_48: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa movl $48,%edi L$oop_48: movq %r8,%rax andq %r9,%rax shrq $1,%rax cmpq %r9,%r8 movq %r8,%r10 movq %r9,%r11 leaq (%rax,%rbp,1),%rax movq %rbp,%r12 cmovbq %r9,%r8 cmovbq %r10,%r9 cmovbq %rax,%rbp subq %r9,%r8 testq $1,%r10 cmovzq %r10,%r8 cmovzq %r11,%r9 cmovzq %r12,%rbp leaq 2(%r9),%rax shrq $1,%r8 shrq $2,%rax addq %rax,%rbp subl $1,%edi jnz L$oop_48 #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc ================================================ FILE: build/mach-o/ctq_inverse_mod_384-x86_64.s ================================================ .comm ___blst_platform_cap,4 .text .globl _ct_inverse_mod_384 .private_extern _ct_inverse_mod_384 .p2align 5 _ct_inverse_mod_384: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa #ifdef __BLST_PORTABLE__ testl $1,___blst_platform_cap(%rip) jnz L$ct_inverse_mod_384$1 #endif pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 pushq %rbx .cfi_adjust_cfa_offset 8 .cfi_offset %rbx,-24 pushq %r12 .cfi_adjust_cfa_offset 8 .cfi_offset %r12,-32 pushq %r13 .cfi_adjust_cfa_offset 8 .cfi_offset %r13,-40 pushq %r14 .cfi_adjust_cfa_offset 8 .cfi_offset %r14,-48 pushq %r15 .cfi_adjust_cfa_offset 8 .cfi_offset %r15,-56 subq $1112,%rsp .cfi_adjust_cfa_offset 1112 leaq 88+511(%rsp),%rax andq $-512,%rax movq %rdi,32(%rsp) 
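/* out_ptr is stashed at 32(%rsp) here and nx_ptr at 40(%rsp) just below,
 * to be reloaded for the final reduction -- the counterpart of the
 * "offload out_ptr, nx_ptr" step in the armv8 version */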
movq %rcx,40(%rsp) movq 0(%rsi),%r8 movq 8(%rsi),%r9 movq 16(%rsi),%r10 movq 24(%rsi),%r11 movq 32(%rsi),%r12 movq 40(%rsi),%r13 movq 0(%rdx),%r14 movq 8(%rdx),%r15 movq 16(%rdx),%rbx movq 24(%rdx),%rbp movq 32(%rdx),%rsi movq 40(%rdx),%rdi movq %r8,0(%rax) movq %r9,8(%rax) movq %r10,16(%rax) movq %r11,24(%rax) movq %r12,32(%rax) movq %r13,40(%rax) movq %r14,48(%rax) movq %r15,56(%rax) movq %rbx,64(%rax) movq %rbp,72(%rax) movq %rsi,80(%rax) movq %rax,%rsi movq %rdi,88(%rax) movl $62,%edi call __ab_approximation_62 movq %r12,72(%rsp) movq %r13,80(%rsp) movq $256,%rdi xorq %rsi,%rdi call __smulq_384_n_shift_by_62 movq %rdx,96(%rdi) movq 72(%rsp),%rdx movq 80(%rsp),%rcx leaq 48(%rdi),%rdi call __smulq_384_n_shift_by_62 movq %rdx,104(%rdi) xorq $256,%rsi movl $62,%edi call __ab_approximation_62 movq %r12,72(%rsp) movq %r13,80(%rsp) movq $256,%rdi xorq %rsi,%rdi call __smulq_384_n_shift_by_62 movq %rdx,56(%rsp) movq %rcx,64(%rsp) movq 72(%rsp),%rdx movq 80(%rsp),%rcx leaq 48(%rdi),%rdi call __smulq_384_n_shift_by_62 movq 96(%rsi),%rax movq 152(%rsi),%r11 movq %rdx,%rbx movq %rax,%r10 imulq 56(%rsp) movq %rax,%r8 movq %r11,%rax movq %rdx,%r9 imulq 64(%rsp) addq %rax,%r8 adcq %rdx,%r9 movq %r8,48(%rdi) movq %r9,56(%rdi) sarq $63,%r9 movq %r9,64(%rdi) movq %r9,72(%rdi) movq %r9,80(%rdi) movq %r9,88(%rdi) movq %r9,96(%rdi) leaq 96(%rsi),%rsi movq %r10,%rax imulq %rbx movq %rax,%r8 movq %r11,%rax movq %rdx,%r9 imulq %rcx addq %rax,%r8 adcq %rdx,%r9 movq %r8,104(%rdi) movq %r9,112(%rdi) sarq $63,%r9 movq %r9,120(%rdi) movq %r9,128(%rdi) movq %r9,136(%rdi) movq %r9,144(%rdi) movq %r9,152(%rdi) xorq $256+96,%rsi movl $62,%edi call __ab_approximation_62 movq %r12,72(%rsp) movq %r13,80(%rsp) movq $256,%rdi xorq %rsi,%rdi call __smulq_384_n_shift_by_62 movq %rdx,56(%rsp) movq %rcx,64(%rsp) movq 72(%rsp),%rdx movq 80(%rsp),%rcx leaq 48(%rdi),%rdi call __smulq_384_n_shift_by_62 movq %rdx,72(%rsp) movq %rcx,80(%rsp) movq 56(%rsp),%rdx movq 64(%rsp),%rcx leaq 96(%rsi),%rsi leaq 48(%rdi),%rdi call __smulq_384x63 movq 72(%rsp),%rdx movq 80(%rsp),%rcx leaq 56(%rdi),%rdi call __smulq_384x63 xorq $256+96,%rsi movl $62,%edi call __ab_approximation_62 movq %r12,72(%rsp) movq %r13,80(%rsp) movq $256,%rdi xorq %rsi,%rdi call __smulq_384_n_shift_by_62 movq %rdx,56(%rsp) movq %rcx,64(%rsp) movq 72(%rsp),%rdx movq 80(%rsp),%rcx leaq 48(%rdi),%rdi call __smulq_384_n_shift_by_62 movq %rdx,72(%rsp) movq %rcx,80(%rsp) movq 56(%rsp),%rdx movq 64(%rsp),%rcx leaq 96(%rsi),%rsi leaq 48(%rdi),%rdi call __smulq_384x63 movq 72(%rsp),%rdx movq 80(%rsp),%rcx leaq 56(%rdi),%rdi call __smulq_384x63 xorq $256+96,%rsi movl $62,%edi call __ab_approximation_62 movq %r12,72(%rsp) movq %r13,80(%rsp) movq $256,%rdi xorq %rsi,%rdi call __smulq_384_n_shift_by_62 movq %rdx,56(%rsp) movq %rcx,64(%rsp) movq 72(%rsp),%rdx movq 80(%rsp),%rcx leaq 48(%rdi),%rdi call __smulq_384_n_shift_by_62 movq %rdx,72(%rsp) movq %rcx,80(%rsp) movq 56(%rsp),%rdx movq 64(%rsp),%rcx leaq 96(%rsi),%rsi leaq 48(%rdi),%rdi call __smulq_384x63 movq 72(%rsp),%rdx movq 80(%rsp),%rcx leaq 56(%rdi),%rdi call __smulq_384x63 xorq $256+96,%rsi movl $62,%edi call __ab_approximation_62 movq %r12,72(%rsp) movq %r13,80(%rsp) movq $256,%rdi xorq %rsi,%rdi call __smulq_384_n_shift_by_62 movq %rdx,56(%rsp) movq %rcx,64(%rsp) movq 72(%rsp),%rdx movq 80(%rsp),%rcx leaq 48(%rdi),%rdi call __smulq_384_n_shift_by_62 movq %rdx,72(%rsp) movq %rcx,80(%rsp) movq 56(%rsp),%rdx movq 64(%rsp),%rcx leaq 96(%rsi),%rsi leaq 48(%rdi),%rdi call __smulq_384x63 movq 72(%rsp),%rdx movq 80(%rsp),%rcx 
leaq 56(%rdi),%rdi call __smulq_384x63 movq %r14,56(%rdi) movq %r14,64(%rdi) movq %r14,72(%rdi) movq %r14,80(%rdi) movq %r14,88(%rdi) xorq $256+96,%rsi movl $62,%edi call __ab_approximation_62 movq %r12,72(%rsp) movq %r13,80(%rsp) movq $256,%rdi xorq %rsi,%rdi call __smulq_384_n_shift_by_62 movq %rdx,56(%rsp) movq %rcx,64(%rsp) movq 72(%rsp),%rdx movq 80(%rsp),%rcx leaq 48(%rdi),%rdi call __smulq_384_n_shift_by_62 movq %rdx,72(%rsp) movq %rcx,80(%rsp) movq 56(%rsp),%rdx movq 64(%rsp),%rcx leaq 96(%rsi),%rsi leaq 48(%rdi),%rdi call __smulq_384x63 movq 72(%rsp),%rdx movq 80(%rsp),%rcx leaq 56(%rdi),%rdi call __smulq_768x63 xorq $256+96,%rsi movl $62,%edi call __ab_approximation_62 movq %r12,72(%rsp) movq %r13,80(%rsp) movq $256,%rdi xorq %rsi,%rdi call __smulq_384_n_shift_by_62 movq %rdx,56(%rsp) movq %rcx,64(%rsp) movq 72(%rsp),%rdx movq 80(%rsp),%rcx leaq 48(%rdi),%rdi call __smulq_384_n_shift_by_62 movq %rdx,72(%rsp) movq %rcx,80(%rsp) movq 56(%rsp),%rdx movq 64(%rsp),%rcx leaq 96(%rsi),%rsi leaq 48(%rdi),%rdi call __smulq_384x63 movq 72(%rsp),%rdx movq 80(%rsp),%rcx leaq 56(%rdi),%rdi call __smulq_768x63 xorq $256+96,%rsi movl $62,%edi call __ab_approximation_62 movq %r12,72(%rsp) movq %r13,80(%rsp) movq $256,%rdi xorq %rsi,%rdi call __smulq_384_n_shift_by_62 movq %rdx,56(%rsp) movq %rcx,64(%rsp) movq 72(%rsp),%rdx movq 80(%rsp),%rcx leaq 48(%rdi),%rdi call __smulq_384_n_shift_by_62 movq %rdx,72(%rsp) movq %rcx,80(%rsp) movq 56(%rsp),%rdx movq 64(%rsp),%rcx leaq 96(%rsi),%rsi leaq 48(%rdi),%rdi call __smulq_384x63 movq 72(%rsp),%rdx movq 80(%rsp),%rcx leaq 56(%rdi),%rdi call __smulq_768x63 xorq $256+96,%rsi movl $62,%edi call __ab_approximation_62 movq %r12,72(%rsp) movq %r13,80(%rsp) movq $256,%rdi xorq %rsi,%rdi call __smulq_384_n_shift_by_62 movq %rdx,56(%rsp) movq %rcx,64(%rsp) movq 72(%rsp),%rdx movq 80(%rsp),%rcx leaq 48(%rdi),%rdi call __smulq_384_n_shift_by_62 movq %rdx,72(%rsp) movq %rcx,80(%rsp) movq 56(%rsp),%rdx movq 64(%rsp),%rcx leaq 96(%rsi),%rsi leaq 48(%rdi),%rdi call __smulq_384x63 movq 72(%rsp),%rdx movq 80(%rsp),%rcx leaq 56(%rdi),%rdi call __smulq_768x63 xorq $256+96,%rsi movl $62,%edi call __ab_approximation_62 movq %r12,72(%rsp) movq %r13,80(%rsp) movq $256,%rdi xorq %rsi,%rdi call __smulq_384_n_shift_by_62 movq %rdx,56(%rsp) movq %rcx,64(%rsp) movq 72(%rsp),%rdx movq 80(%rsp),%rcx leaq 48(%rdi),%rdi call __smulq_384_n_shift_by_62 movq %rdx,72(%rsp) movq %rcx,80(%rsp) movq 56(%rsp),%rdx movq 64(%rsp),%rcx leaq 96(%rsi),%rsi leaq 48(%rdi),%rdi call __smulq_384x63 movq 72(%rsp),%rdx movq 80(%rsp),%rcx leaq 56(%rdi),%rdi call __smulq_768x63 xorq $256+96,%rsi movl $62,%edi movq 0(%rsi),%r8 movq 8(%rsi),%r9 movq 48(%rsi),%r10 movq 56(%rsi),%r11 call __inner_loop_62 movq %r12,72(%rsp) movq %r13,80(%rsp) movq $256,%rdi xorq %rsi,%rdi movq %r8,0(%rdi) movq %r10,48(%rdi) leaq 96(%rsi),%rsi leaq 96(%rdi),%rdi call __smulq_384x63 movq 72(%rsp),%rdx movq 80(%rsp),%rcx leaq 56(%rdi),%rdi call __smulq_768x63 xorq $256+96,%rsi movl $24,%edi movq 0(%rsi),%r8 xorq %r9,%r9 movq 48(%rsi),%r10 xorq %r11,%r11 call __inner_loop_62 leaq 96(%rsi),%rsi movq %r12,%rdx movq %r13,%rcx movq 32(%rsp),%rdi call __smulq_768x63 movq 40(%rsp),%rsi movq %rdx,%r13 sarq $63,%r13 movq %r13,%r8 movq %r13,%r9 movq %r13,%r10 andq 0(%rsi),%r8 andq 8(%rsi),%r9 movq %r13,%r11 andq 16(%rsi),%r10 andq 24(%rsi),%r11 movq %r13,%r12 andq 32(%rsi),%r12 andq 40(%rsi),%r13 addq %r8,%r14 adcq %r9,%r15 adcq %r10,%rbx adcq %r11,%rbp adcq %r12,%rcx adcq %r13,%rax adcq $0,%rdx movq %rdx,%r13 negq %rdx orq %rdx,%r13 sarq 
$63,%rdx movq %r13,%r8 movq %r13,%r9 movq %r13,%r10 andq 0(%rsi),%r8 andq 8(%rsi),%r9 movq %r13,%r11 andq 16(%rsi),%r10 andq 24(%rsi),%r11 movq %r13,%r12 andq 32(%rsi),%r12 andq 40(%rsi),%r13 xorq %rdx,%r8 xorq %rsi,%rsi xorq %rdx,%r9 subq %rdx,%rsi xorq %rdx,%r10 xorq %rdx,%r11 xorq %rdx,%r12 xorq %rdx,%r13 addq %rsi,%r8 adcq $0,%r9 adcq $0,%r10 adcq $0,%r11 adcq $0,%r12 adcq $0,%r13 addq %r8,%r14 adcq %r9,%r15 adcq %r10,%rbx adcq %r11,%rbp adcq %r12,%rcx adcq %r13,%rax movq %r14,48(%rdi) movq %r15,56(%rdi) movq %rbx,64(%rdi) movq %rbp,72(%rdi) movq %rcx,80(%rdi) movq %rax,88(%rdi) leaq 1112(%rsp),%r8 movq 0(%r8),%r15 .cfi_restore %r15 movq 8(%r8),%r14 .cfi_restore %r14 movq 16(%r8),%r13 .cfi_restore %r13 movq 24(%r8),%r12 .cfi_restore %r12 movq 32(%r8),%rbx .cfi_restore %rbx movq 40(%r8),%rbp .cfi_restore %rbp leaq 48(%r8),%rsp .cfi_adjust_cfa_offset -1112-8*6 #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .p2align 5 __smulq_768x63: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa movq 0(%rsi),%r8 movq 8(%rsi),%r9 movq 16(%rsi),%r10 movq 24(%rsi),%r11 movq 32(%rsi),%r12 movq 40(%rsi),%r13 movq 48(%rsi),%r14 movq %rdx,%rbp sarq $63,%rdx xorq %rax,%rax subq %rdx,%rax movq %rdi,8(%rsp) movq %rsi,16(%rsp) leaq 56(%rsi),%rsi xorq %rdx,%rbp addq %rax,%rbp xorq %rdx,%r8 xorq %rdx,%r9 xorq %rdx,%r10 xorq %rdx,%r11 xorq %rdx,%r12 xorq %rdx,%r13 xorq %rdx,%r14 addq %r8,%rax adcq $0,%r9 adcq $0,%r10 adcq $0,%r11 adcq $0,%r12 adcq $0,%r13 adcq $0,%r14 mulq %rbp movq %rax,0(%rdi) movq %r9,%rax andq %rbp,%r14 negq %r14 movq %rdx,%r9 mulq %rbp addq %rax,%r9 movq %r10,%rax adcq $0,%rdx movq %rdx,%r10 movq %r9,8(%rdi) mulq %rbp addq %rax,%r10 movq %r11,%rax adcq $0,%rdx movq %rdx,%r11 movq %r10,16(%rdi) mulq %rbp addq %rax,%r11 movq %r12,%rax adcq $0,%rdx movq %rdx,%r12 movq %r11,24(%rdi) mulq %rbp addq %rax,%r12 movq %r13,%rax adcq $0,%rdx movq %rdx,%r13 movq %r12,32(%rdi) mulq %rbp addq %rax,%r13 adcq %rdx,%r14 movq %r13,40(%rdi) movq %r14,48(%rdi) sarq $63,%r14 movq %r14,56(%rdi) movq %rcx,%rdx movq 0(%rsi),%r8 movq 8(%rsi),%r9 movq 16(%rsi),%r10 movq 24(%rsi),%r11 movq 32(%rsi),%r12 movq 40(%rsi),%r13 movq 48(%rsi),%r14 movq 56(%rsi),%r15 movq 64(%rsi),%rbx movq 72(%rsi),%rbp movq 80(%rsi),%rcx movq 88(%rsi),%rdi movq %rdx,%rsi sarq $63,%rdx xorq %rax,%rax subq %rdx,%rax xorq %rdx,%rsi addq %rax,%rsi xorq %rdx,%r8 xorq %rdx,%r9 xorq %rdx,%r10 xorq %rdx,%r11 xorq %rdx,%r12 xorq %rdx,%r13 xorq %rdx,%r14 xorq %rdx,%r15 xorq %rdx,%rbx xorq %rdx,%rbp xorq %rdx,%rcx xorq %rdx,%rdi addq %r8,%rax adcq $0,%r9 adcq $0,%r10 adcq $0,%r11 adcq $0,%r12 adcq $0,%r13 adcq $0,%r14 adcq $0,%r15 adcq $0,%rbx adcq $0,%rbp adcq $0,%rcx adcq $0,%rdi mulq %rsi movq %rax,%r8 movq %r9,%rax movq %rdx,%r9 mulq %rsi addq %rax,%r9 movq %r10,%rax adcq $0,%rdx movq %rdx,%r10 mulq %rsi addq %rax,%r10 movq %r11,%rax adcq $0,%rdx movq %rdx,%r11 mulq %rsi addq %rax,%r11 movq %r12,%rax adcq $0,%rdx movq %rdx,%r12 mulq %rsi addq %rax,%r12 movq %r13,%rax adcq $0,%rdx movq %rdx,%r13 mulq %rsi addq %rax,%r13 movq %r14,%rax adcq $0,%rdx movq %rdx,%r14 mulq %rsi addq %rax,%r14 movq %r15,%rax adcq $0,%rdx movq %rdx,%r15 mulq %rsi addq %rax,%r15 movq %rbx,%rax adcq $0,%rdx movq %rdx,%rbx mulq %rsi addq %rax,%rbx movq %rbp,%rax adcq $0,%rdx movq %rdx,%rbp mulq %rsi addq %rax,%rbp movq %rcx,%rax adcq $0,%rdx movq %rdx,%rcx mulq %rsi addq %rax,%rcx movq %rdi,%rax adcq $0,%rdx movq %rdx,%rdi imulq %rsi movq 8(%rsp),%rsi addq %rdi,%rax adcq $0,%rdx addq 0(%rsi),%r8 adcq 8(%rsi),%r9 adcq 16(%rsi),%r10 
adcq 24(%rsi),%r11 adcq 32(%rsi),%r12 adcq 40(%rsi),%r13 adcq 48(%rsi),%r14 movq 56(%rsi),%rdi adcq %rdi,%r15 adcq %rdi,%rbx adcq %rdi,%rbp adcq %rdi,%rcx adcq %rdi,%rax adcq %rdi,%rdx leaq (%rsi),%rdi movq 16(%rsp),%rsi movq %r8,0(%rdi) movq %r9,8(%rdi) movq %r10,16(%rdi) movq %r11,24(%rdi) movq %r12,32(%rdi) movq %r13,40(%rdi) movq %r14,48(%rdi) movq %r15,56(%rdi) movq %rbx,64(%rdi) movq %rbp,72(%rdi) movq %rcx,80(%rdi) movq %rax,88(%rdi) #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .p2align 5 __smulq_384x63: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa movq 0(%rsi),%r8 movq 8(%rsi),%r9 movq 16(%rsi),%r10 movq 24(%rsi),%r11 movq 32(%rsi),%r12 movq 40(%rsi),%r13 movq 48(%rsi),%r14 movq %rdx,%rbp sarq $63,%rdx xorq %rax,%rax subq %rdx,%rax xorq %rdx,%rbp addq %rax,%rbp xorq %rdx,%r8 xorq %rdx,%r9 xorq %rdx,%r10 xorq %rdx,%r11 xorq %rdx,%r12 xorq %rdx,%r13 xorq %rdx,%r14 addq %r8,%rax adcq $0,%r9 adcq $0,%r10 adcq $0,%r11 adcq $0,%r12 adcq $0,%r13 adcq $0,%r14 mulq %rbp movq %rax,%r8 movq %r9,%rax andq %rbp,%r14 negq %r14 movq %rdx,%r9 mulq %rbp addq %rax,%r9 movq %r10,%rax adcq $0,%rdx movq %rdx,%r10 mulq %rbp addq %rax,%r10 movq %r11,%rax adcq $0,%rdx movq %rdx,%r11 mulq %rbp addq %rax,%r11 movq %r12,%rax adcq $0,%rdx movq %rdx,%r12 mulq %rbp addq %rax,%r12 movq %r13,%rax adcq $0,%rdx movq %rdx,%r13 mulq %rbp addq %rax,%r13 adcq %rdx,%r14 leaq 56(%rsi),%rsi movq %rcx,%rdx movq %r8,0(%rdi) movq %r9,8(%rdi) movq %r10,16(%rdi) movq %r11,24(%rdi) movq %r12,32(%rdi) movq %r13,%r15 movq %r14,%rbx movq 0(%rsi),%r8 movq 8(%rsi),%r9 movq 16(%rsi),%r10 movq 24(%rsi),%r11 movq 32(%rsi),%r12 movq 40(%rsi),%r13 movq 48(%rsi),%r14 movq %rdx,%rbp sarq $63,%rdx xorq %rax,%rax subq %rdx,%rax xorq %rdx,%rbp addq %rax,%rbp xorq %rdx,%r8 xorq %rdx,%r9 xorq %rdx,%r10 xorq %rdx,%r11 xorq %rdx,%r12 xorq %rdx,%r13 xorq %rdx,%r14 addq %r8,%rax adcq $0,%r9 adcq $0,%r10 adcq $0,%r11 adcq $0,%r12 adcq $0,%r13 adcq $0,%r14 mulq %rbp movq %rax,%r8 movq %r9,%rax andq %rbp,%r14 negq %r14 movq %rdx,%r9 mulq %rbp addq %rax,%r9 movq %r10,%rax adcq $0,%rdx movq %rdx,%r10 mulq %rbp addq %rax,%r10 movq %r11,%rax adcq $0,%rdx movq %rdx,%r11 mulq %rbp addq %rax,%r11 movq %r12,%rax adcq $0,%rdx movq %rdx,%r12 mulq %rbp addq %rax,%r12 movq %r13,%rax adcq $0,%rdx movq %rdx,%r13 mulq %rbp addq %rax,%r13 adcq %rdx,%r14 leaq -56(%rsi),%rsi addq 0(%rdi),%r8 adcq 8(%rdi),%r9 adcq 16(%rdi),%r10 adcq 24(%rdi),%r11 adcq 32(%rdi),%r12 adcq %r15,%r13 adcq %rbx,%r14 movq %r8,0(%rdi) movq %r9,8(%rdi) movq %r10,16(%rdi) movq %r11,24(%rdi) movq %r12,32(%rdi) movq %r13,40(%rdi) movq %r14,48(%rdi) #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .p2align 5 __smulq_384_n_shift_by_62: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa movq %rdx,%rbx movq 0(%rsi),%r8 movq 8(%rsi),%r9 movq 16(%rsi),%r10 movq 24(%rsi),%r11 movq 32(%rsi),%r12 movq 40(%rsi),%r13 movq %rdx,%rbp sarq $63,%rdx xorq %rax,%rax subq %rdx,%rax xorq %rdx,%rbp addq %rax,%rbp xorq %rdx,%r8 xorq %rdx,%r9 xorq %rdx,%r10 xorq %rdx,%r11 xorq %rdx,%r12 xorq %rdx,%r13 movq %rdx,%r14 addq %r8,%rax adcq $0,%r9 adcq $0,%r10 adcq $0,%r11 adcq $0,%r12 adcq $0,%r13 mulq %rbp movq %rax,%r8 movq %r9,%rax andq %rbp,%r14 negq %r14 movq %rdx,%r9 mulq %rbp addq %rax,%r9 movq %r10,%rax adcq $0,%rdx movq %rdx,%r10 mulq %rbp addq %rax,%r10 movq %r11,%rax adcq $0,%rdx movq %rdx,%r11 mulq %rbp addq %rax,%r11 movq %r12,%rax adcq $0,%rdx movq %rdx,%r12 mulq %rbp addq %rax,%r12 movq %r13,%rax adcq $0,%rdx 
movq %rdx,%r13 mulq %rbp addq %rax,%r13 adcq %rdx,%r14 leaq 48(%rsi),%rsi movq %rcx,%rdx movq %r8,0(%rdi) movq %r9,8(%rdi) movq %r10,16(%rdi) movq %r11,24(%rdi) movq %r12,32(%rdi) movq %r13,40(%rdi) movq 0(%rsi),%r8 movq 8(%rsi),%r9 movq 16(%rsi),%r10 movq 24(%rsi),%r11 movq 32(%rsi),%r12 movq 40(%rsi),%r13 movq %rdx,%rbp sarq $63,%rdx xorq %rax,%rax subq %rdx,%rax xorq %rdx,%rbp addq %rax,%rbp xorq %rdx,%r8 xorq %rdx,%r9 xorq %rdx,%r10 xorq %rdx,%r11 xorq %rdx,%r12 xorq %rdx,%r13 movq %rdx,%r15 addq %r8,%rax adcq $0,%r9 adcq $0,%r10 adcq $0,%r11 adcq $0,%r12 adcq $0,%r13 mulq %rbp movq %rax,%r8 movq %r9,%rax andq %rbp,%r15 negq %r15 movq %rdx,%r9 mulq %rbp addq %rax,%r9 movq %r10,%rax adcq $0,%rdx movq %rdx,%r10 mulq %rbp addq %rax,%r10 movq %r11,%rax adcq $0,%rdx movq %rdx,%r11 mulq %rbp addq %rax,%r11 movq %r12,%rax adcq $0,%rdx movq %rdx,%r12 mulq %rbp addq %rax,%r12 movq %r13,%rax adcq $0,%rdx movq %rdx,%r13 mulq %rbp addq %rax,%r13 adcq %rdx,%r15 leaq -48(%rsi),%rsi movq %rbx,%rdx addq 0(%rdi),%r8 adcq 8(%rdi),%r9 adcq 16(%rdi),%r10 adcq 24(%rdi),%r11 adcq 32(%rdi),%r12 adcq 40(%rdi),%r13 adcq %r15,%r14 shrdq $62,%r9,%r8 shrdq $62,%r10,%r9 shrdq $62,%r11,%r10 shrdq $62,%r12,%r11 shrdq $62,%r13,%r12 shrdq $62,%r14,%r13 sarq $63,%r14 xorq %rbp,%rbp subq %r14,%rbp xorq %r14,%r8 xorq %r14,%r9 xorq %r14,%r10 xorq %r14,%r11 xorq %r14,%r12 xorq %r14,%r13 addq %rbp,%r8 adcq $0,%r9 adcq $0,%r10 adcq $0,%r11 adcq $0,%r12 adcq $0,%r13 movq %r8,0(%rdi) movq %r9,8(%rdi) movq %r10,16(%rdi) movq %r11,24(%rdi) movq %r12,32(%rdi) movq %r13,40(%rdi) xorq %r14,%rdx xorq %r14,%rcx addq %rbp,%rdx addq %rbp,%rcx #ifdef __SGX_LVI_HARDENING__ popq %r8 lfence jmpq *%r8 ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .p2align 5 __ab_approximation_62: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa movq 40(%rsi),%r9 movq 88(%rsi),%r11 movq 32(%rsi),%rbx movq 80(%rsi),%rbp movq 24(%rsi),%r8 movq 72(%rsi),%r10 movq %r9,%rax orq %r11,%rax cmovzq %rbx,%r9 cmovzq %rbp,%r11 cmovzq %r8,%rbx cmovzq %r10,%rbp movq 16(%rsi),%r8 movq 64(%rsi),%r10 movq %r9,%rax orq %r11,%rax cmovzq %rbx,%r9 cmovzq %rbp,%r11 cmovzq %r8,%rbx cmovzq %r10,%rbp movq 8(%rsi),%r8 movq 56(%rsi),%r10 movq %r9,%rax orq %r11,%rax cmovzq %rbx,%r9 cmovzq %rbp,%r11 cmovzq %r8,%rbx cmovzq %r10,%rbp movq 0(%rsi),%r8 movq 48(%rsi),%r10 movq %r9,%rax orq %r11,%rax bsrq %rax,%rcx leaq 1(%rcx),%rcx cmovzq %rbx,%r9 cmovzq %rbp,%r11 cmovzq %rax,%rcx negq %rcx shldq %cl,%rbx,%r9 shldq %cl,%rbp,%r11 jmp __inner_loop_62 #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .p2align 3 .long 0 __inner_loop_62: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa movq $1,%rdx xorq %rcx,%rcx xorq %r12,%r12 movq $1,%r13 movq %rsi,8(%rsp) L$oop_62: xorq %rax,%rax xorq %rbx,%rbx testq $1,%r8 movq %r10,%rbp movq %r11,%r14 cmovnzq %r10,%rax cmovnzq %r11,%rbx subq %r8,%rbp sbbq %r9,%r14 movq %r8,%r15 movq %r9,%rsi subq %rax,%r8 sbbq %rbx,%r9 cmovcq %rbp,%r8 cmovcq %r14,%r9 cmovcq %r15,%r10 cmovcq %rsi,%r11 movq %rdx,%rax cmovcq %r12,%rdx cmovcq %rax,%r12 movq %rcx,%rbx cmovcq %r13,%rcx cmovcq %rbx,%r13 xorq %rax,%rax xorq %rbx,%rbx shrdq $1,%r9,%r8 shrq $1,%r9 testq $1,%r15 cmovnzq %r12,%rax cmovnzq %r13,%rbx addq %r12,%r12 addq %r13,%r13 subq %rax,%rdx subq %rbx,%rcx subl $1,%edi jnz L$oop_62 movq 8(%rsp),%rsi #ifdef __SGX_LVI_HARDENING__ popq %rax lfence jmpq *%rax ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc ================================================ FILE: build/mach-o/ctx_inverse_mod_384-x86_64.s 
================================================ .text .globl _ctx_inverse_mod_384 .private_extern _ctx_inverse_mod_384 .p2align 5 _ctx_inverse_mod_384: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa L$ct_inverse_mod_384$1: pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 pushq %rbx .cfi_adjust_cfa_offset 8 .cfi_offset %rbx,-24 pushq %r12 .cfi_adjust_cfa_offset 8 .cfi_offset %r12,-32 pushq %r13 .cfi_adjust_cfa_offset 8 .cfi_offset %r13,-40 pushq %r14 .cfi_adjust_cfa_offset 8 .cfi_offset %r14,-48 pushq %r15 .cfi_adjust_cfa_offset 8 .cfi_offset %r15,-56 subq $1112,%rsp .cfi_adjust_cfa_offset 1112 leaq 88+511(%rsp),%rax andq $-512,%rax movq %rdi,32(%rsp) movq %rcx,40(%rsp) #ifdef __SGX_LVI_HARDENING__ lfence #endif movq 0(%rsi),%r8 movq 8(%rsi),%r9 movq 16(%rsi),%r10 movq 24(%rsi),%r11 movq 32(%rsi),%r12 movq 40(%rsi),%r13 movq 0(%rdx),%r14 movq 8(%rdx),%r15 movq 16(%rdx),%rbx movq 24(%rdx),%rbp movq 32(%rdx),%rsi movq 40(%rdx),%rdi movq %r8,0(%rax) movq %r9,8(%rax) movq %r10,16(%rax) movq %r11,24(%rax) movq %r12,32(%rax) movq %r13,40(%rax) movq %r14,48(%rax) movq %r15,56(%rax) movq %rbx,64(%rax) movq %rbp,72(%rax) movq %rsi,80(%rax) movq %rax,%rsi movq %rdi,88(%rax) movl $31,%edi call __ab_approximation_31 movq %r12,72(%rsp) movq %r13,80(%rsp) movq $256,%rdi xorq %rsi,%rdi call __smulx_384_n_shift_by_31 movq %rdx,96(%rdi) movq 72(%rsp),%rdx movq 80(%rsp),%rcx leaq 48(%rdi),%rdi call __smulx_384_n_shift_by_31 movq %rdx,104(%rdi) xorq $256,%rsi movl $31,%edi call __ab_approximation_31 movq %r12,72(%rsp) movq %r13,80(%rsp) movq $256,%rdi xorq %rsi,%rdi call __smulx_384_n_shift_by_31 movq %rdx,56(%rsp) movq %rcx,64(%rsp) movq 72(%rsp),%rdx movq 80(%rsp),%rcx leaq 48(%rdi),%rdi call __smulx_384_n_shift_by_31 movq 96(%rsi),%rax movq 152(%rsi),%r11 movq %rdx,%rbx movq %rax,%r10 imulq 56(%rsp) movq %rax,%r8 movq %r11,%rax movq %rdx,%r9 imulq 64(%rsp) addq %rax,%r8 adcq %rdx,%r9 movq %r8,48(%rdi) movq %r9,56(%rdi) sarq $63,%r9 movq %r9,64(%rdi) movq %r9,72(%rdi) movq %r9,80(%rdi) movq %r9,88(%rdi) movq %r9,96(%rdi) leaq 96(%rsi),%rsi movq %r10,%rax imulq %rbx movq %rax,%r8 movq %r11,%rax movq %rdx,%r9 imulq %rcx addq %rax,%r8 adcq %rdx,%r9 movq %r8,104(%rdi) movq %r9,112(%rdi) sarq $63,%r9 movq %r9,120(%rdi) movq %r9,128(%rdi) movq %r9,136(%rdi) movq %r9,144(%rdi) movq %r9,152(%rdi) xorq $256+96,%rsi movl $31,%edi call __ab_approximation_31 movq %r12,72(%rsp) movq %r13,80(%rsp) movq $256,%rdi xorq %rsi,%rdi call __smulx_384_n_shift_by_31 movq %rdx,56(%rsp) movq %rcx,64(%rsp) movq 72(%rsp),%rdx movq 80(%rsp),%rcx leaq 48(%rdi),%rdi call __smulx_384_n_shift_by_31 movq %rdx,72(%rsp) movq %rcx,80(%rsp) movq 56(%rsp),%rdx movq 64(%rsp),%rcx leaq 96(%rsi),%rsi leaq 48(%rdi),%rdi call __smulx_384x63 movq 72(%rsp),%rdx movq 80(%rsp),%rcx leaq 56(%rdi),%rdi call __smulx_384x63 xorq $256+96,%rsi movl $31,%edi call __ab_approximation_31 movq %r12,72(%rsp) movq %r13,80(%rsp) movq $256,%rdi xorq %rsi,%rdi call __smulx_384_n_shift_by_31 movq %rdx,56(%rsp) movq %rcx,64(%rsp) movq 72(%rsp),%rdx movq 80(%rsp),%rcx leaq 48(%rdi),%rdi call __smulx_384_n_shift_by_31 movq %rdx,72(%rsp) movq %rcx,80(%rsp) movq 56(%rsp),%rdx movq 64(%rsp),%rcx leaq 96(%rsi),%rsi leaq 48(%rdi),%rdi call __smulx_384x63 movq 72(%rsp),%rdx movq 80(%rsp),%rcx leaq 56(%rdi),%rdi call __smulx_384x63 xorq $256+96,%rsi movl $31,%edi call __ab_approximation_31 movq %r12,72(%rsp) movq %r13,80(%rsp) movq $256,%rdi xorq %rsi,%rdi call __smulx_384_n_shift_by_31 movq %rdx,56(%rsp) movq %rcx,64(%rsp) movq 72(%rsp),%rdx movq 80(%rsp),%rcx leaq 
48(%rdi),%rdi call __smulx_384_n_shift_by_31 movq %rdx,72(%rsp) movq %rcx,80(%rsp) movq 56(%rsp),%rdx movq 64(%rsp),%rcx leaq 96(%rsi),%rsi leaq 48(%rdi),%rdi call __smulx_384x63 movq 72(%rsp),%rdx movq 80(%rsp),%rcx leaq 56(%rdi),%rdi call __smulx_384x63 xorq $256+96,%rsi movl $31,%edi call __ab_approximation_31 movq %r12,72(%rsp) movq %r13,80(%rsp) movq $256,%rdi xorq %rsi,%rdi call __smulx_384_n_shift_by_31 movq %rdx,56(%rsp) movq %rcx,64(%rsp) movq 72(%rsp),%rdx movq 80(%rsp),%rcx leaq 48(%rdi),%rdi call __smulx_384_n_shift_by_31 movq %rdx,72(%rsp) movq %rcx,80(%rsp) movq 56(%rsp),%rdx movq 64(%rsp),%rcx leaq 96(%rsi),%rsi leaq 48(%rdi),%rdi call __smulx_384x63 movq 72(%rsp),%rdx movq 80(%rsp),%rcx leaq 56(%rdi),%rdi call __smulx_384x63 xorq $256+96,%rsi movl $31,%edi call __ab_approximation_31 movq %r12,72(%rsp) movq %r13,80(%rsp) movq $256,%rdi xorq %rsi,%rdi call __smulx_384_n_shift_by_31 movq %rdx,56(%rsp) movq %rcx,64(%rsp) movq 72(%rsp),%rdx movq 80(%rsp),%rcx leaq 48(%rdi),%rdi call __smulx_384_n_shift_by_31 movq %rdx,72(%rsp) movq %rcx,80(%rsp) movq 56(%rsp),%rdx movq 64(%rsp),%rcx leaq 96(%rsi),%rsi leaq 48(%rdi),%rdi call __smulx_384x63 movq 72(%rsp),%rdx movq 80(%rsp),%rcx leaq 56(%rdi),%rdi call __smulx_384x63 xorq $256+96,%rsi movl $31,%edi call __ab_approximation_31 movq %r12,72(%rsp) movq %r13,80(%rsp) movq $256,%rdi xorq %rsi,%rdi call __smulx_384_n_shift_by_31 movq %rdx,56(%rsp) movq %rcx,64(%rsp) movq 72(%rsp),%rdx movq 80(%rsp),%rcx leaq 48(%rdi),%rdi call __smulx_384_n_shift_by_31 movq %rdx,72(%rsp) movq %rcx,80(%rsp) movq 56(%rsp),%rdx movq 64(%rsp),%rcx leaq 96(%rsi),%rsi leaq 48(%rdi),%rdi call __smulx_384x63 movq 72(%rsp),%rdx movq 80(%rsp),%rcx leaq 56(%rdi),%rdi call __smulx_384x63 xorq $256+96,%rsi movl $31,%edi call __ab_approximation_31 movq %r12,72(%rsp) movq %r13,80(%rsp) movq $256,%rdi xorq %rsi,%rdi call __smulx_384_n_shift_by_31 movq %rdx,56(%rsp) movq %rcx,64(%rsp) movq 72(%rsp),%rdx movq 80(%rsp),%rcx leaq 48(%rdi),%rdi call __smulx_384_n_shift_by_31 movq %rdx,72(%rsp) movq %rcx,80(%rsp) movq 56(%rsp),%rdx movq 64(%rsp),%rcx leaq 96(%rsi),%rsi leaq 48(%rdi),%rdi call __smulx_384x63 movq 72(%rsp),%rdx movq 80(%rsp),%rcx leaq 56(%rdi),%rdi call __smulx_384x63 xorq $256+96,%rsi movl $31,%edi call __ab_approximation_31 movq %r12,72(%rsp) movq %r13,80(%rsp) movq $256,%rdi xorq %rsi,%rdi call __smulx_384_n_shift_by_31 movq %rdx,56(%rsp) movq %rcx,64(%rsp) movq 72(%rsp),%rdx movq 80(%rsp),%rcx leaq 48(%rdi),%rdi call __smulx_384_n_shift_by_31 movq %rdx,72(%rsp) movq %rcx,80(%rsp) movq 56(%rsp),%rdx movq 64(%rsp),%rcx leaq 96(%rsi),%rsi leaq 48(%rdi),%rdi call __smulx_384x63 movq 72(%rsp),%rdx movq 80(%rsp),%rcx leaq 56(%rdi),%rdi call __smulx_384x63 xorq $256+96,%rsi movl $31,%edi call __ab_approximation_31 movq %r12,72(%rsp) movq %r13,80(%rsp) movq $256,%rdi xorq %rsi,%rdi call __smulx_384_n_shift_by_31 movq %rdx,56(%rsp) movq %rcx,64(%rsp) movq 72(%rsp),%rdx movq 80(%rsp),%rcx leaq 48(%rdi),%rdi call __smulx_384_n_shift_by_31 movq %rdx,72(%rsp) movq %rcx,80(%rsp) movq 56(%rsp),%rdx movq 64(%rsp),%rcx leaq 96(%rsi),%rsi leaq 48(%rdi),%rdi call __smulx_384x63 movq 72(%rsp),%rdx movq 80(%rsp),%rcx leaq 56(%rdi),%rdi call __smulx_384x63 xorq $256+96,%rsi movl $31,%edi call __ab_approximation_31 movq %r12,72(%rsp) movq %r13,80(%rsp) movq $256,%rdi xorq %rsi,%rdi call __smulx_384_n_shift_by_31 movq %rdx,56(%rsp) movq %rcx,64(%rsp) movq 72(%rsp),%rdx movq 80(%rsp),%rcx leaq 48(%rdi),%rdi call __smulx_384_n_shift_by_31 movq %rdx,72(%rsp) movq %rcx,80(%rsp) movq 
56(%rsp),%rdx movq 64(%rsp),%rcx leaq 96(%rsi),%rsi leaq 48(%rdi),%rdi call __smulx_384x63 movq 72(%rsp),%rdx movq 80(%rsp),%rcx leaq 56(%rdi),%rdi call __smulx_384x63 movq %r14,56(%rdi) movq %r14,64(%rdi) movq %r14,72(%rdi) movq %r14,80(%rdi) movq %r14,88(%rdi) xorq $256+96,%rsi movl $31,%edi call __ab_approximation_31 movq %r12,72(%rsp) movq %r13,80(%rsp) movq $256,%rdi xorq %rsi,%rdi call __smulx_384_n_shift_by_31 movq %rdx,56(%rsp) movq %rcx,64(%rsp) movq 72(%rsp),%rdx movq 80(%rsp),%rcx leaq 48(%rdi),%rdi call __smulx_384_n_shift_by_31 movq %rdx,72(%rsp) movq %rcx,80(%rsp) movq 56(%rsp),%rdx movq 64(%rsp),%rcx leaq 96(%rsi),%rsi leaq 48(%rdi),%rdi call __smulx_384x63 movq 72(%rsp),%rdx movq 80(%rsp),%rcx leaq 56(%rdi),%rdi call __smulx_768x63 xorq $256+96,%rsi movl $31,%edi call __ab_approximation_31 movq %r12,72(%rsp) movq %r13,80(%rsp) movq $256,%rdi xorq %rsi,%rdi call __smulx_384_n_shift_by_31 movq %rdx,56(%rsp) movq %rcx,64(%rsp) movq 72(%rsp),%rdx movq 80(%rsp),%rcx leaq 48(%rdi),%rdi call __smulx_384_n_shift_by_31 movq %rdx,72(%rsp) movq %rcx,80(%rsp) movq 56(%rsp),%rdx movq 64(%rsp),%rcx leaq 96(%rsi),%rsi leaq 48(%rdi),%rdi call __smulx_384x63 movq 72(%rsp),%rdx movq 80(%rsp),%rcx leaq 56(%rdi),%rdi call __smulx_768x63 xorq $256+96,%rsi movl $31,%edi call __ab_approximation_31 movq %r12,72(%rsp) movq %r13,80(%rsp) movq $256,%rdi xorq %rsi,%rdi call __smulx_384_n_shift_by_31 movq %rdx,56(%rsp) movq %rcx,64(%rsp) movq 72(%rsp),%rdx movq 80(%rsp),%rcx leaq 48(%rdi),%rdi call __smulx_384_n_shift_by_31 movq %rdx,72(%rsp) movq %rcx,80(%rsp) movq 56(%rsp),%rdx movq 64(%rsp),%rcx leaq 96(%rsi),%rsi leaq 48(%rdi),%rdi call __smulx_384x63 movq 72(%rsp),%rdx movq 80(%rsp),%rcx leaq 56(%rdi),%rdi call __smulx_768x63 xorq $256+96,%rsi movl $31,%edi call __ab_approximation_31 movq %r12,72(%rsp) movq %r13,80(%rsp) movq $256,%rdi xorq %rsi,%rdi call __smulx_384_n_shift_by_31 movq %rdx,56(%rsp) movq %rcx,64(%rsp) movq 72(%rsp),%rdx movq 80(%rsp),%rcx leaq 48(%rdi),%rdi call __smulx_384_n_shift_by_31 movq %rdx,72(%rsp) movq %rcx,80(%rsp) movq 56(%rsp),%rdx movq 64(%rsp),%rcx leaq 96(%rsi),%rsi leaq 48(%rdi),%rdi call __smulx_384x63 movq 72(%rsp),%rdx movq 80(%rsp),%rcx leaq 56(%rdi),%rdi call __smulx_768x63 xorq $256+96,%rsi movl $31,%edi call __ab_approximation_31 movq %r12,72(%rsp) movq %r13,80(%rsp) movq $256,%rdi xorq %rsi,%rdi call __smulx_384_n_shift_by_31 movq %rdx,56(%rsp) movq %rcx,64(%rsp) movq 72(%rsp),%rdx movq 80(%rsp),%rcx leaq 48(%rdi),%rdi call __smulx_384_n_shift_by_31 movq %rdx,72(%rsp) movq %rcx,80(%rsp) movq 56(%rsp),%rdx movq 64(%rsp),%rcx leaq 96(%rsi),%rsi leaq 48(%rdi),%rdi call __smulx_384x63 movq 72(%rsp),%rdx movq 80(%rsp),%rcx leaq 56(%rdi),%rdi call __smulx_768x63 xorq $256+96,%rsi movl $31,%edi call __ab_approximation_31 movq %r12,72(%rsp) movq %r13,80(%rsp) movq $256,%rdi xorq %rsi,%rdi call __smulx_384_n_shift_by_31 movq %rdx,56(%rsp) movq %rcx,64(%rsp) movq 72(%rsp),%rdx movq 80(%rsp),%rcx leaq 48(%rdi),%rdi call __smulx_384_n_shift_by_31 movq %rdx,72(%rsp) movq %rcx,80(%rsp) movq 56(%rsp),%rdx movq 64(%rsp),%rcx leaq 96(%rsi),%rsi leaq 48(%rdi),%rdi call __smulx_384x63 movq 72(%rsp),%rdx movq 80(%rsp),%rcx leaq 56(%rdi),%rdi call __smulx_768x63 xorq $256+96,%rsi movl $31,%edi call __ab_approximation_31 movq %r12,72(%rsp) movq %r13,80(%rsp) movq $256,%rdi xorq %rsi,%rdi call __smulx_384_n_shift_by_31 movq %rdx,56(%rsp) movq %rcx,64(%rsp) movq 72(%rsp),%rdx movq 80(%rsp),%rcx leaq 48(%rdi),%rdi call __smulx_384_n_shift_by_31 movq %rdx,72(%rsp) movq %rcx,80(%rsp) 
movq 56(%rsp),%rdx movq 64(%rsp),%rcx leaq 96(%rsi),%rsi leaq 48(%rdi),%rdi call __smulx_384x63 movq 72(%rsp),%rdx movq 80(%rsp),%rcx leaq 56(%rdi),%rdi call __smulx_768x63 xorq $256+96,%rsi movl $31,%edi call __ab_approximation_31 movq %r12,72(%rsp) movq %r13,80(%rsp) movq $256,%rdi xorq %rsi,%rdi call __smulx_191_n_shift_by_31 movq %rdx,56(%rsp) movq %rcx,64(%rsp) movq 72(%rsp),%rdx movq 80(%rsp),%rcx leaq 48(%rdi),%rdi call __smulx_191_n_shift_by_31 movq %rdx,72(%rsp) movq %rcx,80(%rsp) movq 56(%rsp),%rdx movq 64(%rsp),%rcx leaq 96(%rsi),%rsi leaq 48(%rdi),%rdi call __smulx_384x63 movq 72(%rsp),%rdx movq 80(%rsp),%rcx leaq 56(%rdi),%rdi call __smulx_768x63 xorq $256+96,%rsi movl $31,%edi call __ab_approximation_31 movq %r12,72(%rsp) movq %r13,80(%rsp) movq $256,%rdi xorq %rsi,%rdi call __smulx_191_n_shift_by_31 movq %rdx,56(%rsp) movq %rcx,64(%rsp) movq 72(%rsp),%rdx movq 80(%rsp),%rcx leaq 48(%rdi),%rdi call __smulx_191_n_shift_by_31 movq %rdx,72(%rsp) movq %rcx,80(%rsp) movq 56(%rsp),%rdx movq 64(%rsp),%rcx leaq 96(%rsi),%rsi leaq 48(%rdi),%rdi call __smulx_384x63 movq 72(%rsp),%rdx movq 80(%rsp),%rcx leaq 56(%rdi),%rdi call __smulx_768x63 xorq $256+96,%rsi movl $31,%edi call __ab_approximation_31 movq %r12,72(%rsp) movq %r13,80(%rsp) movq $256,%rdi xorq %rsi,%rdi call __smulx_191_n_shift_by_31 movq %rdx,56(%rsp) movq %rcx,64(%rsp) movq 72(%rsp),%rdx movq 80(%rsp),%rcx leaq 48(%rdi),%rdi call __smulx_191_n_shift_by_31 movq %rdx,72(%rsp) movq %rcx,80(%rsp) movq 56(%rsp),%rdx movq 64(%rsp),%rcx leaq 96(%rsi),%rsi leaq 48(%rdi),%rdi call __smulx_384x63 movq 72(%rsp),%rdx movq 80(%rsp),%rcx leaq 56(%rdi),%rdi call __smulx_768x63 xorq $256+96,%rsi movl $31,%edi call __ab_approximation_31 movq %r12,72(%rsp) movq %r13,80(%rsp) movq $256,%rdi xorq %rsi,%rdi call __smulx_191_n_shift_by_31 movq %rdx,56(%rsp) movq %rcx,64(%rsp) movq 72(%rsp),%rdx movq 80(%rsp),%rcx leaq 48(%rdi),%rdi call __smulx_191_n_shift_by_31 movq %rdx,72(%rsp) movq %rcx,80(%rsp) movq 56(%rsp),%rdx movq 64(%rsp),%rcx leaq 96(%rsi),%rsi leaq 48(%rdi),%rdi call __smulx_384x63 movq 72(%rsp),%rdx movq 80(%rsp),%rcx leaq 56(%rdi),%rdi call __smulx_768x63 xorq $256+96,%rsi movl $55,%edi movq 0(%rsi),%r8 movq 48(%rsi),%r10 call __tail_loop_55 leaq 96(%rsi),%rsi movq %r12,%rdx movq %r13,%rcx movq 32(%rsp),%rdi call __smulx_768x63 movq 40(%rsp),%rsi movq %rdx,%r13 sarq $63,%r13 movq %r13,%r8 movq %r13,%r9 movq %r13,%r10 #ifdef __SGX_LVI_HARDENING__ lfence #endif andq 0(%rsi),%r8 andq 8(%rsi),%r9 movq %r13,%r11 andq 16(%rsi),%r10 andq 24(%rsi),%r11 movq %r13,%r12 andq 32(%rsi),%r12 andq 40(%rsi),%r13 addq %r8,%r14 adcq %r9,%r15 adcq %r10,%rbx adcq %r11,%rbp adcq %r12,%rcx adcq %r13,%rax adcq $0,%rdx movq %rdx,%r13 negq %rdx orq %rdx,%r13 sarq $63,%rdx movq %r13,%r8 movq %r13,%r9 movq %r13,%r10 andq 0(%rsi),%r8 andq 8(%rsi),%r9 movq %r13,%r11 andq 16(%rsi),%r10 andq 24(%rsi),%r11 movq %r13,%r12 andq 32(%rsi),%r12 andq 40(%rsi),%r13 xorq %rdx,%r8 xorq %rsi,%rsi xorq %rdx,%r9 subq %rdx,%rsi xorq %rdx,%r10 xorq %rdx,%r11 xorq %rdx,%r12 xorq %rdx,%r13 addq %rsi,%r8 adcq $0,%r9 adcq $0,%r10 adcq $0,%r11 adcq $0,%r12 adcq $0,%r13 addq %r8,%r14 adcq %r9,%r15 adcq %r10,%rbx adcq %r11,%rbp adcq %r12,%rcx adcq %r13,%rax movq %r14,48(%rdi) movq %r15,56(%rdi) movq %rbx,64(%rdi) movq %rbp,72(%rdi) movq %rcx,80(%rdi) movq %rax,88(%rdi) leaq 1112(%rsp),%r8 movq 0(%r8),%r15 .cfi_restore %r15 movq 8(%r8),%r14 .cfi_restore %r14 movq 16(%r8),%r13 .cfi_restore %r13 movq 24(%r8),%r12 .cfi_restore %r12 movq 32(%r8),%rbx .cfi_restore %rbx movq 40(%r8),%rbp 
.cfi_restore %rbp leaq 48(%r8),%rsp .cfi_adjust_cfa_offset -1112-8*6 #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .p2align 5 __smulx_768x63: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa movq 0(%rsi),%r8 movq 8(%rsi),%r9 movq 16(%rsi),%r10 movq 24(%rsi),%r11 movq 32(%rsi),%r12 movq 40(%rsi),%r13 movq 48(%rsi),%r14 movq %rdx,%rax sarq $63,%rax xorq %rbp,%rbp subq %rax,%rbp movq %rdi,8(%rsp) movq %rsi,16(%rsp) leaq 56(%rsi),%rsi xorq %rax,%rdx addq %rbp,%rdx xorq %rax,%r8 xorq %rax,%r9 xorq %rax,%r10 xorq %rax,%r11 xorq %rax,%r12 xorq %rax,%r13 xorq %rax,%r14 addq %rbp,%r8 adcq $0,%r9 adcq $0,%r10 adcq $0,%r11 adcq $0,%r12 adcq $0,%r13 adcq $0,%r14 andq %rdx,%r14 negq %r14 mulxq %r8,%r8,%rbp mulxq %r9,%r9,%rax addq %rbp,%r9 mulxq %r10,%r10,%rbp adcq %rax,%r10 mulxq %r11,%r11,%rax adcq %rbp,%r11 mulxq %r12,%r12,%rbp adcq %rax,%r12 mulxq %r13,%r13,%rax adcq %rbp,%r13 adcq %rax,%r14 movq %r8,0(%rdi) movq %r9,8(%rdi) movq %r10,16(%rdi) movq %r11,24(%rdi) movq %r12,32(%rdi) movq %r13,40(%rdi) movq %r14,48(%rdi) sarq $63,%r14 movq %r14,56(%rdi) movq %rcx,%rdx movq %rcx,%rax movq 0(%rsi),%r8 movq 8(%rsi),%r9 movq 16(%rsi),%r10 movq 24(%rsi),%r11 movq 32(%rsi),%r12 movq 40(%rsi),%r13 movq 48(%rsi),%r14 movq 56(%rsi),%r15 movq 64(%rsi),%rbx movq 72(%rsi),%rbp movq 80(%rsi),%rcx movq 88(%rsi),%rdi sarq $63,%rax xorq %rsi,%rsi subq %rax,%rsi xorq %rax,%rdx addq %rsi,%rdx xorq %rax,%r8 xorq %rax,%r9 xorq %rax,%r10 xorq %rax,%r11 xorq %rax,%r12 xorq %rax,%r13 xorq %rax,%r14 xorq %rax,%r15 xorq %rax,%rbx xorq %rax,%rbp xorq %rax,%rcx xorq %rdi,%rax addq %rsi,%r8 adcq $0,%r9 adcq $0,%r10 adcq $0,%r11 adcq $0,%r12 adcq $0,%r13 adcq $0,%r14 adcq $0,%r15 adcq $0,%rbx adcq $0,%rbp adcq $0,%rcx adcq $0,%rax mulxq %r8,%r8,%rsi mulxq %r9,%r9,%rdi addq %rsi,%r9 mulxq %r10,%r10,%rsi adcq %rdi,%r10 mulxq %r11,%r11,%rdi adcq %rsi,%r11 mulxq %r12,%r12,%rsi adcq %rdi,%r12 mulxq %r13,%r13,%rdi adcq %rsi,%r13 mulxq %r14,%r14,%rsi adcq %rdi,%r14 mulxq %r15,%r15,%rdi adcq %rsi,%r15 mulxq %rbx,%rbx,%rsi adcq %rdi,%rbx mulxq %rbp,%rbp,%rdi adcq %rsi,%rbp mulxq %rcx,%rcx,%rsi adcq %rdi,%rcx movq 8(%rsp),%rdi adcq $0,%rsi imulq %rdx addq %rsi,%rax adcq $0,%rdx addq 0(%rdi),%r8 adcq 8(%rdi),%r9 adcq 16(%rdi),%r10 adcq 24(%rdi),%r11 adcq 32(%rdi),%r12 adcq 40(%rdi),%r13 adcq 48(%rdi),%r14 movq 56(%rdi),%rsi adcq %rsi,%r15 adcq %rsi,%rbx adcq %rsi,%rbp adcq %rsi,%rcx adcq %rsi,%rax adcq %rsi,%rdx movq 16(%rsp),%rsi movq %r8,0(%rdi) movq %r9,8(%rdi) movq %r10,16(%rdi) movq %r11,24(%rdi) movq %r12,32(%rdi) movq %r13,40(%rdi) movq %r14,48(%rdi) movq %r15,56(%rdi) movq %rbx,64(%rdi) movq %rbp,72(%rdi) movq %rcx,80(%rdi) movq %rax,88(%rdi) #ifdef __SGX_LVI_HARDENING__ popq %r8 lfence jmpq *%r8 ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .p2align 5 __smulx_384x63: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa movq 0+0(%rsi),%r8 movq 0+8(%rsi),%r9 movq 0+16(%rsi),%r10 movq 0+24(%rsi),%r11 movq 0+32(%rsi),%r12 movq 0+40(%rsi),%r13 movq 0+48(%rsi),%r14 movq %rdx,%rbp sarq $63,%rbp xorq %rax,%rax subq %rbp,%rax xorq %rbp,%rdx addq %rax,%rdx xorq %rbp,%r8 xorq %rbp,%r9 xorq %rbp,%r10 xorq %rbp,%r11 xorq %rbp,%r12 xorq %rbp,%r13 xorq %rbp,%r14 addq %rax,%r8 adcq $0,%r9 adcq $0,%r10 adcq $0,%r11 adcq $0,%r12 adcq $0,%r13 adcq $0,%r14 andq %rdx,%r14 negq %r14 mulxq %r8,%r8,%rbp mulxq %r9,%r9,%rax addq %rbp,%r9 mulxq %r10,%r10,%rbp adcq %rax,%r10 mulxq %r11,%r11,%rax adcq %rbp,%r11 mulxq %r12,%r12,%rbp adcq %rax,%r12 mulxq %r13,%r13,%rax movq %rcx,%rdx adcq %rbp,%r13 adcq %rax,%r14 movq 
%r8,0(%rdi) movq %r9,8(%rdi) movq %r10,16(%rdi) movq %r11,24(%rdi) movq %r12,32(%rdi) movq %r13,%r15 movq %r14,%rbx movq 56+0(%rsi),%r8 movq 56+8(%rsi),%r9 movq 56+16(%rsi),%r10 movq 56+24(%rsi),%r11 movq 56+32(%rsi),%r12 movq 56+40(%rsi),%r13 movq 56+48(%rsi),%r14 movq %rdx,%rbp sarq $63,%rbp xorq %rax,%rax subq %rbp,%rax xorq %rbp,%rdx addq %rax,%rdx xorq %rbp,%r8 xorq %rbp,%r9 xorq %rbp,%r10 xorq %rbp,%r11 xorq %rbp,%r12 xorq %rbp,%r13 xorq %rbp,%r14 addq %rax,%r8 adcq $0,%r9 adcq $0,%r10 adcq $0,%r11 adcq $0,%r12 adcq $0,%r13 adcq $0,%r14 andq %rdx,%r14 negq %r14 mulxq %r8,%r8,%rbp mulxq %r9,%r9,%rax addq %rbp,%r9 mulxq %r10,%r10,%rbp adcq %rax,%r10 mulxq %r11,%r11,%rax adcq %rbp,%r11 mulxq %r12,%r12,%rbp adcq %rax,%r12 mulxq %r13,%r13,%rax adcq %rbp,%r13 adcq %rax,%r14 addq 0(%rdi),%r8 adcq 8(%rdi),%r9 adcq 16(%rdi),%r10 adcq 24(%rdi),%r11 adcq 32(%rdi),%r12 adcq %r15,%r13 adcq %rbx,%r14 movq %r8,0(%rdi) movq %r9,8(%rdi) movq %r10,16(%rdi) movq %r11,24(%rdi) movq %r12,32(%rdi) movq %r13,40(%rdi) movq %r14,48(%rdi) #ifdef __SGX_LVI_HARDENING__ popq %r8 lfence jmpq *%r8 ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .p2align 5 __smulx_384_n_shift_by_31: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa movq %rdx,%rbx movq 0+0(%rsi),%r8 movq 0+8(%rsi),%r9 movq 0+16(%rsi),%r10 movq 0+24(%rsi),%r11 movq 0+32(%rsi),%r12 movq 0+40(%rsi),%r13 movq %rdx,%rax sarq $63,%rax xorq %rbp,%rbp subq %rax,%rbp xorq %rax,%rdx addq %rbp,%rdx xorq %rax,%r8 xorq %rax,%r9 xorq %rax,%r10 xorq %rax,%r11 xorq %rax,%r12 xorq %rax,%r13 addq %rbp,%r8 adcq $0,%r9 adcq $0,%r10 adcq $0,%r11 adcq $0,%r12 adcq $0,%r13 andq %rdx,%rax negq %rax mulxq %r8,%r8,%rbp mulxq %r9,%r9,%r14 addq %rbp,%r9 mulxq %r10,%r10,%rbp adcq %r14,%r10 mulxq %r11,%r11,%r14 adcq %rbp,%r11 mulxq %r12,%r12,%rbp adcq %r14,%r12 mulxq %r13,%r13,%r14 adcq %rbp,%r13 adcq %rax,%r14 movq %rcx,%rdx movq %r8,0(%rdi) movq %r9,8(%rdi) movq %r10,16(%rdi) movq %r11,24(%rdi) movq %r12,32(%rdi) movq %r13,40(%rdi) movq %r14,%r15 movq 48+0(%rsi),%r8 movq 48+8(%rsi),%r9 movq 48+16(%rsi),%r10 movq 48+24(%rsi),%r11 movq 48+32(%rsi),%r12 movq 48+40(%rsi),%r13 movq %rdx,%rax sarq $63,%rax xorq %rbp,%rbp subq %rax,%rbp xorq %rax,%rdx addq %rbp,%rdx xorq %rax,%r8 xorq %rax,%r9 xorq %rax,%r10 xorq %rax,%r11 xorq %rax,%r12 xorq %rax,%r13 addq %rbp,%r8 adcq $0,%r9 adcq $0,%r10 adcq $0,%r11 adcq $0,%r12 adcq $0,%r13 andq %rdx,%rax negq %rax mulxq %r8,%r8,%rbp mulxq %r9,%r9,%r14 addq %rbp,%r9 mulxq %r10,%r10,%rbp adcq %r14,%r10 mulxq %r11,%r11,%r14 adcq %rbp,%r11 mulxq %r12,%r12,%rbp adcq %r14,%r12 mulxq %r13,%r13,%r14 adcq %rbp,%r13 adcq %rax,%r14 addq 0(%rdi),%r8 adcq 8(%rdi),%r9 adcq 16(%rdi),%r10 adcq 24(%rdi),%r11 adcq 32(%rdi),%r12 adcq 40(%rdi),%r13 adcq %r15,%r14 movq %rbx,%rdx shrdq $31,%r9,%r8 shrdq $31,%r10,%r9 shrdq $31,%r11,%r10 shrdq $31,%r12,%r11 shrdq $31,%r13,%r12 shrdq $31,%r14,%r13 sarq $63,%r14 xorq %rbp,%rbp subq %r14,%rbp xorq %r14,%r8 xorq %r14,%r9 xorq %r14,%r10 xorq %r14,%r11 xorq %r14,%r12 xorq %r14,%r13 addq %rbp,%r8 adcq $0,%r9 adcq $0,%r10 adcq $0,%r11 adcq $0,%r12 adcq $0,%r13 movq %r8,0(%rdi) movq %r9,8(%rdi) movq %r10,16(%rdi) movq %r11,24(%rdi) movq %r12,32(%rdi) movq %r13,40(%rdi) xorq %r14,%rdx xorq %r14,%rcx addq %rbp,%rdx addq %rbp,%rcx #ifdef __SGX_LVI_HARDENING__ popq %r8 lfence jmpq *%r8 ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .p2align 5 __smulx_191_n_shift_by_31: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa movq %rdx,%rbx movq 0+0(%rsi),%r8 movq 0+8(%rsi),%r9 movq 0+16(%rsi),%r10 movq %rdx,%rax sarq $63,%rax xorq %rbp,%rbp subq %rax,%rbp 
xorq %rax,%rdx addq %rbp,%rdx xorq %rax,%r8 xorq %rax,%r9 xorq %r10,%rax addq %rbp,%r8 adcq $0,%r9 adcq $0,%rax mulxq %r8,%r8,%rbp mulxq %r9,%r9,%r10 addq %rbp,%r9 adcq $0,%r10 imulq %rdx addq %rax,%r10 adcq $0,%rdx movq %rdx,%r14 movq %rcx,%rdx movq 48+0(%rsi),%r11 movq 48+8(%rsi),%r12 movq 48+16(%rsi),%r13 movq %rdx,%rax sarq $63,%rax xorq %rbp,%rbp subq %rax,%rbp xorq %rax,%rdx addq %rbp,%rdx xorq %rax,%r11 xorq %rax,%r12 xorq %r13,%rax addq %rbp,%r11 adcq $0,%r12 adcq $0,%rax mulxq %r11,%r11,%rbp mulxq %r12,%r12,%r13 addq %rbp,%r12 adcq $0,%r13 imulq %rdx addq %rax,%r13 adcq $0,%rdx addq %r8,%r11 adcq %r9,%r12 adcq %r10,%r13 adcq %rdx,%r14 movq %rbx,%rdx shrdq $31,%r12,%r11 shrdq $31,%r13,%r12 shrdq $31,%r14,%r13 sarq $63,%r14 xorq %rbp,%rbp subq %r14,%rbp xorq %r14,%r11 xorq %r14,%r12 xorq %r14,%r13 addq %rbp,%r11 adcq $0,%r12 adcq $0,%r13 movq %r11,0(%rdi) movq %r12,8(%rdi) movq %r13,16(%rdi) xorq %r14,%rdx xorq %r14,%rcx addq %rbp,%rdx addq %rbp,%rcx #ifdef __SGX_LVI_HARDENING__ popq %r8 lfence jmpq *%r8 ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .p2align 5 __ab_approximation_31: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa movq 40(%rsi),%r9 movq 88(%rsi),%r11 movq 32(%rsi),%rbx movq 80(%rsi),%rbp movq 24(%rsi),%r8 movq 72(%rsi),%r10 movq %r9,%rax orq %r11,%rax cmovzq %rbx,%r9 cmovzq %rbp,%r11 cmovzq %r8,%rbx movq 16(%rsi),%r8 cmovzq %r10,%rbp movq 64(%rsi),%r10 movq %r9,%rax orq %r11,%rax cmovzq %rbx,%r9 cmovzq %rbp,%r11 cmovzq %r8,%rbx movq 8(%rsi),%r8 cmovzq %r10,%rbp movq 56(%rsi),%r10 movq %r9,%rax orq %r11,%rax cmovzq %rbx,%r9 cmovzq %rbp,%r11 cmovzq %r8,%rbx movq 0(%rsi),%r8 cmovzq %r10,%rbp movq 48(%rsi),%r10 movq %r9,%rax orq %r11,%rax cmovzq %rbx,%r9 cmovzq %rbp,%r11 cmovzq %r8,%rbx cmovzq %r10,%rbp movq %r9,%rax orq %r11,%rax bsrq %rax,%rcx leaq 1(%rcx),%rcx cmovzq %r8,%r9 cmovzq %r10,%r11 cmovzq %rax,%rcx negq %rcx shldq %cl,%rbx,%r9 shldq %cl,%rbp,%r11 movl $0x7FFFFFFF,%eax andq %rax,%r8 andq %rax,%r10 andnq %r9,%rax,%r9 andnq %r11,%rax,%r11 orq %r9,%r8 orq %r11,%r10 jmp __inner_loop_31 #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .p2align 5 __inner_loop_31: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa movq $0x7FFFFFFF80000000,%rcx movq $0x800000007FFFFFFF,%r13 movq $0x7FFFFFFF7FFFFFFF,%r15 L$oop_31: cmpq %r10,%r8 movq %r8,%rax movq %r10,%rbx movq %rcx,%rbp movq %r13,%r14 cmovbq %r10,%r8 cmovbq %rax,%r10 cmovbq %r13,%rcx cmovbq %rbp,%r13 subq %r10,%r8 subq %r13,%rcx addq %r15,%rcx testq $1,%rax cmovzq %rax,%r8 cmovzq %rbx,%r10 cmovzq %rbp,%rcx cmovzq %r14,%r13 shrq $1,%r8 addq %r13,%r13 subq %r15,%r13 subl $1,%edi jnz L$oop_31 shrq $32,%r15 movl %ecx,%edx movl %r13d,%r12d shrq $32,%rcx shrq $32,%r13 subq %r15,%rdx subq %r15,%rcx subq %r15,%r12 subq %r15,%r13 #ifdef __SGX_LVI_HARDENING__ popq %r8 lfence jmpq *%r8 ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .p2align 5 __tail_loop_55: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa movq $1,%rdx xorq %rcx,%rcx xorq %r12,%r12 movq $1,%r13 L$oop_55: xorq %rax,%rax testq $1,%r8 movq %r10,%rbx cmovnzq %r10,%rax subq %r8,%rbx movq %r8,%rbp subq %rax,%r8 cmovcq %rbx,%r8 cmovcq %rbp,%r10 movq %rdx,%rax cmovcq %r12,%rdx cmovcq %rax,%r12 movq %rcx,%rbx cmovcq %r13,%rcx cmovcq %rbx,%r13 xorq %rax,%rax xorq %rbx,%rbx shrq $1,%r8 testq $1,%rbp cmovnzq %r12,%rax cmovnzq %r13,%rbx addq %r12,%r12 addq %r13,%r13 subq %rax,%rdx subq %rbx,%rcx subl $1,%edi jnz L$oop_55 #ifdef __SGX_LVI_HARDENING__ popq %r8 lfence jmpq *%r8 ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc 
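Both x86_64 flavors of the constant-time 384-bit modular inverse above — the `mulq`-based one and this `mulx`-based `_ctx_inverse_mod_384` — share one structure: `__ab_approximation_62`/`__ab_approximation_31` condense the most significant non-zero limbs of |a| and |b| into word-sized approximations, a fixed-count inner loop (`__inner_loop_62`, `__inner_loop_31`, `__tail_loop_55`) runs binary-GCD division steps on those approximations while accumulating a 2x2 matrix of signed update factors, and the `__smulq_384x63`/`__smulq_768x63` (or `__smulx_*`) helpers apply that matrix back to the full-width values. The following C sketch models the arithmetic of one `__inner_loop_62` batch; the struct and function names are illustrative, not blst API, and plain branches stand in for the assembly's `cmovc`-based selection, so the sketch itself is not constant-time.

#include <stdint.h>

/* Sketch of one n-iteration batch of binary-GCD division steps, modeling
 * __inner_loop_62 above.  a and b are 64-bit approximations of the top
 * bits of the full-width operands.  Invariant after k iterations (when
 * run on exact values): f0*a + g0*b = 2^k * a_k and f1*a + g1*b = 2^k * b_k,
 * so the caller can replay the whole batch on the 384-bit operands with
 * two signed multiply-accumulates. */
typedef struct { int64_t f0, g0, f1, g1; } factors_t;   /* hypothetical name */

static factors_t inner_loop_62_model(uint64_t a, uint64_t b, unsigned n)
{
    factors_t m = { 1, 0, 0, 1 };        /* identity transition matrix */

    while (n--) {
        uint64_t odd = a & 1;

        if (odd && a < b) {              /* keep a >= b before subtracting */
            uint64_t u = a; a = b; b = u;
            int64_t t;
            t = m.f0; m.f0 = m.f1; m.f1 = t;
            t = m.g0; m.g0 = m.g1; m.g1 = t;
        }
        if (odd) {                       /* a odd: a -= b, fold the rows   */
            a -= b;
            m.f0 -= m.f1;
            m.g0 -= m.g1;
        }
        a >>= 1;                         /* a now even: halve it, and      */
        m.f1 <<= 1;                      /* compensate by doubling the     */
        m.g1 <<= 1;                      /* other row of the matrix        */
    }
    return m;
}

In the assembly the swap and subtraction are realized with `cmovc` and masks, so the executed instruction and memory-access sequence is independent of the secret operands; the fixed iteration counts (62, 31, 55) and the factors staying within 63 signed bits (the `x63` in the multiply helpers' names) are what the full-width replay relies on.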
================================================
FILE: build/mach-o/div3w-armv8.S
================================================
.text

.globl	_div_3_limbs
.private_extern	_div_3_limbs
.align	5
_div_3_limbs:
	hint	#34
	ldp	x4,x5,[x0]	// load R
	eor	x0,x0,x0	// Q = 0
	mov	x3,#64		// loop counter
	nop
Loop:
	subs	x6,x4,x1	// R - D
	add	x0,x0,x0	// Q <<= 1
	sbcs	x7,x5,x2
	add	x0,x0,#1	// Q + speculative bit
	csel	x4,x4,x6,lo	// select between R and R - D
	extr	x1,x2,x1,#1	// D >>= 1
	csel	x5,x5,x7,lo
	lsr	x2,x2,#1
	sbc	x0,x0,xzr	// subtract speculative bit
	sub	x3,x3,#1
	cbnz	x3,Loop

	asr	x3,x0,#63	// top bit -> mask
	add	x0,x0,x0	// Q <<= 1
	subs	x6,x4,x1	// R - D
	add	x0,x0,#1	// Q + speculative bit
	sbcs	x7,x5,x2
	sbc	x0,x0,xzr	// subtract speculative bit
	orr	x0,x0,x3	// all ones if overflow

	ret

.globl	_quot_rem_128
.private_extern	_quot_rem_128
.align	5
_quot_rem_128:
	hint	#34
	ldp	x3,x4,[x1]

	mul	x5,x3,x2	// divisor[0:1] * quotient
	umulh	x6,x3,x2
	mul	x11,x4,x2
	umulh	x7,x4,x2

	ldp	x8,x9,[x0]	// load 3 limbs of the dividend
	ldr	x10,[x0,#16]

	adds	x6,x6,x11
	adc	x7,x7,xzr

	subs	x8,x8,x5	// dividend - divisor * quotient
	sbcs	x9,x9,x6
	sbcs	x10,x10,x7
	sbc	x5,xzr,xzr	// borrow -> mask

	add	x2,x2,x5	// if borrowed, adjust the quotient ...
	and	x3,x3,x5
	and	x4,x4,x5
	adds	x8,x8,x3	// ... and add divisor
	adc	x9,x9,x4

	stp	x8,x9,[x0]	// save 2 limbs of the remainder
	str	x2,[x0,#16]	// and one limb of the quotient

	mov	x0,x2	// return adjusted quotient
	ret

.globl	_quot_rem_64
.private_extern	_quot_rem_64
.align	5
_quot_rem_64:
	hint	#34
	ldr	x3,[x1]
	ldr	x8,[x0]	// load 1 limb of the dividend

	mul	x5,x3,x2	// divisor * quotient
	sub	x8,x8,x5	// dividend - divisor * quotient

	stp	x8,x2,[x0]	// save remainder and quotient

	mov	x0,x2	// return quotient
	ret


================================================
FILE: build/mach-o/div3w-x86_64.s
================================================
.text

.globl	_div_3_limbs
.private_extern	_div_3_limbs
.p2align	5
_div_3_limbs:
.cfi_startproc
	.byte	0xf3,0x0f,0x1e,0xfa

#ifdef	__SGX_LVI_HARDENING__
	lfence
#endif
	movq	(%rdi),%r8
	movq	8(%rdi),%r9
	xorq	%rax,%rax
	movl	$64,%ecx

L$oop:
	movq	%r8,%r10
	subq	%rsi,%r8
	movq	%r9,%r11
	sbbq	%rdx,%r9
	leaq	1(%rax,%rax,1),%rax
	movq	%rdx,%rdi
	cmovcq	%r10,%r8
	cmovcq	%r11,%r9
	sbbq	$0,%rax
	shlq	$63,%rdi
	shrq	$1,%rsi
	shrq	$1,%rdx
	orq	%rdi,%rsi
	subl	$1,%ecx
	jnz	L$oop

	leaq	1(%rax,%rax,1),%rcx
	sarq	$63,%rax

	subq	%rsi,%r8
	sbbq	%rdx,%r9
	sbbq	$0,%rcx

	orq	%rcx,%rax

#ifdef	__SGX_LVI_HARDENING__
	popq	%rdx
	lfence
	jmpq	*%rdx
	ud2
#else
	.byte	0xf3,0xc3
#endif
.cfi_endproc

.globl	_quot_rem_128
.private_extern	_quot_rem_128
.p2align	5
_quot_rem_128:
.cfi_startproc
	.byte	0xf3,0x0f,0x1e,0xfa

#ifdef	__SGX_LVI_HARDENING__
	lfence
#endif
	movq	%rdx,%rax
	movq	%rdx,%rcx

	mulq	0(%rsi)
	movq	%rax,%r8
	movq	%rcx,%rax
	movq	%rdx,%r9

	mulq	8(%rsi)
	addq	%rax,%r9
	adcq	$0,%rdx

	movq	0(%rdi),%r10
	movq	8(%rdi),%r11
	movq	16(%rdi),%rax

	subq	%r8,%r10
	sbbq	%r9,%r11
	sbbq	%rdx,%rax
	sbbq	%r8,%r8

	addq	%r8,%rcx
	movq	%r8,%r9
	andq	0(%rsi),%r8
	andq	8(%rsi),%r9
	addq	%r8,%r10
	adcq	%r9,%r11

	movq	%r10,0(%rdi)
	movq	%r11,8(%rdi)
	movq	%rcx,16(%rdi)

	movq	%rcx,%rax

#ifdef	__SGX_LVI_HARDENING__
	popq	%rdx
	lfence
	jmpq	*%rdx
	ud2
#else
	.byte	0xf3,0xc3
#endif
.cfi_endproc

.globl	_quot_rem_64
.private_extern	_quot_rem_64
.p2align	5
_quot_rem_64:
.cfi_startproc
	.byte	0xf3,0x0f,0x1e,0xfa

#ifdef	__SGX_LVI_HARDENING__
	lfence
#endif
	movq	%rdx,%rax
	imulq	0(%rsi),%rdx

	movq	0(%rdi),%r10
	subq	%rdx,%r10

	movq	%r10,0(%rdi)
	movq	%rax,8(%rdi)

#ifdef	__SGX_LVI_HARDENING__
	popq	%rdx
	lfence
	jmpq	*%rdx
	ud2
#else
	.byte	0xf3,0xc3
#endif
.cfi_endproc
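These `div3w` helpers back the division/reciprocal code paths: `div_3_limbs` derives a 64-bit quotient estimate of a 3-limb value by a 2-limb divisor via 64 shift-subtract steps with a speculative bit, and `quot_rem_128` multiplies the estimate back, subtracts, and repairs a possible overshoot by one. A rough C model of `quot_rem_128` follows; the function name and the `unsigned __int128` shortcut (a GCC/Clang extension) are illustrative, not blst's declared interface.

#include <stdint.h>

typedef unsigned __int128 u128;   /* for the 64x64 -> 128 products */

/* Model of quot_rem_128 above: `quotient` comes from div_3_limbs and may
 * overshoot the true quotient by one.  Subtract divisor*quotient from the
 * 3-limb dividend; if that borrows, add the divisor back and decrement
 * the quotient.  The assembly does the same with carry chains and an
 * all-ones mask instead of branches. */
static uint64_t quot_rem_128_model(uint64_t dividend[3],
                                   const uint64_t divisor[2],
                                   uint64_t quotient)
{
    u128 lo = (u128)divisor[0] * quotient;
    u128 hi = (u128)divisor[1] * quotient + (uint64_t)(lo >> 64);
    uint64_t p0 = (uint64_t)lo, p1 = (uint64_t)hi, p2 = (uint64_t)(hi >> 64);

    /* 3-limb borrow-propagating subtraction: dividend - divisor*quotient */
    uint64_t r0 = dividend[0] - p0;
    uint64_t b0 = dividend[0] < p0;
    uint64_t r1 = dividend[1] - p1 - b0;
    uint64_t b1 = (dividend[1] < p1) | ((uint64_t)(dividend[1] - p1) < b0);
    uint64_t r2 = dividend[2] - p2 - b1;
    uint64_t borrow = (dividend[2] < p2) | ((uint64_t)(dividend[2] - p2) < b1);

    uint64_t mask = 0 - borrow;          /* all-ones iff we overshot      */
    quotient += mask;                    /* quotient -= borrow            */
    uint64_t a0 = divisor[0] & mask;     /* conditionally add divisor back */
    uint64_t a1 = divisor[1] & mask;
    r0 += a0;
    r1 += a1 + (r0 < a0);
    (void)r2;  /* top limb cancels to zero under the overshoot-by-one bound */

    dividend[0] = r0;                    /* 2-limb remainder ...          */
    dividend[1] = r1;
    dividend[2] = quotient;              /* ... and the quotient limb     */
    return quotient;
}

The `sbbq %r8,%r8` in the assembly materializes the final borrow as an all-ones mask — the `mask = 0 - borrow` line here — so the quotient decrement and divisor add-back are applied unconditionally through the mask rather than via a data-dependent branch.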
================================================ FILE: build/mach-o/mul_mont_256-armv8.S ================================================ .text .globl _mul_mont_sparse_256 .private_extern _mul_mont_sparse_256 .align 5 _mul_mont_sparse_256: hint #34 stp x29,x30,[sp,#-8*__SIZEOF_POINTER__]! add x29,sp,#0 stp x19,x20,[sp,#2*__SIZEOF_POINTER__] stp x21,x22,[sp,#4*__SIZEOF_POINTER__] stp x23,x24,[sp,#6*__SIZEOF_POINTER__] ldp x10,x11,[x1] ldr x9, [x2] ldp x12,x13,[x1,#16] mul x19,x10,x9 ldp x5,x6,[x3] mul x20,x11,x9 ldp x7,x8,[x3,#16] mul x21,x12,x9 mul x22,x13,x9 umulh x14,x10,x9 umulh x15,x11,x9 mul x3,x4,x19 umulh x16,x12,x9 umulh x17,x13,x9 adds x20,x20,x14 //mul x14,x5,x3 adcs x21,x21,x15 mul x15,x6,x3 adcs x22,x22,x16 mul x16,x7,x3 adc x23,xzr, x17 mul x17,x8,x3 ldr x9,[x2,8*1] subs xzr,x19,#1 //adds x19,x19,x14 umulh x14,x5,x3 adcs x20,x20,x15 umulh x15,x6,x3 adcs x21,x21,x16 umulh x16,x7,x3 adcs x22,x22,x17 umulh x17,x8,x3 adc x23,x23,xzr adds x19,x20,x14 mul x14,x10,x9 adcs x20,x21,x15 mul x15,x11,x9 adcs x21,x22,x16 mul x16,x12,x9 adcs x22,x23,x17 mul x17,x13,x9 adc x23,xzr,xzr adds x19,x19,x14 umulh x14,x10,x9 adcs x20,x20,x15 umulh x15,x11,x9 adcs x21,x21,x16 mul x3,x4,x19 umulh x16,x12,x9 adcs x22,x22,x17 umulh x17,x13,x9 adc x23,x23,xzr adds x20,x20,x14 //mul x14,x5,x3 adcs x21,x21,x15 mul x15,x6,x3 adcs x22,x22,x16 mul x16,x7,x3 adc x23,x23,x17 mul x17,x8,x3 ldr x9,[x2,8*2] subs xzr,x19,#1 //adds x19,x19,x14 umulh x14,x5,x3 adcs x20,x20,x15 umulh x15,x6,x3 adcs x21,x21,x16 umulh x16,x7,x3 adcs x22,x22,x17 umulh x17,x8,x3 adc x23,x23,xzr adds x19,x20,x14 mul x14,x10,x9 adcs x20,x21,x15 mul x15,x11,x9 adcs x21,x22,x16 mul x16,x12,x9 adcs x22,x23,x17 mul x17,x13,x9 adc x23,xzr,xzr adds x19,x19,x14 umulh x14,x10,x9 adcs x20,x20,x15 umulh x15,x11,x9 adcs x21,x21,x16 mul x3,x4,x19 umulh x16,x12,x9 adcs x22,x22,x17 umulh x17,x13,x9 adc x23,x23,xzr adds x20,x20,x14 //mul x14,x5,x3 adcs x21,x21,x15 mul x15,x6,x3 adcs x22,x22,x16 mul x16,x7,x3 adc x23,x23,x17 mul x17,x8,x3 ldr x9,[x2,8*3] subs xzr,x19,#1 //adds x19,x19,x14 umulh x14,x5,x3 adcs x20,x20,x15 umulh x15,x6,x3 adcs x21,x21,x16 umulh x16,x7,x3 adcs x22,x22,x17 umulh x17,x8,x3 adc x23,x23,xzr adds x19,x20,x14 mul x14,x10,x9 adcs x20,x21,x15 mul x15,x11,x9 adcs x21,x22,x16 mul x16,x12,x9 adcs x22,x23,x17 mul x17,x13,x9 adc x23,xzr,xzr adds x19,x19,x14 umulh x14,x10,x9 adcs x20,x20,x15 umulh x15,x11,x9 adcs x21,x21,x16 mul x3,x4,x19 umulh x16,x12,x9 adcs x22,x22,x17 umulh x17,x13,x9 adc x23,x23,xzr adds x20,x20,x14 //mul x14,x5,x3 adcs x21,x21,x15 mul x15,x6,x3 adcs x22,x22,x16 mul x16,x7,x3 adc x23,x23,x17 mul x17,x8,x3 subs xzr,x19,#1 //adds x19,x19,x14 umulh x14,x5,x3 adcs x20,x20,x15 umulh x15,x6,x3 adcs x21,x21,x16 umulh x16,x7,x3 adcs x22,x22,x17 umulh x17,x8,x3 adc x23,x23,xzr adds x19,x20,x14 adcs x20,x21,x15 adcs x21,x22,x16 adcs x22,x23,x17 adc x23,xzr,xzr subs x14,x19,x5 sbcs x15,x20,x6 sbcs x16,x21,x7 sbcs x17,x22,x8 sbcs xzr, x23,xzr csel x19,x19,x14,lo csel x20,x20,x15,lo csel x21,x21,x16,lo csel x22,x22,x17,lo stp x19,x20,[x0] stp x21,x22,[x0,#16] ldp x19,x20,[x29,#2*__SIZEOF_POINTER__] ldp x21,x22,[x29,#4*__SIZEOF_POINTER__] ldp x23,x24,[x29,#6*__SIZEOF_POINTER__] ldr x29,[sp],#8*__SIZEOF_POINTER__ ret .globl _sqr_mont_sparse_256 .private_extern _sqr_mont_sparse_256 .align 5 _sqr_mont_sparse_256: hint #25 stp x29,x30,[sp,#-6*__SIZEOF_POINTER__]! 
add x29,sp,#0 stp x19,x20,[sp,#2*__SIZEOF_POINTER__] stp x21,x22,[sp,#4*__SIZEOF_POINTER__] ldp x5,x6,[x1] ldp x7,x8,[x1,#16] mov x4,x3 //////////////////////////////////////////////////////////////// // | | | | | |a1*a0| | // | | | | |a2*a0| | | // | |a3*a2|a3*a0| | | | // | | | |a2*a1| | | | // | | |a3*a1| | | | | // *| | | | | | | | 2| // +|a3*a3|a2*a2|a1*a1|a0*a0| // |--+--+--+--+--+--+--+--| // |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax is x10 // // "can't overflow" below mark carrying into high part of // multiplication result, which can't overflow, because it // can never be all ones. mul x11,x6,x5 // a[1]*a[0] umulh x15,x6,x5 mul x12,x7,x5 // a[2]*a[0] umulh x16,x7,x5 mul x13,x8,x5 // a[3]*a[0] umulh x19,x8,x5 adds x12,x12,x15 // accumulate high parts of multiplication mul x14,x7,x6 // a[2]*a[1] umulh x15,x7,x6 adcs x13,x13,x16 mul x16,x8,x6 // a[3]*a[1] umulh x17,x8,x6 adc x19,x19,xzr // can't overflow mul x20,x8,x7 // a[3]*a[2] umulh x21,x8,x7 adds x15,x15,x16 // accumulate high parts of multiplication mul x10,x5,x5 // a[0]*a[0] adc x16,x17,xzr // can't overflow adds x13,x13,x14 // accumulate low parts of multiplication umulh x5,x5,x5 adcs x19,x19,x15 mul x15,x6,x6 // a[1]*a[1] adcs x20,x20,x16 umulh x6,x6,x6 adc x21,x21,xzr // can't overflow adds x11,x11,x11 // acc[1-6]*=2 mul x16,x7,x7 // a[2]*a[2] adcs x12,x12,x12 umulh x7,x7,x7 adcs x13,x13,x13 mul x17,x8,x8 // a[3]*a[3] adcs x19,x19,x19 umulh x8,x8,x8 adcs x20,x20,x20 adcs x21,x21,x21 adc x22,xzr,xzr adds x11,x11,x5 // +a[i]*a[i] adcs x12,x12,x15 adcs x13,x13,x6 adcs x19,x19,x16 adcs x20,x20,x7 adcs x21,x21,x17 adc x22,x22,x8 bl __mul_by_1_mont_256 ldr x30,[x29,#__SIZEOF_POINTER__] adds x10,x10,x19 // accumulate upper half adcs x11,x11,x20 adcs x12,x12,x21 adcs x13,x13,x22 adc x19,xzr,xzr subs x14,x10,x5 sbcs x15,x11,x6 sbcs x16,x12,x7 sbcs x17,x13,x8 sbcs xzr, x19,xzr csel x10,x10,x14,lo csel x11,x11,x15,lo csel x12,x12,x16,lo csel x13,x13,x17,lo stp x10,x11,[x0] stp x12,x13,[x0,#16] ldp x19,x20,[x29,#2*__SIZEOF_POINTER__] ldp x21,x22,[x29,#4*__SIZEOF_POINTER__] ldr x29,[sp],#6*__SIZEOF_POINTER__ hint #29 ret .globl _from_mont_256 .private_extern _from_mont_256 .align 5 _from_mont_256: hint #25 stp x29,x30,[sp,#-2*__SIZEOF_POINTER__]! add x29,sp,#0 mov x4,x3 ldp x10,x11,[x1] ldp x12,x13,[x1,#16] bl __mul_by_1_mont_256 ldr x30,[x29,#__SIZEOF_POINTER__] subs x14,x10,x5 sbcs x15,x11,x6 sbcs x16,x12,x7 sbcs x17,x13,x8 csel x10,x10,x14,lo csel x11,x11,x15,lo csel x12,x12,x16,lo csel x13,x13,x17,lo stp x10,x11,[x0] stp x12,x13,[x0,#16] ldr x29,[sp],#2*__SIZEOF_POINTER__ hint #29 ret .globl _redc_mont_256 .private_extern _redc_mont_256 .align 5 _redc_mont_256: hint #25 stp x29,x30,[sp,#-2*__SIZEOF_POINTER__]! 
add x29,sp,#0 mov x4,x3 ldp x10,x11,[x1] ldp x12,x13,[x1,#16] bl __mul_by_1_mont_256 ldr x30,[x29,#__SIZEOF_POINTER__] ldp x14,x15,[x1,#32] ldp x16,x17,[x1,#48] adds x10,x10,x14 adcs x11,x11,x15 adcs x12,x12,x16 adcs x13,x13,x17 adc x9,xzr,xzr subs x14,x10,x5 sbcs x15,x11,x6 sbcs x16,x12,x7 sbcs x17,x13,x8 sbcs xzr, x9,xzr csel x10,x10,x14,lo csel x11,x11,x15,lo csel x12,x12,x16,lo csel x13,x13,x17,lo stp x10,x11,[x0] stp x12,x13,[x0,#16] ldr x29,[sp],#2*__SIZEOF_POINTER__ hint #29 ret .align 5 __mul_by_1_mont_256: mul x3,x4,x10 ldp x5,x6,[x2] ldp x7,x8,[x2,#16] //mul x14,x5,x3 mul x15,x6,x3 mul x16,x7,x3 mul x17,x8,x3 subs xzr,x10,#1 //adds x10,x10,x14 umulh x14,x5,x3 adcs x11,x11,x15 umulh x15,x6,x3 adcs x12,x12,x16 umulh x16,x7,x3 adcs x13,x13,x17 umulh x17,x8,x3 adc x9,xzr,xzr adds x10,x11,x14 adcs x11,x12,x15 adcs x12,x13,x16 mul x3,x4,x10 adc x13,x9,x17 //mul x14,x5,x3 mul x15,x6,x3 mul x16,x7,x3 mul x17,x8,x3 subs xzr,x10,#1 //adds x10,x10,x14 umulh x14,x5,x3 adcs x11,x11,x15 umulh x15,x6,x3 adcs x12,x12,x16 umulh x16,x7,x3 adcs x13,x13,x17 umulh x17,x8,x3 adc x9,xzr,xzr adds x10,x11,x14 adcs x11,x12,x15 adcs x12,x13,x16 mul x3,x4,x10 adc x13,x9,x17 //mul x14,x5,x3 mul x15,x6,x3 mul x16,x7,x3 mul x17,x8,x3 subs xzr,x10,#1 //adds x10,x10,x14 umulh x14,x5,x3 adcs x11,x11,x15 umulh x15,x6,x3 adcs x12,x12,x16 umulh x16,x7,x3 adcs x13,x13,x17 umulh x17,x8,x3 adc x9,xzr,xzr adds x10,x11,x14 adcs x11,x12,x15 adcs x12,x13,x16 mul x3,x4,x10 adc x13,x9,x17 //mul x14,x5,x3 mul x15,x6,x3 mul x16,x7,x3 mul x17,x8,x3 subs xzr,x10,#1 //adds x10,x10,x14 umulh x14,x5,x3 adcs x11,x11,x15 umulh x15,x6,x3 adcs x12,x12,x16 umulh x16,x7,x3 adcs x13,x13,x17 umulh x17,x8,x3 adc x9,xzr,xzr adds x10,x11,x14 adcs x11,x12,x15 adcs x12,x13,x16 adc x13,x9,x17 ret ================================================ FILE: build/mach-o/mul_mont_384-armv8.S ================================================ .text .globl _add_mod_384x384 .private_extern _add_mod_384x384 .align 5 _add_mod_384x384: hint #25 stp x29,x30,[sp,#-8*__SIZEOF_POINTER__]! add x29,sp,#0 stp x19,x20,[sp,#2*__SIZEOF_POINTER__] stp x21,x22,[sp,#4*__SIZEOF_POINTER__] stp x23,x24,[sp,#6*__SIZEOF_POINTER__] ldp x5,x6,[x3] ldp x7,x8,[x3,#16] ldp x9,x10,[x3,#32] bl __add_mod_384x384 ldr x30,[x29,#__SIZEOF_POINTER__] ldp x19,x20,[x29,#2*__SIZEOF_POINTER__] ldp x21,x22,[x29,#4*__SIZEOF_POINTER__] ldp x23,x24,[x29,#6*__SIZEOF_POINTER__] ldr x29,[sp],#8*__SIZEOF_POINTER__ hint #29 ret .align 5 __add_mod_384x384: ldp x11, x12, [x1] ldp x19,x20,[x2] ldp x13, x14, [x1,#16] adds x11,x11,x19 ldp x21,x22,[x2,#16] adcs x12,x12,x20 ldp x15, x16, [x1,#32] adcs x13,x13,x21 ldp x23,x24,[x2,#32] adcs x14,x14,x22 stp x11, x12, [x0] adcs x15,x15,x23 ldp x11, x12, [x1,#48] adcs x16,x16,x24 ldp x19,x20,[x2,#48] stp x13, x14, [x0,#16] ldp x13, x14, [x1,#64] ldp x21,x22,[x2,#64] adcs x11,x11,x19 stp x15, x16, [x0,#32] adcs x12,x12,x20 ldp x15, x16, [x1,#80] adcs x13,x13,x21 ldp x23,x24,[x2,#80] adcs x14,x14,x22 adcs x15,x15,x23 adcs x16,x16,x24 adc x17,xzr,xzr subs x19,x11,x5 sbcs x20,x12,x6 sbcs x21,x13,x7 sbcs x22,x14,x8 sbcs x23,x15,x9 sbcs x24,x16,x10 sbcs xzr,x17,xzr csel x11,x11,x19,lo csel x12,x12,x20,lo csel x13,x13,x21,lo csel x14,x14,x22,lo stp x11,x12,[x0,#48] csel x15,x15,x23,lo stp x13,x14,[x0,#64] csel x16,x16,x24,lo stp x15,x16,[x0,#80] ret .globl _sub_mod_384x384 .private_extern _sub_mod_384x384 .align 5 _sub_mod_384x384: hint #25 stp x29,x30,[sp,#-8*__SIZEOF_POINTER__]! 
add x29,sp,#0 stp x19,x20,[sp,#2*__SIZEOF_POINTER__] stp x21,x22,[sp,#4*__SIZEOF_POINTER__] stp x23,x24,[sp,#6*__SIZEOF_POINTER__] ldp x5,x6,[x3] ldp x7,x8,[x3,#16] ldp x9,x10,[x3,#32] bl __sub_mod_384x384 ldr x30,[x29,#__SIZEOF_POINTER__] ldp x19,x20,[x29,#2*__SIZEOF_POINTER__] ldp x21,x22,[x29,#4*__SIZEOF_POINTER__] ldp x23,x24,[x29,#6*__SIZEOF_POINTER__] ldr x29,[sp],#8*__SIZEOF_POINTER__ hint #29 ret .align 5 __sub_mod_384x384: ldp x11, x12, [x1] ldp x19,x20,[x2] ldp x13, x14, [x1,#16] subs x11,x11,x19 ldp x21,x22,[x2,#16] sbcs x12,x12,x20 ldp x15, x16, [x1,#32] sbcs x13,x13,x21 ldp x23,x24,[x2,#32] sbcs x14,x14,x22 stp x11, x12, [x0] sbcs x15,x15,x23 ldp x11, x12, [x1,#48] sbcs x16,x16,x24 ldp x19,x20,[x2,#48] stp x13, x14, [x0,#16] ldp x13, x14, [x1,#64] ldp x21,x22,[x2,#64] sbcs x11,x11,x19 stp x15, x16, [x0,#32] sbcs x12,x12,x20 ldp x15, x16, [x1,#80] sbcs x13,x13,x21 ldp x23,x24,[x2,#80] sbcs x14,x14,x22 sbcs x15,x15,x23 sbcs x16,x16,x24 sbc x17,xzr,xzr and x19,x5,x17 and x20,x6,x17 adds x11,x11,x19 and x21,x7,x17 adcs x12,x12,x20 and x22,x8,x17 adcs x13,x13,x21 and x23,x9,x17 adcs x14,x14,x22 and x24,x10,x17 adcs x15,x15,x23 stp x11,x12,[x0,#48] adc x16,x16,x24 stp x13,x14,[x0,#64] stp x15,x16,[x0,#80] ret .align 5 __add_mod_384: ldp x11, x12, [x1] ldp x19,x20,[x2] ldp x13, x14, [x1,#16] adds x11,x11,x19 ldp x21,x22,[x2,#16] adcs x12,x12,x20 ldp x15, x16, [x1,#32] adcs x13,x13,x21 ldp x23,x24,[x2,#32] adcs x14,x14,x22 adcs x15,x15,x23 adcs x16,x16,x24 adc x17,xzr,xzr subs x19,x11,x5 sbcs x20,x12,x6 sbcs x21,x13,x7 sbcs x22,x14,x8 sbcs x23,x15,x9 sbcs x24,x16,x10 sbcs xzr,x17,xzr csel x11,x11,x19,lo csel x12,x12,x20,lo csel x13,x13,x21,lo csel x14,x14,x22,lo csel x15,x15,x23,lo stp x11,x12,[x0] csel x16,x16,x24,lo stp x13,x14,[x0,#16] stp x15,x16,[x0,#32] ret .align 5 __sub_mod_384: ldp x11, x12, [x1] ldp x19,x20,[x2] ldp x13, x14, [x1,#16] subs x11,x11,x19 ldp x21,x22,[x2,#16] sbcs x12,x12,x20 ldp x15, x16, [x1,#32] sbcs x13,x13,x21 ldp x23,x24,[x2,#32] sbcs x14,x14,x22 sbcs x15,x15,x23 sbcs x16,x16,x24 sbc x17,xzr,xzr and x19,x5,x17 and x20,x6,x17 adds x11,x11,x19 and x21,x7,x17 adcs x12,x12,x20 and x22,x8,x17 adcs x13,x13,x21 and x23,x9,x17 adcs x14,x14,x22 and x24,x10,x17 adcs x15,x15,x23 stp x11,x12,[x0] adc x16,x16,x24 stp x13,x14,[x0,#16] stp x15,x16,[x0,#32] ret .globl _mul_mont_384x .private_extern _mul_mont_384x .align 5 _mul_mont_384x: hint #25 stp x29,x30,[sp,#-16*__SIZEOF_POINTER__]! 
add x29,sp,#0 stp x19,x20,[sp,#2*__SIZEOF_POINTER__] stp x21,x22,[sp,#4*__SIZEOF_POINTER__] stp x23,x24,[sp,#6*__SIZEOF_POINTER__] stp x25,x26,[sp,#8*__SIZEOF_POINTER__] stp x27,x28,[sp,#10*__SIZEOF_POINTER__] sub sp,sp,#288 // space for 3 768-bit vectors mov x26,x0 // save r_ptr mov x27,x1 // save b_ptr mov x28,x2 // save b_ptr add x0,sp,#0 bl __mul_384 add x1,x1,#48 add x2,x2,#48 add x0,sp,#96 bl __mul_384 ldp x5,x6,[x3] ldp x7,x8,[x3,#16] ldp x9,x10,[x3,#32] sub x2,x1,#48 add x0,sp,#240 bl __add_mod_384 add x1,x28,#0 add x2,x28,#48 add x0,sp,#192 bl __add_mod_384 add x1,x0,#0 add x2,x0,#48 bl __mul_384 // mul_384(t2, a->re+a->im, b->re+b->im) ldp x5,x6,[x3] ldp x7,x8,[x3,#16] ldp x9,x10,[x3,#32] mov x1,x0 add x2,sp,#0 bl __sub_mod_384x384 add x2,sp,#96 bl __sub_mod_384x384 // t2 = t2-t0-t1 add x1,sp,#0 add x2,sp,#96 add x0,sp,#0 bl __sub_mod_384x384 // t0 = t0-t1 add x1,sp,#0 add x0,x26,#0 bl __mul_by_1_mont_384 bl __redc_tail_mont_384 add x1,sp,#192 add x0,x0,#48 bl __mul_by_1_mont_384 bl __redc_tail_mont_384 ldr x30,[x29,#__SIZEOF_POINTER__] add sp,sp,#288 ldp x19,x20,[x29,#2*__SIZEOF_POINTER__] ldp x21,x22,[x29,#4*__SIZEOF_POINTER__] ldp x23,x24,[x29,#6*__SIZEOF_POINTER__] ldp x25,x26,[x29,#8*__SIZEOF_POINTER__] ldp x27,x28,[x29,#10*__SIZEOF_POINTER__] ldr x29,[sp],#16*__SIZEOF_POINTER__ hint #29 ret .globl _sqr_mont_384x .private_extern _sqr_mont_384x .align 5 _sqr_mont_384x: hint #25 stp x29,x30,[sp,#-16*__SIZEOF_POINTER__]! add x29,sp,#0 stp x19,x20,[sp,#2*__SIZEOF_POINTER__] stp x21,x22,[sp,#4*__SIZEOF_POINTER__] stp x23,x24,[sp,#6*__SIZEOF_POINTER__] stp x25,x26,[sp,#8*__SIZEOF_POINTER__] stp x27,x28,[sp,#10*__SIZEOF_POINTER__] stp x3,x0,[sp,#12*__SIZEOF_POINTER__] // __mul_mont_384 wants them there sub sp,sp,#96 // space for 2 384-bit vectors mov x4,x3 // adjust for missing b_ptr ldp x5,x6,[x2] ldp x7,x8,[x2,#16] ldp x9,x10,[x2,#32] add x2,x1,#48 add x0,sp,#0 bl __add_mod_384 // t0 = a->re + a->im add x0,sp,#48 bl __sub_mod_384 // t1 = a->re - a->im ldp x11,x12,[x1] ldr x17, [x2] ldp x13,x14,[x1,#16] ldp x15,x16,[x1,#32] bl __mul_mont_384 // mul_mont_384(ret->im, a->re, a->im) adds x11,x11,x11 // add with itself adcs x12,x12,x12 adcs x13,x13,x13 adcs x14,x14,x14 adcs x15,x15,x15 adcs x16,x16,x16 adc x25,xzr,xzr subs x19,x11,x5 sbcs x20,x12,x6 sbcs x21,x13,x7 sbcs x22,x14,x8 sbcs x23,x15,x9 sbcs x24,x16,x10 sbcs xzr,x25,xzr csel x19,x11,x19,lo csel x20,x12,x20,lo csel x21,x13,x21,lo ldp x11,x12,[sp] csel x22,x14,x22,lo ldr x17, [sp,#48] csel x23,x15,x23,lo ldp x13,x14,[sp,#16] csel x24,x16,x24,lo ldp x15,x16,[sp,#32] stp x19,x20,[x2,#48] stp x21,x22,[x2,#64] stp x23,x24,[x2,#80] add x2,sp,#48 bl __mul_mont_384 // mul_mont_384(ret->re, t0, t1) ldr x30,[x29,#__SIZEOF_POINTER__] stp x11,x12,[x2] stp x13,x14,[x2,#16] stp x15,x16,[x2,#32] add sp,sp,#96 ldp x19,x20,[x29,#2*__SIZEOF_POINTER__] ldp x21,x22,[x29,#4*__SIZEOF_POINTER__] ldp x23,x24,[x29,#6*__SIZEOF_POINTER__] ldp x25,x26,[x29,#8*__SIZEOF_POINTER__] ldp x27,x28,[x29,#10*__SIZEOF_POINTER__] ldr x29,[sp],#16*__SIZEOF_POINTER__ hint #29 ret .globl _mul_mont_384 .private_extern _mul_mont_384 .align 5 _mul_mont_384: hint #25 stp x29,x30,[sp,#-16*__SIZEOF_POINTER__]! 
add x29,sp,#0 stp x19,x20,[sp,#2*__SIZEOF_POINTER__] stp x21,x22,[sp,#4*__SIZEOF_POINTER__] stp x23,x24,[sp,#6*__SIZEOF_POINTER__] stp x25,x26,[sp,#8*__SIZEOF_POINTER__] stp x27,x28,[sp,#10*__SIZEOF_POINTER__] stp x4,x0,[sp,#12*__SIZEOF_POINTER__] // __mul_mont_384 wants them there ldp x11,x12,[x1] ldr x17, [x2] ldp x13,x14,[x1,#16] ldp x15,x16,[x1,#32] ldp x5,x6,[x3] ldp x7,x8,[x3,#16] ldp x9,x10,[x3,#32] bl __mul_mont_384 ldr x30,[x29,#__SIZEOF_POINTER__] stp x11,x12,[x2] stp x13,x14,[x2,#16] stp x15,x16,[x2,#32] ldp x19,x20,[x29,#2*__SIZEOF_POINTER__] ldp x21,x22,[x29,#4*__SIZEOF_POINTER__] ldp x23,x24,[x29,#6*__SIZEOF_POINTER__] ldp x25,x26,[x29,#8*__SIZEOF_POINTER__] ldp x27,x28,[x29,#10*__SIZEOF_POINTER__] ldr x29,[sp],#16*__SIZEOF_POINTER__ hint #29 ret .align 5 __mul_mont_384: mul x19,x11,x17 mul x20,x12,x17 mul x21,x13,x17 mul x22,x14,x17 mul x23,x15,x17 mul x24,x16,x17 mul x4,x4,x19 umulh x26,x11,x17 umulh x27,x12,x17 umulh x28,x13,x17 umulh x0,x14,x17 umulh x1,x15,x17 umulh x3,x16,x17 adds x20,x20,x26 // mul x26,x5,x4 adcs x21,x21,x27 mul x27,x6,x4 adcs x22,x22,x28 mul x28,x7,x4 adcs x23,x23,x0 mul x0,x8,x4 adcs x24,x24,x1 mul x1,x9,x4 adc x25,xzr, x3 mul x3,x10,x4 mov x17,xzr subs xzr,x19,#1 // adds x19,x19,x26 umulh x26,x5,x4 adcs x20,x20,x27 umulh x27,x6,x4 adcs x21,x21,x28 umulh x28,x7,x4 adcs x22,x22,x0 umulh x0,x8,x4 adcs x23,x23,x1 umulh x1,x9,x4 adcs x24,x24,x3 umulh x3,x10,x4 adcs x25,x25,xzr adc x4,x17,xzr ldr x17,[x2,8*1] adds x19,x20,x26 mul x26,x11,x17 adcs x20,x21,x27 mul x27,x12,x17 adcs x21,x22,x28 mul x28,x13,x17 adcs x22,x23,x0 mul x0,x14,x17 adcs x23,x24,x1 mul x1,x15,x17 adcs x24,x25,x3 mul x3,x16,x17 adc x25,x4,xzr ldr x4,[x29,#12*__SIZEOF_POINTER__] adds x19,x19,x26 umulh x26,x11,x17 adcs x20,x20,x27 umulh x27,x12,x17 adcs x21,x21,x28 mul x4,x4,x19 umulh x28,x13,x17 adcs x22,x22,x0 umulh x0,x14,x17 adcs x23,x23,x1 umulh x1,x15,x17 adcs x24,x24,x3 umulh x3,x16,x17 adcs x25,x25,xzr adc x17,xzr,xzr adds x20,x20,x26 // mul x26,x5,x4 adcs x21,x21,x27 mul x27,x6,x4 adcs x22,x22,x28 mul x28,x7,x4 adcs x23,x23,x0 mul x0,x8,x4 adcs x24,x24,x1 mul x1,x9,x4 adcs x25,x25,x3 mul x3,x10,x4 adc x17,x17,xzr subs xzr,x19,#1 // adds x19,x19,x26 umulh x26,x5,x4 adcs x20,x20,x27 umulh x27,x6,x4 adcs x21,x21,x28 umulh x28,x7,x4 adcs x22,x22,x0 umulh x0,x8,x4 adcs x23,x23,x1 umulh x1,x9,x4 adcs x24,x24,x3 umulh x3,x10,x4 adcs x25,x25,xzr adc x4,x17,xzr ldr x17,[x2,8*2] adds x19,x20,x26 mul x26,x11,x17 adcs x20,x21,x27 mul x27,x12,x17 adcs x21,x22,x28 mul x28,x13,x17 adcs x22,x23,x0 mul x0,x14,x17 adcs x23,x24,x1 mul x1,x15,x17 adcs x24,x25,x3 mul x3,x16,x17 adc x25,x4,xzr ldr x4,[x29,#12*__SIZEOF_POINTER__] adds x19,x19,x26 umulh x26,x11,x17 adcs x20,x20,x27 umulh x27,x12,x17 adcs x21,x21,x28 mul x4,x4,x19 umulh x28,x13,x17 adcs x22,x22,x0 umulh x0,x14,x17 adcs x23,x23,x1 umulh x1,x15,x17 adcs x24,x24,x3 umulh x3,x16,x17 adcs x25,x25,xzr adc x17,xzr,xzr adds x20,x20,x26 // mul x26,x5,x4 adcs x21,x21,x27 mul x27,x6,x4 adcs x22,x22,x28 mul x28,x7,x4 adcs x23,x23,x0 mul x0,x8,x4 adcs x24,x24,x1 mul x1,x9,x4 adcs x25,x25,x3 mul x3,x10,x4 adc x17,x17,xzr subs xzr,x19,#1 // adds x19,x19,x26 umulh x26,x5,x4 adcs x20,x20,x27 umulh x27,x6,x4 adcs x21,x21,x28 umulh x28,x7,x4 adcs x22,x22,x0 umulh x0,x8,x4 adcs x23,x23,x1 umulh x1,x9,x4 adcs x24,x24,x3 umulh x3,x10,x4 adcs x25,x25,xzr adc x4,x17,xzr ldr x17,[x2,8*3] adds x19,x20,x26 mul x26,x11,x17 adcs x20,x21,x27 mul x27,x12,x17 adcs x21,x22,x28 mul x28,x13,x17 adcs x22,x23,x0 mul x0,x14,x17 adcs x23,x24,x1 mul x1,x15,x17 adcs x24,x25,x3 
mul x3,x16,x17 adc x25,x4,xzr ldr x4,[x29,#12*__SIZEOF_POINTER__] adds x19,x19,x26 umulh x26,x11,x17 adcs x20,x20,x27 umulh x27,x12,x17 adcs x21,x21,x28 mul x4,x4,x19 umulh x28,x13,x17 adcs x22,x22,x0 umulh x0,x14,x17 adcs x23,x23,x1 umulh x1,x15,x17 adcs x24,x24,x3 umulh x3,x16,x17 adcs x25,x25,xzr adc x17,xzr,xzr adds x20,x20,x26 // mul x26,x5,x4 adcs x21,x21,x27 mul x27,x6,x4 adcs x22,x22,x28 mul x28,x7,x4 adcs x23,x23,x0 mul x0,x8,x4 adcs x24,x24,x1 mul x1,x9,x4 adcs x25,x25,x3 mul x3,x10,x4 adc x17,x17,xzr subs xzr,x19,#1 // adds x19,x19,x26 umulh x26,x5,x4 adcs x20,x20,x27 umulh x27,x6,x4 adcs x21,x21,x28 umulh x28,x7,x4 adcs x22,x22,x0 umulh x0,x8,x4 adcs x23,x23,x1 umulh x1,x9,x4 adcs x24,x24,x3 umulh x3,x10,x4 adcs x25,x25,xzr adc x4,x17,xzr ldr x17,[x2,8*4] adds x19,x20,x26 mul x26,x11,x17 adcs x20,x21,x27 mul x27,x12,x17 adcs x21,x22,x28 mul x28,x13,x17 adcs x22,x23,x0 mul x0,x14,x17 adcs x23,x24,x1 mul x1,x15,x17 adcs x24,x25,x3 mul x3,x16,x17 adc x25,x4,xzr ldr x4,[x29,#12*__SIZEOF_POINTER__] adds x19,x19,x26 umulh x26,x11,x17 adcs x20,x20,x27 umulh x27,x12,x17 adcs x21,x21,x28 mul x4,x4,x19 umulh x28,x13,x17 adcs x22,x22,x0 umulh x0,x14,x17 adcs x23,x23,x1 umulh x1,x15,x17 adcs x24,x24,x3 umulh x3,x16,x17 adcs x25,x25,xzr adc x17,xzr,xzr adds x20,x20,x26 // mul x26,x5,x4 adcs x21,x21,x27 mul x27,x6,x4 adcs x22,x22,x28 mul x28,x7,x4 adcs x23,x23,x0 mul x0,x8,x4 adcs x24,x24,x1 mul x1,x9,x4 adcs x25,x25,x3 mul x3,x10,x4 adc x17,x17,xzr subs xzr,x19,#1 // adds x19,x19,x26 umulh x26,x5,x4 adcs x20,x20,x27 umulh x27,x6,x4 adcs x21,x21,x28 umulh x28,x7,x4 adcs x22,x22,x0 umulh x0,x8,x4 adcs x23,x23,x1 umulh x1,x9,x4 adcs x24,x24,x3 umulh x3,x10,x4 adcs x25,x25,xzr adc x4,x17,xzr ldr x17,[x2,8*5] adds x19,x20,x26 mul x26,x11,x17 adcs x20,x21,x27 mul x27,x12,x17 adcs x21,x22,x28 mul x28,x13,x17 adcs x22,x23,x0 mul x0,x14,x17 adcs x23,x24,x1 mul x1,x15,x17 adcs x24,x25,x3 mul x3,x16,x17 adc x25,x4,xzr ldr x4,[x29,#12*__SIZEOF_POINTER__] adds x19,x19,x26 umulh x26,x11,x17 adcs x20,x20,x27 umulh x27,x12,x17 adcs x21,x21,x28 mul x4,x4,x19 umulh x28,x13,x17 adcs x22,x22,x0 umulh x0,x14,x17 adcs x23,x23,x1 umulh x1,x15,x17 adcs x24,x24,x3 umulh x3,x16,x17 adcs x25,x25,xzr adc x17,xzr,xzr adds x20,x20,x26 // mul x26,x5,x4 adcs x21,x21,x27 mul x27,x6,x4 adcs x22,x22,x28 mul x28,x7,x4 adcs x23,x23,x0 mul x0,x8,x4 adcs x24,x24,x1 mul x1,x9,x4 adcs x25,x25,x3 mul x3,x10,x4 adc x17,x17,xzr subs xzr,x19,#1 // adds x19,x19,x26 umulh x26,x5,x4 adcs x20,x20,x27 umulh x27,x6,x4 adcs x21,x21,x28 umulh x28,x7,x4 adcs x22,x22,x0 umulh x0,x8,x4 adcs x23,x23,x1 umulh x1,x9,x4 adcs x24,x24,x3 umulh x3,x10,x4 adcs x25,x25,xzr ldp x4,x2,[x29,#12*__SIZEOF_POINTER__] // pull r_ptr adc x17,x17,xzr adds x19,x20,x26 adcs x20,x21,x27 adcs x21,x22,x28 adcs x22,x23,x0 adcs x23,x24,x1 adcs x24,x25,x3 adc x25,x17,xzr subs x26,x19,x5 sbcs x27,x20,x6 sbcs x28,x21,x7 sbcs x0,x22,x8 sbcs x1,x23,x9 sbcs x3,x24,x10 sbcs xzr, x25,xzr csel x11,x19,x26,lo csel x12,x20,x27,lo csel x13,x21,x28,lo csel x14,x22,x0,lo csel x15,x23,x1,lo csel x16,x24,x3,lo ret .globl _sqr_mont_384 .private_extern _sqr_mont_384 .align 5 _sqr_mont_384: hint #25 stp x29,x30,[sp,#-16*__SIZEOF_POINTER__]! 
add x29,sp,#0 stp x19,x20,[sp,#2*__SIZEOF_POINTER__] stp x21,x22,[sp,#4*__SIZEOF_POINTER__] stp x23,x24,[sp,#6*__SIZEOF_POINTER__] stp x25,x26,[sp,#8*__SIZEOF_POINTER__] stp x27,x28,[sp,#10*__SIZEOF_POINTER__] sub sp,sp,#96 // space for 768-bit vector mov x4,x3 // adjust for missing b_ptr mov x3,x0 // save r_ptr mov x0,sp ldp x11,x12,[x1] ldp x13,x14,[x1,#16] ldp x15,x16,[x1,#32] bl __sqr_384 ldp x5,x6,[x2] ldp x7,x8,[x2,#16] ldp x9,x10,[x2,#32] mov x1,sp mov x0,x3 // restore r_ptr bl __mul_by_1_mont_384 bl __redc_tail_mont_384 ldr x30,[x29,#__SIZEOF_POINTER__] add sp,sp,#96 ldp x19,x20,[x29,#2*__SIZEOF_POINTER__] ldp x21,x22,[x29,#4*__SIZEOF_POINTER__] ldp x23,x24,[x29,#6*__SIZEOF_POINTER__] ldp x25,x26,[x29,#8*__SIZEOF_POINTER__] ldp x27,x28,[x29,#10*__SIZEOF_POINTER__] ldr x29,[sp],#16*__SIZEOF_POINTER__ hint #29 ret .globl _sqr_n_mul_mont_383 .private_extern _sqr_n_mul_mont_383 .align 5 _sqr_n_mul_mont_383: hint #25 stp x29,x30,[sp,#-16*__SIZEOF_POINTER__]! add x29,sp,#0 stp x19,x20,[sp,#2*__SIZEOF_POINTER__] stp x21,x22,[sp,#4*__SIZEOF_POINTER__] stp x23,x24,[sp,#6*__SIZEOF_POINTER__] stp x25,x26,[sp,#8*__SIZEOF_POINTER__] stp x27,x28,[sp,#10*__SIZEOF_POINTER__] stp x4,x0,[sp,#12*__SIZEOF_POINTER__] // __mul_mont_384 wants them there sub sp,sp,#96 // space for 768-bit vector mov x17,x5 // save b_ptr ldp x11,x12,[x1] ldp x13,x14,[x1,#16] ldp x15,x16,[x1,#32] mov x0,sp Loop_sqr_383: bl __sqr_384 sub x2,x2,#1 // counter ldp x5,x6,[x3] ldp x7,x8,[x3,#16] ldp x9,x10,[x3,#32] mov x1,sp bl __mul_by_1_mont_384 ldp x19,x20,[x1,#48] ldp x21,x22,[x1,#64] ldp x23,x24,[x1,#80] adds x11,x11,x19 // just accumulate upper half adcs x12,x12,x20 adcs x13,x13,x21 adcs x14,x14,x22 adcs x15,x15,x23 adc x16,x16,x24 cbnz x2,Loop_sqr_383 mov x2,x17 ldr x17,[x17] bl __mul_mont_384 ldr x30,[x29,#__SIZEOF_POINTER__] stp x11,x12,[x2] stp x13,x14,[x2,#16] stp x15,x16,[x2,#32] add sp,sp,#96 ldp x19,x20,[x29,#2*__SIZEOF_POINTER__] ldp x21,x22,[x29,#4*__SIZEOF_POINTER__] ldp x23,x24,[x29,#6*__SIZEOF_POINTER__] ldp x25,x26,[x29,#8*__SIZEOF_POINTER__] ldp x27,x28,[x29,#10*__SIZEOF_POINTER__] ldr x29,[sp],#16*__SIZEOF_POINTER__ hint #29 ret .align 5 __sqr_384: mul x19,x12,x11 mul x20,x13,x11 mul x21,x14,x11 mul x22,x15,x11 mul x23,x16,x11 umulh x6,x12,x11 umulh x7,x13,x11 umulh x8,x14,x11 umulh x9,x15,x11 adds x20,x20,x6 umulh x10,x16,x11 adcs x21,x21,x7 mul x7,x13,x12 adcs x22,x22,x8 mul x8,x14,x12 adcs x23,x23,x9 mul x9,x15,x12 adc x24,xzr, x10 mul x10,x16,x12 adds x21,x21,x7 umulh x7,x13,x12 adcs x22,x22,x8 umulh x8,x14,x12 adcs x23,x23,x9 umulh x9,x15,x12 adcs x24,x24,x10 umulh x10,x16,x12 adc x25,xzr,xzr mul x5,x11,x11 adds x22,x22,x7 umulh x11, x11,x11 adcs x23,x23,x8 mul x8,x14,x13 adcs x24,x24,x9 mul x9,x15,x13 adc x25,x25,x10 mul x10,x16,x13 adds x23,x23,x8 umulh x8,x14,x13 adcs x24,x24,x9 umulh x9,x15,x13 adcs x25,x25,x10 umulh x10,x16,x13 adc x26,xzr,xzr mul x6,x12,x12 adds x24,x24,x8 umulh x12, x12,x12 adcs x25,x25,x9 mul x9,x15,x14 adc x26,x26,x10 mul x10,x16,x14 adds x25,x25,x9 umulh x9,x15,x14 adcs x26,x26,x10 umulh x10,x16,x14 adc x27,xzr,xzr mul x7,x13,x13 adds x26,x26,x9 umulh x13, x13,x13 adc x27,x27,x10 mul x8,x14,x14 mul x10,x16,x15 umulh x14, x14,x14 adds x27,x27,x10 umulh x10,x16,x15 mul x9,x15,x15 adc x28,x10,xzr adds x19,x19,x19 adcs x20,x20,x20 adcs x21,x21,x21 adcs x22,x22,x22 adcs x23,x23,x23 adcs x24,x24,x24 adcs x25,x25,x25 adcs x26,x26,x26 umulh x15, x15,x15 adcs x27,x27,x27 mul x10,x16,x16 adcs x28,x28,x28 umulh x16, x16,x16 adc x1,xzr,xzr adds x19,x19,x11 adcs x20,x20,x6 adcs x21,x21,x12 
adcs x22,x22,x7 adcs x23,x23,x13 adcs x24,x24,x8 adcs x25,x25,x14 stp x5,x19,[x0] adcs x26,x26,x9 stp x20,x21,[x0,#16] adcs x27,x27,x15 stp x22,x23,[x0,#32] adcs x28,x28,x10 stp x24,x25,[x0,#48] adc x16,x16,x1 stp x26,x27,[x0,#64] stp x28,x16,[x0,#80] ret .globl _sqr_384 .private_extern _sqr_384 .align 5 _sqr_384: hint #25 stp x29,x30,[sp,#-16*__SIZEOF_POINTER__]! add x29,sp,#0 stp x19,x20,[sp,#2*__SIZEOF_POINTER__] stp x21,x22,[sp,#4*__SIZEOF_POINTER__] stp x23,x24,[sp,#6*__SIZEOF_POINTER__] stp x25,x26,[sp,#8*__SIZEOF_POINTER__] stp x27,x28,[sp,#10*__SIZEOF_POINTER__] ldp x11,x12,[x1] ldp x13,x14,[x1,#16] ldp x15,x16,[x1,#32] bl __sqr_384 ldr x30,[x29,#__SIZEOF_POINTER__] ldp x19,x20,[x29,#2*__SIZEOF_POINTER__] ldp x21,x22,[x29,#4*__SIZEOF_POINTER__] ldp x23,x24,[x29,#6*__SIZEOF_POINTER__] ldp x25,x26,[x29,#8*__SIZEOF_POINTER__] ldp x27,x28,[x29,#10*__SIZEOF_POINTER__] ldr x29,[sp],#16*__SIZEOF_POINTER__ hint #29 ret .globl _redc_mont_384 .private_extern _redc_mont_384 .align 5 _redc_mont_384: hint #25 stp x29,x30,[sp,#-16*__SIZEOF_POINTER__]! add x29,sp,#0 stp x19,x20,[sp,#2*__SIZEOF_POINTER__] stp x21,x22,[sp,#4*__SIZEOF_POINTER__] stp x23,x24,[sp,#6*__SIZEOF_POINTER__] stp x25,x26,[sp,#8*__SIZEOF_POINTER__] stp x27,x28,[sp,#10*__SIZEOF_POINTER__] mov x4,x3 // adjust for missing b_ptr ldp x5,x6,[x2] ldp x7,x8,[x2,#16] ldp x9,x10,[x2,#32] bl __mul_by_1_mont_384 bl __redc_tail_mont_384 ldr x30,[x29,#__SIZEOF_POINTER__] ldp x19,x20,[x29,#2*__SIZEOF_POINTER__] ldp x21,x22,[x29,#4*__SIZEOF_POINTER__] ldp x23,x24,[x29,#6*__SIZEOF_POINTER__] ldp x25,x26,[x29,#8*__SIZEOF_POINTER__] ldp x27,x28,[x29,#10*__SIZEOF_POINTER__] ldr x29,[sp],#16*__SIZEOF_POINTER__ hint #29 ret .globl _from_mont_384 .private_extern _from_mont_384 .align 5 _from_mont_384: hint #25 stp x29,x30,[sp,#-16*__SIZEOF_POINTER__]! 
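	// conversion out of Montgomery form, i.e. a*R^-1 mod p: a Montgomery
	// "multiplication by 1" via __mul_by_1_mont_384, followed by a final
	// conditional subtraction of the modulus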
add x29,sp,#0 stp x19,x20,[sp,#2*__SIZEOF_POINTER__] stp x21,x22,[sp,#4*__SIZEOF_POINTER__] stp x23,x24,[sp,#6*__SIZEOF_POINTER__] stp x25,x26,[sp,#8*__SIZEOF_POINTER__] stp x27,x28,[sp,#10*__SIZEOF_POINTER__] mov x4,x3 // adjust for missing b_ptr ldp x5,x6,[x2] ldp x7,x8,[x2,#16] ldp x9,x10,[x2,#32] bl __mul_by_1_mont_384 ldr x30,[x29,#__SIZEOF_POINTER__] subs x19,x11,x5 sbcs x20,x12,x6 sbcs x21,x13,x7 sbcs x22,x14,x8 sbcs x23,x15,x9 sbcs x24,x16,x10 csel x11,x11,x19,lo csel x12,x12,x20,lo csel x13,x13,x21,lo csel x14,x14,x22,lo csel x15,x15,x23,lo csel x16,x16,x24,lo stp x11,x12,[x0] stp x13,x14,[x0,#16] stp x15,x16,[x0,#32] ldp x19,x20,[x29,#2*__SIZEOF_POINTER__] ldp x21,x22,[x29,#4*__SIZEOF_POINTER__] ldp x23,x24,[x29,#6*__SIZEOF_POINTER__] ldp x25,x26,[x29,#8*__SIZEOF_POINTER__] ldp x27,x28,[x29,#10*__SIZEOF_POINTER__] ldr x29,[sp],#16*__SIZEOF_POINTER__ hint #29 ret .align 5 __mul_by_1_mont_384: ldp x11,x12,[x1] ldp x13,x14,[x1,#16] mul x26,x4,x11 ldp x15,x16,[x1,#32] // mul x19,x5,x26 mul x20,x6,x26 mul x21,x7,x26 mul x22,x8,x26 mul x23,x9,x26 mul x24,x10,x26 subs xzr,x11,#1 // adds x19,x19,x11 umulh x11,x5,x26 adcs x20,x20,x12 umulh x12,x6,x26 adcs x21,x21,x13 umulh x13,x7,x26 adcs x22,x22,x14 umulh x14,x8,x26 adcs x23,x23,x15 umulh x15,x9,x26 adcs x24,x24,x16 umulh x16,x10,x26 adc x25,xzr,xzr adds x11,x11,x20 adcs x12,x12,x21 adcs x13,x13,x22 mul x26,x4,x11 adcs x14,x14,x23 adcs x15,x15,x24 adc x16,x16,x25 // mul x19,x5,x26 mul x20,x6,x26 mul x21,x7,x26 mul x22,x8,x26 mul x23,x9,x26 mul x24,x10,x26 subs xzr,x11,#1 // adds x19,x19,x11 umulh x11,x5,x26 adcs x20,x20,x12 umulh x12,x6,x26 adcs x21,x21,x13 umulh x13,x7,x26 adcs x22,x22,x14 umulh x14,x8,x26 adcs x23,x23,x15 umulh x15,x9,x26 adcs x24,x24,x16 umulh x16,x10,x26 adc x25,xzr,xzr adds x11,x11,x20 adcs x12,x12,x21 adcs x13,x13,x22 mul x26,x4,x11 adcs x14,x14,x23 adcs x15,x15,x24 adc x16,x16,x25 // mul x19,x5,x26 mul x20,x6,x26 mul x21,x7,x26 mul x22,x8,x26 mul x23,x9,x26 mul x24,x10,x26 subs xzr,x11,#1 // adds x19,x19,x11 umulh x11,x5,x26 adcs x20,x20,x12 umulh x12,x6,x26 adcs x21,x21,x13 umulh x13,x7,x26 adcs x22,x22,x14 umulh x14,x8,x26 adcs x23,x23,x15 umulh x15,x9,x26 adcs x24,x24,x16 umulh x16,x10,x26 adc x25,xzr,xzr adds x11,x11,x20 adcs x12,x12,x21 adcs x13,x13,x22 mul x26,x4,x11 adcs x14,x14,x23 adcs x15,x15,x24 adc x16,x16,x25 // mul x19,x5,x26 mul x20,x6,x26 mul x21,x7,x26 mul x22,x8,x26 mul x23,x9,x26 mul x24,x10,x26 subs xzr,x11,#1 // adds x19,x19,x11 umulh x11,x5,x26 adcs x20,x20,x12 umulh x12,x6,x26 adcs x21,x21,x13 umulh x13,x7,x26 adcs x22,x22,x14 umulh x14,x8,x26 adcs x23,x23,x15 umulh x15,x9,x26 adcs x24,x24,x16 umulh x16,x10,x26 adc x25,xzr,xzr adds x11,x11,x20 adcs x12,x12,x21 adcs x13,x13,x22 mul x26,x4,x11 adcs x14,x14,x23 adcs x15,x15,x24 adc x16,x16,x25 // mul x19,x5,x26 mul x20,x6,x26 mul x21,x7,x26 mul x22,x8,x26 mul x23,x9,x26 mul x24,x10,x26 subs xzr,x11,#1 // adds x19,x19,x11 umulh x11,x5,x26 adcs x20,x20,x12 umulh x12,x6,x26 adcs x21,x21,x13 umulh x13,x7,x26 adcs x22,x22,x14 umulh x14,x8,x26 adcs x23,x23,x15 umulh x15,x9,x26 adcs x24,x24,x16 umulh x16,x10,x26 adc x25,xzr,xzr adds x11,x11,x20 adcs x12,x12,x21 adcs x13,x13,x22 mul x26,x4,x11 adcs x14,x14,x23 adcs x15,x15,x24 adc x16,x16,x25 // mul x19,x5,x26 mul x20,x6,x26 mul x21,x7,x26 mul x22,x8,x26 mul x23,x9,x26 mul x24,x10,x26 subs xzr,x11,#1 // adds x19,x19,x11 umulh x11,x5,x26 adcs x20,x20,x12 umulh x12,x6,x26 adcs x21,x21,x13 umulh x13,x7,x26 adcs x22,x22,x14 umulh x14,x8,x26 adcs x23,x23,x15 umulh x15,x9,x26 adcs x24,x24,x16 umulh 
x16,x10,x26 adc x25,xzr,xzr adds x11,x11,x20 adcs x12,x12,x21 adcs x13,x13,x22 adcs x14,x14,x23 adcs x15,x15,x24 adc x16,x16,x25 ret .align 5 __redc_tail_mont_384: ldp x19,x20,[x1,#48] ldp x21,x22,[x1,#64] ldp x23,x24,[x1,#80] adds x11,x11,x19 // accumulate upper half adcs x12,x12,x20 adcs x13,x13,x21 adcs x14,x14,x22 adcs x15,x15,x23 adcs x16,x16,x24 adc x25,xzr,xzr subs x19,x11,x5 sbcs x20,x12,x6 sbcs x21,x13,x7 sbcs x22,x14,x8 sbcs x23,x15,x9 sbcs x24,x16,x10 sbcs xzr,x25,xzr csel x11,x11,x19,lo csel x12,x12,x20,lo csel x13,x13,x21,lo csel x14,x14,x22,lo csel x15,x15,x23,lo csel x16,x16,x24,lo stp x11,x12,[x0] stp x13,x14,[x0,#16] stp x15,x16,[x0,#32] ret .globl _mul_384 .private_extern _mul_384 .align 5 _mul_384: hint #25 stp x29,x30,[sp,#-16*__SIZEOF_POINTER__]! add x29,sp,#0 stp x19,x20,[sp,#2*__SIZEOF_POINTER__] stp x21,x22,[sp,#4*__SIZEOF_POINTER__] stp x23,x24,[sp,#6*__SIZEOF_POINTER__] stp x25,x26,[sp,#8*__SIZEOF_POINTER__] stp x27,x28,[sp,#10*__SIZEOF_POINTER__] bl __mul_384 ldr x30,[x29,#__SIZEOF_POINTER__] ldp x19,x20,[x29,#2*__SIZEOF_POINTER__] ldp x21,x22,[x29,#4*__SIZEOF_POINTER__] ldp x23,x24,[x29,#6*__SIZEOF_POINTER__] ldp x25,x26,[x29,#8*__SIZEOF_POINTER__] ldp x27,x28,[x29,#10*__SIZEOF_POINTER__] ldr x29,[sp],#16*__SIZEOF_POINTER__ hint #29 ret .align 5 __mul_384: ldp x11,x12,[x1] ldr x17, [x2] ldp x13,x14,[x1,#16] ldp x15,x16,[x1,#32] mul x19,x11,x17 mul x20,x12,x17 mul x21,x13,x17 mul x22,x14,x17 mul x23,x15,x17 mul x24,x16,x17 umulh x5,x11,x17 umulh x6,x12,x17 umulh x7,x13,x17 umulh x8,x14,x17 umulh x9,x15,x17 umulh x10,x16,x17 ldr x17,[x2,8*1] str x19,[x0] adds x19,x20,x5 mul x5,x11,x17 adcs x20,x21,x6 mul x6,x12,x17 adcs x21,x22,x7 mul x7,x13,x17 adcs x22,x23,x8 mul x8,x14,x17 adcs x23,x24,x9 mul x9,x15,x17 adc x24,xzr, x10 mul x10,x16,x17 adds x19,x19,x5 umulh x5,x11,x17 adcs x20,x20,x6 umulh x6,x12,x17 adcs x21,x21,x7 umulh x7,x13,x17 adcs x22,x22,x8 umulh x8,x14,x17 adcs x23,x23,x9 umulh x9,x15,x17 adcs x24,x24,x10 umulh x10,x16,x17 ldr x17,[x2,#8*(1+1)] adc x25,xzr,xzr str x19,[x0,8*1] adds x19,x20,x5 mul x5,x11,x17 adcs x20,x21,x6 mul x6,x12,x17 adcs x21,x22,x7 mul x7,x13,x17 adcs x22,x23,x8 mul x8,x14,x17 adcs x23,x24,x9 mul x9,x15,x17 adc x24,x25,x10 mul x10,x16,x17 adds x19,x19,x5 umulh x5,x11,x17 adcs x20,x20,x6 umulh x6,x12,x17 adcs x21,x21,x7 umulh x7,x13,x17 adcs x22,x22,x8 umulh x8,x14,x17 adcs x23,x23,x9 umulh x9,x15,x17 adcs x24,x24,x10 umulh x10,x16,x17 ldr x17,[x2,#8*(2+1)] adc x25,xzr,xzr str x19,[x0,8*2] adds x19,x20,x5 mul x5,x11,x17 adcs x20,x21,x6 mul x6,x12,x17 adcs x21,x22,x7 mul x7,x13,x17 adcs x22,x23,x8 mul x8,x14,x17 adcs x23,x24,x9 mul x9,x15,x17 adc x24,x25,x10 mul x10,x16,x17 adds x19,x19,x5 umulh x5,x11,x17 adcs x20,x20,x6 umulh x6,x12,x17 adcs x21,x21,x7 umulh x7,x13,x17 adcs x22,x22,x8 umulh x8,x14,x17 adcs x23,x23,x9 umulh x9,x15,x17 adcs x24,x24,x10 umulh x10,x16,x17 ldr x17,[x2,#8*(3+1)] adc x25,xzr,xzr str x19,[x0,8*3] adds x19,x20,x5 mul x5,x11,x17 adcs x20,x21,x6 mul x6,x12,x17 adcs x21,x22,x7 mul x7,x13,x17 adcs x22,x23,x8 mul x8,x14,x17 adcs x23,x24,x9 mul x9,x15,x17 adc x24,x25,x10 mul x10,x16,x17 adds x19,x19,x5 umulh x5,x11,x17 adcs x20,x20,x6 umulh x6,x12,x17 adcs x21,x21,x7 umulh x7,x13,x17 adcs x22,x22,x8 umulh x8,x14,x17 adcs x23,x23,x9 umulh x9,x15,x17 adcs x24,x24,x10 umulh x10,x16,x17 ldr x17,[x2,#8*(4+1)] adc x25,xzr,xzr str x19,[x0,8*4] adds x19,x20,x5 mul x5,x11,x17 adcs x20,x21,x6 mul x6,x12,x17 adcs x21,x22,x7 mul x7,x13,x17 adcs x22,x23,x8 mul x8,x14,x17 adcs x23,x24,x9 mul x9,x15,x17 adc x24,x25,x10 mul 
x10,x16,x17 adds x19,x19,x5 umulh x5,x11,x17 adcs x20,x20,x6 umulh x6,x12,x17 adcs x21,x21,x7 umulh x7,x13,x17 adcs x22,x22,x8 umulh x8,x14,x17 adcs x23,x23,x9 umulh x9,x15,x17 adcs x24,x24,x10 umulh x10,x16,x17 adc x25,xzr,xzr str x19,[x0,8*5] adds x19,x20,x5 adcs x20,x21,x6 adcs x21,x22,x7 adcs x22,x23,x8 adcs x23,x24,x9 adc x24,x25,x10 stp x19,x20,[x0,#48] stp x21,x22,[x0,#64] stp x23,x24,[x0,#80] ret .globl _mul_382x .private_extern _mul_382x .align 5 _mul_382x: hint #25 stp x29,x30,[sp,#-16*__SIZEOF_POINTER__]! add x29,sp,#0 stp x19,x20,[sp,#2*__SIZEOF_POINTER__] stp x21,x22,[sp,#4*__SIZEOF_POINTER__] stp x23,x24,[sp,#6*__SIZEOF_POINTER__] stp x25,x26,[sp,#8*__SIZEOF_POINTER__] stp x27,x28,[sp,#10*__SIZEOF_POINTER__] sub sp,sp,#96 // space for two 384-bit vectors ldp x11,x12,[x1] mov x26,x0 // save r_ptr ldp x19,x20,[x1,#48] mov x27,x1 // save a_ptr ldp x13,x14,[x1,#16] mov x28,x2 // save b_ptr ldp x21,x22,[x1,#64] ldp x15,x16,[x1,#32] adds x5,x11,x19 // t0 = a->re + a->im ldp x23,x24,[x1,#80] adcs x6,x12,x20 ldp x11,x12,[x2] adcs x7,x13,x21 ldp x19,x20,[x2,#48] adcs x8,x14,x22 ldp x13,x14,[x2,#16] adcs x9,x15,x23 ldp x21,x22,[x2,#64] adc x10,x16,x24 ldp x15,x16,[x2,#32] stp x5,x6,[sp] adds x5,x11,x19 // t1 = b->re + b->im ldp x23,x24,[x2,#80] adcs x6,x12,x20 stp x7,x8,[sp,#16] adcs x7,x13,x21 adcs x8,x14,x22 stp x9,x10,[sp,#32] adcs x9,x15,x23 stp x5,x6,[sp,#48] adc x10,x16,x24 stp x7,x8,[sp,#64] stp x9,x10,[sp,#80] bl __mul_384 // _mul_384(ret->re, a->re, b->re) add x1,sp,#0 add x2,sp,#48 add x0,x26,#96 bl __mul_384 add x1,x27,#48 add x2,x28,#48 add x0,sp,#0 bl __mul_384 ldp x5,x6,[x3] ldp x7,x8,[x3,#16] ldp x9,x10,[x3,#32] add x1,x26,#96 add x2,sp,#0 add x0,x26,#96 bl __sub_mod_384x384 add x2,x26,#0 bl __sub_mod_384x384 add x1,x26,#0 add x2,sp,#0 add x0,x26,#0 bl __sub_mod_384x384 ldr x30,[x29,#__SIZEOF_POINTER__] add sp,sp,#96 ldp x19,x20,[x29,#2*__SIZEOF_POINTER__] ldp x21,x22,[x29,#4*__SIZEOF_POINTER__] ldp x23,x24,[x29,#6*__SIZEOF_POINTER__] ldp x25,x26,[x29,#8*__SIZEOF_POINTER__] ldp x27,x28,[x29,#10*__SIZEOF_POINTER__] ldr x29,[sp],#16*__SIZEOF_POINTER__ hint #29 ret .globl _sqr_382x .private_extern _sqr_382x .align 5 _sqr_382x: hint #25 stp x29,x30,[sp,#-16*__SIZEOF_POINTER__]! 
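	// complex squaring over Fp2: ret->re = (a->re+a->im)*(a->re-a->im)
	// and ret->im = 2*a->re*a->im, costing two multiplications instead
	// of the three a general mul_382x needs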
add x29,sp,#0 stp x19,x20,[sp,#2*__SIZEOF_POINTER__] stp x21,x22,[sp,#4*__SIZEOF_POINTER__] stp x23,x24,[sp,#6*__SIZEOF_POINTER__] stp x25,x26,[sp,#8*__SIZEOF_POINTER__] stp x27,x28,[sp,#10*__SIZEOF_POINTER__] ldp x11,x12,[x1] ldp x19,x20,[x1,#48] ldp x13,x14,[x1,#16] adds x5,x11,x19 // t0 = a->re + a->im ldp x21,x22,[x1,#64] adcs x6,x12,x20 ldp x15,x16,[x1,#32] adcs x7,x13,x21 ldp x23,x24,[x1,#80] adcs x8,x14,x22 stp x5,x6,[x0] adcs x9,x15,x23 ldp x5,x6,[x2] adc x10,x16,x24 stp x7,x8,[x0,#16] subs x11,x11,x19 // t1 = a->re - a->im ldp x7,x8,[x2,#16] sbcs x12,x12,x20 stp x9,x10,[x0,#32] sbcs x13,x13,x21 ldp x9,x10,[x2,#32] sbcs x14,x14,x22 sbcs x15,x15,x23 sbcs x16,x16,x24 sbc x25,xzr,xzr and x19,x5,x25 and x20,x6,x25 adds x11,x11,x19 and x21,x7,x25 adcs x12,x12,x20 and x22,x8,x25 adcs x13,x13,x21 and x23,x9,x25 adcs x14,x14,x22 and x24,x10,x25 adcs x15,x15,x23 stp x11,x12,[x0,#48] adc x16,x16,x24 stp x13,x14,[x0,#64] stp x15,x16,[x0,#80] mov x4,x1 // save a_ptr add x1,x0,#0 add x2,x0,#48 bl __mul_384 add x1,x4,#0 add x2,x4,#48 add x0,x0,#96 bl __mul_384 ldr x30,[x29,#__SIZEOF_POINTER__] ldp x11,x12,[x0] ldp x13,x14,[x0,#16] adds x11,x11,x11 // add with itself ldp x15,x16,[x0,#32] adcs x12,x12,x12 adcs x13,x13,x13 adcs x14,x14,x14 adcs x15,x15,x15 adcs x16,x16,x16 adcs x19,x19,x19 adcs x20,x20,x20 stp x11,x12,[x0] adcs x21,x21,x21 stp x13,x14,[x0,#16] adcs x22,x22,x22 stp x15,x16,[x0,#32] adcs x23,x23,x23 stp x19,x20,[x0,#48] adc x24,x24,x24 stp x21,x22,[x0,#64] stp x23,x24,[x0,#80] ldp x19,x20,[x29,#2*__SIZEOF_POINTER__] ldp x21,x22,[x29,#4*__SIZEOF_POINTER__] ldp x23,x24,[x29,#6*__SIZEOF_POINTER__] ldp x25,x26,[x29,#8*__SIZEOF_POINTER__] ldp x27,x28,[x29,#10*__SIZEOF_POINTER__] ldr x29,[sp],#16*__SIZEOF_POINTER__ hint #29 ret .globl _sqr_mont_382x .private_extern _sqr_mont_382x .align 5 _sqr_mont_382x: hint #25 stp x29,x30,[sp,#-16*__SIZEOF_POINTER__]! 
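	// Montgomery counterpart of sqr_382x: t0 = a->re + a->im and
	// t1 = a->re - a->im are formed first (the borrow kept as a mask),
	// both products go through __mul_mont_383_nonred, and the sign of
	// the t0*t1 product is corrected at the end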
add x29,sp,#0 stp x19,x20,[sp,#2*__SIZEOF_POINTER__] stp x21,x22,[sp,#4*__SIZEOF_POINTER__] stp x23,x24,[sp,#6*__SIZEOF_POINTER__] stp x25,x26,[sp,#8*__SIZEOF_POINTER__] stp x27,x28,[sp,#10*__SIZEOF_POINTER__] stp x3,x0,[sp,#12*__SIZEOF_POINTER__] // __mul_mont_384 wants them there sub sp,sp,#112 // space for two 384-bit vectors + word mov x4,x3 // adjust for missing b_ptr ldp x11,x12,[x1] ldp x13,x14,[x1,#16] ldp x15,x16,[x1,#32] ldp x17,x20,[x1,#48] ldp x21,x22,[x1,#64] ldp x23,x24,[x1,#80] adds x5,x11,x17 // t0 = a->re + a->im adcs x6,x12,x20 adcs x7,x13,x21 adcs x8,x14,x22 adcs x9,x15,x23 adc x10,x16,x24 subs x19,x11,x17 // t1 = a->re - a->im sbcs x20,x12,x20 sbcs x21,x13,x21 sbcs x22,x14,x22 sbcs x23,x15,x23 sbcs x24,x16,x24 sbc x25,xzr,xzr // borrow flag as mask stp x5,x6,[sp] stp x7,x8,[sp,#16] stp x9,x10,[sp,#32] stp x19,x20,[sp,#48] stp x21,x22,[sp,#64] stp x23,x24,[sp,#80] str x25,[sp,#96] ldp x5,x6,[x2] ldp x7,x8,[x2,#16] ldp x9,x10,[x2,#32] add x2,x1,#48 bl __mul_mont_383_nonred // _mul_mont_384(ret->im, a->re, a->im) adds x19,x11,x11 // add with itself adcs x20,x12,x12 adcs x21,x13,x13 adcs x22,x14,x14 adcs x23,x15,x15 adc x24,x16,x16 stp x19,x20,[x2,#48] stp x21,x22,[x2,#64] stp x23,x24,[x2,#80] ldp x11,x12,[sp] ldr x17,[sp,#48] ldp x13,x14,[sp,#16] ldp x15,x16,[sp,#32] add x2,sp,#48 bl __mul_mont_383_nonred // _mul_mont_384(ret->im, t0, t1) ldr x30,[x29,#__SIZEOF_POINTER__] ldr x25,[sp,#96] // account for sign from a->re - a->im ldp x19,x20,[sp] ldp x21,x22,[sp,#16] ldp x23,x24,[sp,#32] and x19,x19,x25 and x20,x20,x25 and x21,x21,x25 and x22,x22,x25 and x23,x23,x25 and x24,x24,x25 subs x11,x11,x19 sbcs x12,x12,x20 sbcs x13,x13,x21 sbcs x14,x14,x22 sbcs x15,x15,x23 sbcs x16,x16,x24 sbc x25,xzr,xzr and x19,x5,x25 and x20,x6,x25 and x21,x7,x25 and x22,x8,x25 and x23,x9,x25 and x24,x10,x25 adds x11,x11,x19 adcs x12,x12,x20 adcs x13,x13,x21 adcs x14,x14,x22 adcs x15,x15,x23 adc x16,x16,x24 stp x11,x12,[x2] stp x13,x14,[x2,#16] stp x15,x16,[x2,#32] add sp,sp,#112 ldp x19,x20,[x29,#2*__SIZEOF_POINTER__] ldp x21,x22,[x29,#4*__SIZEOF_POINTER__] ldp x23,x24,[x29,#6*__SIZEOF_POINTER__] ldp x25,x26,[x29,#8*__SIZEOF_POINTER__] ldp x27,x28,[x29,#10*__SIZEOF_POINTER__] ldr x29,[sp],#16*__SIZEOF_POINTER__ hint #29 ret .align 5 __mul_mont_383_nonred: mul x19,x11,x17 mul x20,x12,x17 mul x21,x13,x17 mul x22,x14,x17 mul x23,x15,x17 mul x24,x16,x17 mul x4,x4,x19 umulh x26,x11,x17 umulh x27,x12,x17 umulh x28,x13,x17 umulh x0,x14,x17 umulh x1,x15,x17 umulh x3,x16,x17 adds x20,x20,x26 mul x26,x5,x4 adcs x21,x21,x27 mul x27,x6,x4 adcs x22,x22,x28 mul x28,x7,x4 adcs x23,x23,x0 mul x0,x8,x4 adcs x24,x24,x1 mul x1,x9,x4 adc x25,xzr, x3 mul x3,x10,x4 ldr x17,[x2,8*1] adds x19,x19,x26 umulh x26,x5,x4 adcs x20,x20,x27 umulh x27,x6,x4 adcs x21,x21,x28 umulh x28,x7,x4 adcs x22,x22,x0 umulh x0,x8,x4 adcs x23,x23,x1 umulh x1,x9,x4 adcs x24,x24,x3 umulh x3,x10,x4 adc x25,x25,xzr ldr x4,[x29,#12*__SIZEOF_POINTER__] adds x19,x20,x26 mul x26,x11,x17 adcs x20,x21,x27 mul x27,x12,x17 adcs x21,x22,x28 mul x28,x13,x17 adcs x22,x23,x0 mul x0,x14,x17 adcs x23,x24,x1 mul x1,x15,x17 adcs x24,x25,x3 mul x3,x16,x17 adc x25,xzr,xzr adds x19,x19,x26 umulh x26,x11,x17 adcs x20,x20,x27 umulh x27,x12,x17 adcs x21,x21,x28 mul x4,x4,x19 umulh x28,x13,x17 adcs x22,x22,x0 umulh x0,x14,x17 adcs x23,x23,x1 umulh x1,x15,x17 adcs x24,x24,x3 umulh x3,x16,x17 adc x25,x25,xzr adds x20,x20,x26 mul x26,x5,x4 adcs x21,x21,x27 mul x27,x6,x4 adcs x22,x22,x28 mul x28,x7,x4 adcs x23,x23,x0 mul x0,x8,x4 adcs x24,x24,x1 mul x1,x9,x4 adc x25,x25,x3 
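	// each round of __mul_mont_383_nonred interleaves one word of the
	// schoolbook product a*b[i] with one Montgomery reduction step:
	// x4 = t[0]*n0 mod 2^64, and adding x4*modulus clears the lowest
	// accumulator word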
mul x3,x10,x4 ldr x17,[x2,8*2] adds x19,x19,x26 umulh x26,x5,x4 adcs x20,x20,x27 umulh x27,x6,x4 adcs x21,x21,x28 umulh x28,x7,x4 adcs x22,x22,x0 umulh x0,x8,x4 adcs x23,x23,x1 umulh x1,x9,x4 adcs x24,x24,x3 umulh x3,x10,x4 adc x25,x25,xzr ldr x4,[x29,#12*__SIZEOF_POINTER__] adds x19,x20,x26 mul x26,x11,x17 adcs x20,x21,x27 mul x27,x12,x17 adcs x21,x22,x28 mul x28,x13,x17 adcs x22,x23,x0 mul x0,x14,x17 adcs x23,x24,x1 mul x1,x15,x17 adcs x24,x25,x3 mul x3,x16,x17 adc x25,xzr,xzr adds x19,x19,x26 umulh x26,x11,x17 adcs x20,x20,x27 umulh x27,x12,x17 adcs x21,x21,x28 mul x4,x4,x19 umulh x28,x13,x17 adcs x22,x22,x0 umulh x0,x14,x17 adcs x23,x23,x1 umulh x1,x15,x17 adcs x24,x24,x3 umulh x3,x16,x17 adc x25,x25,xzr adds x20,x20,x26 mul x26,x5,x4 adcs x21,x21,x27 mul x27,x6,x4 adcs x22,x22,x28 mul x28,x7,x4 adcs x23,x23,x0 mul x0,x8,x4 adcs x24,x24,x1 mul x1,x9,x4 adc x25,x25,x3 mul x3,x10,x4 ldr x17,[x2,8*3] adds x19,x19,x26 umulh x26,x5,x4 adcs x20,x20,x27 umulh x27,x6,x4 adcs x21,x21,x28 umulh x28,x7,x4 adcs x22,x22,x0 umulh x0,x8,x4 adcs x23,x23,x1 umulh x1,x9,x4 adcs x24,x24,x3 umulh x3,x10,x4 adc x25,x25,xzr ldr x4,[x29,#12*__SIZEOF_POINTER__] adds x19,x20,x26 mul x26,x11,x17 adcs x20,x21,x27 mul x27,x12,x17 adcs x21,x22,x28 mul x28,x13,x17 adcs x22,x23,x0 mul x0,x14,x17 adcs x23,x24,x1 mul x1,x15,x17 adcs x24,x25,x3 mul x3,x16,x17 adc x25,xzr,xzr adds x19,x19,x26 umulh x26,x11,x17 adcs x20,x20,x27 umulh x27,x12,x17 adcs x21,x21,x28 mul x4,x4,x19 umulh x28,x13,x17 adcs x22,x22,x0 umulh x0,x14,x17 adcs x23,x23,x1 umulh x1,x15,x17 adcs x24,x24,x3 umulh x3,x16,x17 adc x25,x25,xzr adds x20,x20,x26 mul x26,x5,x4 adcs x21,x21,x27 mul x27,x6,x4 adcs x22,x22,x28 mul x28,x7,x4 adcs x23,x23,x0 mul x0,x8,x4 adcs x24,x24,x1 mul x1,x9,x4 adc x25,x25,x3 mul x3,x10,x4 ldr x17,[x2,8*4] adds x19,x19,x26 umulh x26,x5,x4 adcs x20,x20,x27 umulh x27,x6,x4 adcs x21,x21,x28 umulh x28,x7,x4 adcs x22,x22,x0 umulh x0,x8,x4 adcs x23,x23,x1 umulh x1,x9,x4 adcs x24,x24,x3 umulh x3,x10,x4 adc x25,x25,xzr ldr x4,[x29,#12*__SIZEOF_POINTER__] adds x19,x20,x26 mul x26,x11,x17 adcs x20,x21,x27 mul x27,x12,x17 adcs x21,x22,x28 mul x28,x13,x17 adcs x22,x23,x0 mul x0,x14,x17 adcs x23,x24,x1 mul x1,x15,x17 adcs x24,x25,x3 mul x3,x16,x17 adc x25,xzr,xzr adds x19,x19,x26 umulh x26,x11,x17 adcs x20,x20,x27 umulh x27,x12,x17 adcs x21,x21,x28 mul x4,x4,x19 umulh x28,x13,x17 adcs x22,x22,x0 umulh x0,x14,x17 adcs x23,x23,x1 umulh x1,x15,x17 adcs x24,x24,x3 umulh x3,x16,x17 adc x25,x25,xzr adds x20,x20,x26 mul x26,x5,x4 adcs x21,x21,x27 mul x27,x6,x4 adcs x22,x22,x28 mul x28,x7,x4 adcs x23,x23,x0 mul x0,x8,x4 adcs x24,x24,x1 mul x1,x9,x4 adc x25,x25,x3 mul x3,x10,x4 ldr x17,[x2,8*5] adds x19,x19,x26 umulh x26,x5,x4 adcs x20,x20,x27 umulh x27,x6,x4 adcs x21,x21,x28 umulh x28,x7,x4 adcs x22,x22,x0 umulh x0,x8,x4 adcs x23,x23,x1 umulh x1,x9,x4 adcs x24,x24,x3 umulh x3,x10,x4 adc x25,x25,xzr ldr x4,[x29,#12*__SIZEOF_POINTER__] adds x19,x20,x26 mul x26,x11,x17 adcs x20,x21,x27 mul x27,x12,x17 adcs x21,x22,x28 mul x28,x13,x17 adcs x22,x23,x0 mul x0,x14,x17 adcs x23,x24,x1 mul x1,x15,x17 adcs x24,x25,x3 mul x3,x16,x17 adc x25,xzr,xzr adds x19,x19,x26 umulh x26,x11,x17 adcs x20,x20,x27 umulh x27,x12,x17 adcs x21,x21,x28 mul x4,x4,x19 umulh x28,x13,x17 adcs x22,x22,x0 umulh x0,x14,x17 adcs x23,x23,x1 umulh x1,x15,x17 adcs x24,x24,x3 umulh x3,x16,x17 adc x25,x25,xzr adds x20,x20,x26 mul x26,x5,x4 adcs x21,x21,x27 mul x27,x6,x4 adcs x22,x22,x28 mul x28,x7,x4 adcs x23,x23,x0 mul x0,x8,x4 adcs x24,x24,x1 mul x1,x9,x4 adc x25,x25,x3 mul x3,x10,x4 adds 
x19,x19,x26 umulh x26,x5,x4 adcs x20,x20,x27 umulh x27,x6,x4 adcs x21,x21,x28 umulh x28,x7,x4 adcs x22,x22,x0 umulh x0,x8,x4 adcs x23,x23,x1 umulh x1,x9,x4 adcs x24,x24,x3 umulh x3,x10,x4 adc x25,x25,xzr ldp x4,x2,[x29,#12*__SIZEOF_POINTER__] // pull r_ptr adds x11,x20,x26 adcs x12,x21,x27 adcs x13,x22,x28 adcs x14,x23,x0 adcs x15,x24,x1 adcs x16,x25,x3 ret .globl _sgn0_pty_mont_384 .private_extern _sgn0_pty_mont_384 .align 5 _sgn0_pty_mont_384: hint #25 stp x29,x30,[sp,#-16*__SIZEOF_POINTER__]! add x29,sp,#0 stp x19,x20,[sp,#2*__SIZEOF_POINTER__] stp x21,x22,[sp,#4*__SIZEOF_POINTER__] stp x23,x24,[sp,#6*__SIZEOF_POINTER__] stp x25,x26,[sp,#8*__SIZEOF_POINTER__] stp x27,x28,[sp,#10*__SIZEOF_POINTER__] mov x4,x2 ldp x5,x6,[x1] ldp x7,x8,[x1,#16] ldp x9,x10,[x1,#32] mov x1,x0 bl __mul_by_1_mont_384 ldr x30,[x29,#__SIZEOF_POINTER__] and x0,x11,#1 adds x11,x11,x11 adcs x12,x12,x12 adcs x13,x13,x13 adcs x14,x14,x14 adcs x15,x15,x15 adcs x16,x16,x16 adc x17,xzr,xzr subs x11,x11,x5 sbcs x12,x12,x6 sbcs x13,x13,x7 sbcs x14,x14,x8 sbcs x15,x15,x9 sbcs x16,x16,x10 sbc x17,x17,xzr mvn x17,x17 and x17,x17,#2 orr x0,x0,x17 ldp x19,x20,[x29,#2*__SIZEOF_POINTER__] ldp x21,x22,[x29,#4*__SIZEOF_POINTER__] ldp x23,x24,[x29,#6*__SIZEOF_POINTER__] ldp x25,x26,[x29,#8*__SIZEOF_POINTER__] ldp x27,x28,[x29,#10*__SIZEOF_POINTER__] ldr x29,[sp],#16*__SIZEOF_POINTER__ hint #29 ret .globl _sgn0_pty_mont_384x .private_extern _sgn0_pty_mont_384x .align 5 _sgn0_pty_mont_384x: hint #25 stp x29,x30,[sp,#-16*__SIZEOF_POINTER__]! add x29,sp,#0 stp x19,x20,[sp,#2*__SIZEOF_POINTER__] stp x21,x22,[sp,#4*__SIZEOF_POINTER__] stp x23,x24,[sp,#6*__SIZEOF_POINTER__] stp x25,x26,[sp,#8*__SIZEOF_POINTER__] stp x27,x28,[sp,#10*__SIZEOF_POINTER__] mov x4,x2 ldp x5,x6,[x1] ldp x7,x8,[x1,#16] ldp x9,x10,[x1,#32] mov x1,x0 bl __mul_by_1_mont_384 add x1,x1,#48 and x2,x11,#1 orr x3,x11,x12 adds x11,x11,x11 orr x3,x3,x13 adcs x12,x12,x12 orr x3,x3,x14 adcs x13,x13,x13 orr x3,x3,x15 adcs x14,x14,x14 orr x3,x3,x16 adcs x15,x15,x15 adcs x16,x16,x16 adc x17,xzr,xzr subs x11,x11,x5 sbcs x12,x12,x6 sbcs x13,x13,x7 sbcs x14,x14,x8 sbcs x15,x15,x9 sbcs x16,x16,x10 sbc x17,x17,xzr mvn x17,x17 and x17,x17,#2 orr x2,x2,x17 bl __mul_by_1_mont_384 ldr x30,[x29,#__SIZEOF_POINTER__] and x0,x11,#1 orr x1,x11,x12 adds x11,x11,x11 orr x1,x1,x13 adcs x12,x12,x12 orr x1,x1,x14 adcs x13,x13,x13 orr x1,x1,x15 adcs x14,x14,x14 orr x1,x1,x16 adcs x15,x15,x15 adcs x16,x16,x16 adc x17,xzr,xzr subs x11,x11,x5 sbcs x12,x12,x6 sbcs x13,x13,x7 sbcs x14,x14,x8 sbcs x15,x15,x9 sbcs x16,x16,x10 sbc x17,x17,xzr mvn x17,x17 and x17,x17,#2 orr x0,x0,x17 cmp x3,#0 csel x3,x0,x2,eq // a->re==0? prty(a->im) : prty(a->re) cmp x1,#0 csel x1,x0,x2,ne // a->im!=0? 
					// sgn0(a->im) : sgn0(a->re)
	and	x3,x3,#1
	and	x1,x1,#2
	orr	x0,x1,x3		// pack sign and parity

	ldp	x19,x20,[x29,#2*__SIZEOF_POINTER__]
	ldp	x21,x22,[x29,#4*__SIZEOF_POINTER__]
	ldp	x23,x24,[x29,#6*__SIZEOF_POINTER__]
	ldp	x25,x26,[x29,#8*__SIZEOF_POINTER__]
	ldp	x27,x28,[x29,#10*__SIZEOF_POINTER__]
	ldr	x29,[sp],#16*__SIZEOF_POINTER__
	hint	#29
	ret

================================================
FILE: build/mach-o/mulq_mont_256-x86_64.s
================================================
.comm	___blst_platform_cap,4
.text

.globl	_mul_mont_sparse_256
.private_extern	_mul_mont_sparse_256
.p2align	5
_mul_mont_sparse_256:
.cfi_startproc
	.byte	0xf3,0x0f,0x1e,0xfa
#ifdef	__BLST_PORTABLE__
	testl	$1,___blst_platform_cap(%rip)
	jnz	L$mul_mont_sparse_256$1
#endif
	pushq	%rbp
.cfi_adjust_cfa_offset	8
.cfi_offset	%rbp,-16
	pushq	%rbx
.cfi_adjust_cfa_offset	8
.cfi_offset	%rbx,-24
	pushq	%r12
.cfi_adjust_cfa_offset	8
.cfi_offset	%r12,-32
	pushq	%r13
.cfi_adjust_cfa_offset	8
.cfi_offset	%r13,-40
	pushq	%r14
.cfi_adjust_cfa_offset	8
.cfi_offset	%r14,-48
	pushq	%r15
.cfi_adjust_cfa_offset	8
.cfi_offset	%r15,-56
	pushq	%rdi
.cfi_adjust_cfa_offset	8
	movq	0(%rdx),%rax
	movq	0(%rsi),%r13
	movq	8(%rsi),%r14
	movq	16(%rsi),%r12
	movq	24(%rsi),%rbp
	movq	%rdx,%rbx
	movq	%rax,%r15
	mulq	%r13
	movq	%rax,%r9
	movq	%r15,%rax
	movq	%rdx,%r10
	call	__mulq_mont_sparse_256
	movq	8(%rsp),%r15
.cfi_restore	%r15
	movq	16(%rsp),%r14
.cfi_restore	%r14
	movq	24(%rsp),%r13
.cfi_restore	%r13
	movq	32(%rsp),%r12
.cfi_restore	%r12
	movq	40(%rsp),%rbx
.cfi_restore	%rbx
	movq	48(%rsp),%rbp
.cfi_restore	%rbp
	leaq	56(%rsp),%rsp
.cfi_adjust_cfa_offset	-56
#ifdef	__SGX_LVI_HARDENING__
	popq	%rdx
	lfence
	jmpq	*%rdx
	ud2
#else
	.byte	0xf3,0xc3
#endif
.cfi_endproc

.globl	_sqr_mont_sparse_256
.private_extern	_sqr_mont_sparse_256
.p2align	5
_sqr_mont_sparse_256:
.cfi_startproc
	.byte	0xf3,0x0f,0x1e,0xfa
#ifdef	__BLST_PORTABLE__
	testl	$1,___blst_platform_cap(%rip)
	jnz	L$sqr_mont_sparse_256$1
#endif
	pushq	%rbp
.cfi_adjust_cfa_offset	8
.cfi_offset	%rbp,-16
	pushq	%rbx
.cfi_adjust_cfa_offset	8
.cfi_offset	%rbx,-24
	pushq	%r12
.cfi_adjust_cfa_offset	8
.cfi_offset	%r12,-32
	pushq	%r13
.cfi_adjust_cfa_offset	8
.cfi_offset	%r13,-40
	pushq	%r14
.cfi_adjust_cfa_offset	8
.cfi_offset	%r14,-48
	pushq	%r15
.cfi_adjust_cfa_offset	8
.cfi_offset	%r15,-56
	pushq	%rdi
.cfi_adjust_cfa_offset	8
	movq	0(%rsi),%rax
	movq	%rcx,%r8
	movq	8(%rsi),%r14
	movq	%rdx,%rcx
	movq	16(%rsi),%r12
	leaq	(%rsi),%rbx
	movq	24(%rsi),%rbp
	movq	%rax,%r15
	mulq	%rax
	movq	%rax,%r9
	movq	%r15,%rax
	movq	%rdx,%r10
	call	__mulq_mont_sparse_256
	movq	8(%rsp),%r15
.cfi_restore	%r15
	movq	16(%rsp),%r14
.cfi_restore	%r14
	movq	24(%rsp),%r13
.cfi_restore	%r13
	movq	32(%rsp),%r12
.cfi_restore	%r12
	movq	40(%rsp),%rbx
.cfi_restore	%rbx
	movq	48(%rsp),%rbp
.cfi_restore	%rbp
	leaq	56(%rsp),%rsp
.cfi_adjust_cfa_offset	-56
#ifdef	__SGX_LVI_HARDENING__
	popq	%rdx
	lfence
	jmpq	*%rdx
	ud2
#else
	.byte	0xf3,0xc3
#endif
.cfi_endproc

.p2align	5
__mulq_mont_sparse_256:
.cfi_startproc
	.byte	0xf3,0x0f,0x1e,0xfa
	mulq	%r14
	addq	%rax,%r10
	movq	%r15,%rax
	adcq	$0,%rdx
	movq	%rdx,%r11
	mulq	%r12
	addq	%rax,%r11
	movq	%r15,%rax
	adcq	$0,%rdx
	movq	%rdx,%r12
	mulq	%rbp
	addq	%rax,%r12
	movq	8(%rbx),%rax
	adcq	$0,%rdx
	xorq	%r14,%r14
	movq	%rdx,%r13
	movq	%r9,%rdi
	imulq	%r8,%r9
	movq	%rax,%r15
	mulq	0(%rsi)
	addq	%rax,%r10
	movq	%r15,%rax
	adcq	$0,%rdx
	movq	%rdx,%rbp
	mulq	8(%rsi)
	addq	%rax,%r11
	movq	%r15,%rax
	adcq	$0,%rdx
	addq	%rbp,%r11
	adcq	$0,%rdx
	movq	%rdx,%rbp
	mulq	16(%rsi)
	addq	%rax,%r12
	movq	%r15,%rax
	adcq	$0,%rdx
	addq	%rbp,%r12
	adcq	$0,%rdx
	movq	%rdx,%rbp
	mulq	24(%rsi)
	addq	%rax,%r13
	movq	%r9,%rax
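/* %r9 holds m = t[0]*n0 (from the imulq above); the mulq 0(%rcx)..24(%rcx)
   chain that follows folds m*modulus into the accumulator so that its low
   word cancels */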
adcq $0,%rdx addq %rbp,%r13 adcq %rdx,%r14 xorq %r15,%r15 mulq 0(%rcx) addq %rax,%rdi movq %r9,%rax adcq %rdx,%rdi mulq 8(%rcx) addq %rax,%r10 movq %r9,%rax adcq $0,%rdx addq %rdi,%r10 adcq $0,%rdx movq %rdx,%rbp mulq 16(%rcx) addq %rax,%r11 movq %r9,%rax adcq $0,%rdx addq %rbp,%r11 adcq $0,%rdx movq %rdx,%rbp mulq 24(%rcx) addq %rax,%r12 movq 16(%rbx),%rax adcq $0,%rdx addq %rbp,%r12 adcq $0,%rdx addq %rdx,%r13 adcq $0,%r14 adcq $0,%r15 movq %r10,%rdi imulq %r8,%r10 movq %rax,%r9 mulq 0(%rsi) addq %rax,%r11 movq %r9,%rax adcq $0,%rdx movq %rdx,%rbp mulq 8(%rsi) addq %rax,%r12 movq %r9,%rax adcq $0,%rdx addq %rbp,%r12 adcq $0,%rdx movq %rdx,%rbp mulq 16(%rsi) addq %rax,%r13 movq %r9,%rax adcq $0,%rdx addq %rbp,%r13 adcq $0,%rdx movq %rdx,%rbp mulq 24(%rsi) addq %rax,%r14 movq %r10,%rax adcq $0,%rdx addq %rbp,%r14 adcq %rdx,%r15 xorq %r9,%r9 mulq 0(%rcx) addq %rax,%rdi movq %r10,%rax adcq %rdx,%rdi mulq 8(%rcx) addq %rax,%r11 movq %r10,%rax adcq $0,%rdx addq %rdi,%r11 adcq $0,%rdx movq %rdx,%rbp mulq 16(%rcx) addq %rax,%r12 movq %r10,%rax adcq $0,%rdx addq %rbp,%r12 adcq $0,%rdx movq %rdx,%rbp mulq 24(%rcx) addq %rax,%r13 movq 24(%rbx),%rax adcq $0,%rdx addq %rbp,%r13 adcq $0,%rdx addq %rdx,%r14 adcq $0,%r15 adcq $0,%r9 movq %r11,%rdi imulq %r8,%r11 movq %rax,%r10 mulq 0(%rsi) addq %rax,%r12 movq %r10,%rax adcq $0,%rdx movq %rdx,%rbp mulq 8(%rsi) addq %rax,%r13 movq %r10,%rax adcq $0,%rdx addq %rbp,%r13 adcq $0,%rdx movq %rdx,%rbp mulq 16(%rsi) addq %rax,%r14 movq %r10,%rax adcq $0,%rdx addq %rbp,%r14 adcq $0,%rdx movq %rdx,%rbp mulq 24(%rsi) addq %rax,%r15 movq %r11,%rax adcq $0,%rdx addq %rbp,%r15 adcq %rdx,%r9 xorq %r10,%r10 mulq 0(%rcx) addq %rax,%rdi movq %r11,%rax adcq %rdx,%rdi mulq 8(%rcx) addq %rax,%r12 movq %r11,%rax adcq $0,%rdx addq %rdi,%r12 adcq $0,%rdx movq %rdx,%rbp mulq 16(%rcx) addq %rax,%r13 movq %r11,%rax adcq $0,%rdx addq %rbp,%r13 adcq $0,%rdx movq %rdx,%rbp mulq 24(%rcx) addq %rax,%r14 movq %r12,%rax adcq $0,%rdx addq %rbp,%r14 adcq $0,%rdx addq %rdx,%r15 adcq $0,%r9 adcq $0,%r10 imulq %r8,%rax movq 8(%rsp),%rsi movq %rax,%r11 mulq 0(%rcx) addq %rax,%r12 movq %r11,%rax adcq %rdx,%r12 mulq 8(%rcx) addq %rax,%r13 movq %r11,%rax adcq $0,%rdx addq %r12,%r13 adcq $0,%rdx movq %rdx,%rbp mulq 16(%rcx) addq %rax,%r14 movq %r11,%rax adcq $0,%rdx addq %rbp,%r14 adcq $0,%rdx movq %rdx,%rbp mulq 24(%rcx) movq %r14,%rbx addq %rbp,%r15 adcq $0,%rdx addq %rax,%r15 movq %r13,%rax adcq $0,%rdx addq %rdx,%r9 adcq $0,%r10 movq %r15,%r12 subq 0(%rcx),%r13 sbbq 8(%rcx),%r14 sbbq 16(%rcx),%r15 movq %r9,%rbp sbbq 24(%rcx),%r9 sbbq $0,%r10 cmovcq %rax,%r13 cmovcq %rbx,%r14 cmovcq %r12,%r15 movq %r13,0(%rsi) cmovcq %rbp,%r9 movq %r14,8(%rsi) movq %r15,16(%rsi) movq %r9,24(%rsi) #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .globl _from_mont_256 .private_extern _from_mont_256 .p2align 5 _from_mont_256: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa #ifdef __BLST_PORTABLE__ testl $1,___blst_platform_cap(%rip) jnz L$from_mont_256$1 #endif pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 pushq %rbx .cfi_adjust_cfa_offset 8 .cfi_offset %rbx,-24 pushq %r12 .cfi_adjust_cfa_offset 8 .cfi_offset %r12,-32 pushq %r13 .cfi_adjust_cfa_offset 8 .cfi_offset %r13,-40 pushq %r14 .cfi_adjust_cfa_offset 8 .cfi_offset %r14,-48 pushq %r15 .cfi_adjust_cfa_offset 8 .cfi_offset %r15,-56 subq $8,%rsp .cfi_adjust_cfa_offset 8 movq %rdx,%rbx call __mulq_by_1_mont_256 movq %r14,%r10 movq %r15,%r11 movq %r9,%r12 subq 0(%rbx),%r13 sbbq 8(%rbx),%r14 sbbq 
16(%rbx),%r15 sbbq 24(%rbx),%r9 cmovncq %r13,%rax cmovncq %r14,%r10 cmovncq %r15,%r11 movq %rax,0(%rdi) cmovncq %r9,%r12 movq %r10,8(%rdi) movq %r11,16(%rdi) movq %r12,24(%rdi) movq 8(%rsp),%r15 .cfi_restore %r15 movq 16(%rsp),%r14 .cfi_restore %r14 movq 24(%rsp),%r13 .cfi_restore %r13 movq 32(%rsp),%r12 .cfi_restore %r12 movq 40(%rsp),%rbx .cfi_restore %rbx movq 48(%rsp),%rbp .cfi_restore %rbp leaq 56(%rsp),%rsp .cfi_adjust_cfa_offset -56 #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .globl _redc_mont_256 .private_extern _redc_mont_256 .p2align 5 _redc_mont_256: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa #ifdef __BLST_PORTABLE__ testl $1,___blst_platform_cap(%rip) jnz L$redc_mont_256$1 #endif pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 pushq %rbx .cfi_adjust_cfa_offset 8 .cfi_offset %rbx,-24 pushq %r12 .cfi_adjust_cfa_offset 8 .cfi_offset %r12,-32 pushq %r13 .cfi_adjust_cfa_offset 8 .cfi_offset %r13,-40 pushq %r14 .cfi_adjust_cfa_offset 8 .cfi_offset %r14,-48 pushq %r15 .cfi_adjust_cfa_offset 8 .cfi_offset %r15,-56 subq $8,%rsp .cfi_adjust_cfa_offset 8 movq %rdx,%rbx call __mulq_by_1_mont_256 addq 32(%rsi),%r13 adcq 40(%rsi),%r14 movq %r13,%rax adcq 48(%rsi),%r15 movq %r14,%r10 adcq 56(%rsi),%r9 sbbq %rsi,%rsi movq %r15,%r11 subq 0(%rbx),%r13 sbbq 8(%rbx),%r14 sbbq 16(%rbx),%r15 movq %r9,%r12 sbbq 24(%rbx),%r9 sbbq $0,%rsi cmovncq %r13,%rax cmovncq %r14,%r10 cmovncq %r15,%r11 movq %rax,0(%rdi) cmovncq %r9,%r12 movq %r10,8(%rdi) movq %r11,16(%rdi) movq %r12,24(%rdi) movq 8(%rsp),%r15 .cfi_restore %r15 movq 16(%rsp),%r14 .cfi_restore %r14 movq 24(%rsp),%r13 .cfi_restore %r13 movq 32(%rsp),%r12 .cfi_restore %r12 movq 40(%rsp),%rbx .cfi_restore %rbx movq 48(%rsp),%rbp .cfi_restore %rbp leaq 56(%rsp),%rsp .cfi_adjust_cfa_offset -56 #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .p2align 5 __mulq_by_1_mont_256: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa movq 0(%rsi),%rax movq 8(%rsi),%r10 movq 16(%rsi),%r11 movq 24(%rsi),%r12 movq %rax,%r13 imulq %rcx,%rax movq %rax,%r9 mulq 0(%rbx) addq %rax,%r13 movq %r9,%rax adcq %rdx,%r13 mulq 8(%rbx) addq %rax,%r10 movq %r9,%rax adcq $0,%rdx addq %r13,%r10 adcq $0,%rdx movq %rdx,%r13 mulq 16(%rbx) movq %r10,%r14 imulq %rcx,%r10 addq %rax,%r11 movq %r9,%rax adcq $0,%rdx addq %r13,%r11 adcq $0,%rdx movq %rdx,%r13 mulq 24(%rbx) addq %rax,%r12 movq %r10,%rax adcq $0,%rdx addq %r13,%r12 adcq $0,%rdx movq %rdx,%r13 mulq 0(%rbx) addq %rax,%r14 movq %r10,%rax adcq %rdx,%r14 mulq 8(%rbx) addq %rax,%r11 movq %r10,%rax adcq $0,%rdx addq %r14,%r11 adcq $0,%rdx movq %rdx,%r14 mulq 16(%rbx) movq %r11,%r15 imulq %rcx,%r11 addq %rax,%r12 movq %r10,%rax adcq $0,%rdx addq %r14,%r12 adcq $0,%rdx movq %rdx,%r14 mulq 24(%rbx) addq %rax,%r13 movq %r11,%rax adcq $0,%rdx addq %r14,%r13 adcq $0,%rdx movq %rdx,%r14 mulq 0(%rbx) addq %rax,%r15 movq %r11,%rax adcq %rdx,%r15 mulq 8(%rbx) addq %rax,%r12 movq %r11,%rax adcq $0,%rdx addq %r15,%r12 adcq $0,%rdx movq %rdx,%r15 mulq 16(%rbx) movq %r12,%r9 imulq %rcx,%r12 addq %rax,%r13 movq %r11,%rax adcq $0,%rdx addq %r15,%r13 adcq $0,%rdx movq %rdx,%r15 mulq 24(%rbx) addq %rax,%r14 movq %r12,%rax adcq $0,%rdx addq %r15,%r14 adcq $0,%rdx movq %rdx,%r15 mulq 0(%rbx) addq %rax,%r9 movq %r12,%rax adcq %rdx,%r9 mulq 8(%rbx) addq %rax,%r13 movq %r12,%rax adcq $0,%rdx addq %r9,%r13 adcq $0,%rdx movq %rdx,%r9 mulq 16(%rbx) addq %rax,%r14 movq %r12,%rax adcq $0,%rdx addq %r9,%r14 adcq $0,%rdx movq %rdx,%r9 mulq 24(%rbx) addq 
	%rax,%r15
	movq	%r13,%rax
	adcq	$0,%rdx
	addq	%r9,%r15
	adcq	$0,%rdx
	movq	%rdx,%r9
#ifdef	__SGX_LVI_HARDENING__
	popq	%rdx
	lfence
	jmpq	*%rdx
	ud2
#else
	.byte	0xf3,0xc3
#endif
.cfi_endproc

================================================
FILE: build/mach-o/mulq_mont_384-x86_64.s
================================================
.comm	___blst_platform_cap,4
.text

.p2align	5
__subq_mod_384x384:
.cfi_startproc
	.byte	0xf3,0x0f,0x1e,0xfa
	movq	0(%rsi),%r8
	movq	8(%rsi),%r9
	movq	16(%rsi),%r10
	movq	24(%rsi),%r11
	movq	32(%rsi),%r12
	movq	40(%rsi),%r13
	movq	48(%rsi),%r14
	subq	0(%rdx),%r8
	movq	56(%rsi),%r15
	sbbq	8(%rdx),%r9
	movq	64(%rsi),%rax
	sbbq	16(%rdx),%r10
	movq	72(%rsi),%rbx
	sbbq	24(%rdx),%r11
	movq	80(%rsi),%rbp
	sbbq	32(%rdx),%r12
	movq	88(%rsi),%rsi
	sbbq	40(%rdx),%r13
	movq	%r8,0(%rdi)
	sbbq	48(%rdx),%r14
	movq	0(%rcx),%r8
	movq	%r9,8(%rdi)
	sbbq	56(%rdx),%r15
	movq	8(%rcx),%r9
	movq	%r10,16(%rdi)
	sbbq	64(%rdx),%rax
	movq	16(%rcx),%r10
	movq	%r11,24(%rdi)
	sbbq	72(%rdx),%rbx
	movq	24(%rcx),%r11
	movq	%r12,32(%rdi)
	sbbq	80(%rdx),%rbp
	movq	32(%rcx),%r12
	movq	%r13,40(%rdi)
	sbbq	88(%rdx),%rsi
	movq	40(%rcx),%r13
	sbbq	%rdx,%rdx
	andq	%rdx,%r8
	andq	%rdx,%r9
	andq	%rdx,%r10
	andq	%rdx,%r11
	andq	%rdx,%r12
	andq	%rdx,%r13
	addq	%r8,%r14
	adcq	%r9,%r15
	movq	%r14,48(%rdi)
	adcq	%r10,%rax
	movq	%r15,56(%rdi)
	adcq	%r11,%rbx
	movq	%rax,64(%rdi)
	adcq	%r12,%rbp
	movq	%rbx,72(%rdi)
	adcq	%r13,%rsi
	movq	%rbp,80(%rdi)
	movq	%rsi,88(%rdi)
#ifdef	__SGX_LVI_HARDENING__
	popq	%rdx
	lfence
	jmpq	*%rdx
	ud2
#else
	.byte	0xf3,0xc3
#endif
.cfi_endproc

.p2align	5
__addq_mod_384:
.cfi_startproc
	.byte	0xf3,0x0f,0x1e,0xfa
	movq	0(%rsi),%r8
	movq	8(%rsi),%r9
	movq	16(%rsi),%r10
	movq	24(%rsi),%r11
	movq	32(%rsi),%r12
	movq	40(%rsi),%r13
	addq	0(%rdx),%r8
	adcq	8(%rdx),%r9
	adcq	16(%rdx),%r10
	movq	%r8,%r14
	adcq	24(%rdx),%r11
	movq	%r9,%r15
	adcq	32(%rdx),%r12
	movq	%r10,%rax
	adcq	40(%rdx),%r13
	movq	%r11,%rbx
	sbbq	%rdx,%rdx
	subq	0(%rcx),%r8
	sbbq	8(%rcx),%r9
	movq	%r12,%rbp
	sbbq	16(%rcx),%r10
	sbbq	24(%rcx),%r11
	sbbq	32(%rcx),%r12
	movq	%r13,%rsi
	sbbq	40(%rcx),%r13
	sbbq	$0,%rdx
	cmovcq	%r14,%r8
	cmovcq	%r15,%r9
	cmovcq	%rax,%r10
	movq	%r8,0(%rdi)
	cmovcq	%rbx,%r11
	movq	%r9,8(%rdi)
	cmovcq	%rbp,%r12
	movq	%r10,16(%rdi)
	cmovcq	%rsi,%r13
	movq	%r11,24(%rdi)
	movq	%r12,32(%rdi)
	movq	%r13,40(%rdi)
#ifdef	__SGX_LVI_HARDENING__
	popq	%rdx
	lfence
	jmpq	*%rdx
	ud2
#else
	.byte	0xf3,0xc3
#endif
.cfi_endproc

.p2align	5
__subq_mod_384:
.cfi_startproc
	.byte	0xf3,0x0f,0x1e,0xfa
	movq	0(%rsi),%r8
	movq	8(%rsi),%r9
	movq	16(%rsi),%r10
	movq	24(%rsi),%r11
	movq	32(%rsi),%r12
	movq	40(%rsi),%r13
__subq_mod_384_a_is_loaded:
	subq	0(%rdx),%r8
	movq	0(%rcx),%r14
	sbbq	8(%rdx),%r9
	movq	8(%rcx),%r15
	sbbq	16(%rdx),%r10
	movq	16(%rcx),%rax
	sbbq	24(%rdx),%r11
	movq	24(%rcx),%rbx
	sbbq	32(%rdx),%r12
	movq	32(%rcx),%rbp
	sbbq	40(%rdx),%r13
	movq	40(%rcx),%rsi
	sbbq	%rdx,%rdx
	andq	%rdx,%r14
	andq	%rdx,%r15
	andq	%rdx,%rax
	andq	%rdx,%rbx
	andq	%rdx,%rbp
	andq	%rdx,%rsi
	addq	%r14,%r8
	adcq	%r15,%r9
	movq	%r8,0(%rdi)
	adcq	%rax,%r10
	movq	%r9,8(%rdi)
	adcq	%rbx,%r11
	movq	%r10,16(%rdi)
	adcq	%rbp,%r12
	movq	%r11,24(%rdi)
	adcq	%rsi,%r13
	movq	%r12,32(%rdi)
	movq	%r13,40(%rdi)
#ifdef	__SGX_LVI_HARDENING__
	popq	%rdx
	lfence
	jmpq	*%rdx
	ud2
#else
	.byte	0xf3,0xc3
#endif
.cfi_endproc

.globl	_mul_mont_384x
.private_extern	_mul_mont_384x
.p2align	5
_mul_mont_384x:
.cfi_startproc
	.byte	0xf3,0x0f,0x1e,0xfa
#ifdef	__BLST_PORTABLE__
	testl	$1,___blst_platform_cap(%rip)
	jnz	L$mul_mont_384x$1
#endif
	pushq	%rbp
.cfi_adjust_cfa_offset	8
.cfi_offset	%rbp,-16
	pushq	%rbx
.cfi_adjust_cfa_offset	8
.cfi_offset	%rbx,-24
	pushq	%r12
.cfi_adjust_cfa_offset	8
.cfi_offset	%r12,-32
	pushq
%r13 .cfi_adjust_cfa_offset 8 .cfi_offset %r13,-40 pushq %r14 .cfi_adjust_cfa_offset 8 .cfi_offset %r14,-48 pushq %r15 .cfi_adjust_cfa_offset 8 .cfi_offset %r15,-56 subq $328,%rsp .cfi_adjust_cfa_offset 328 movq %rdx,%rbx movq %rdi,32(%rsp) movq %rsi,24(%rsp) movq %rdx,16(%rsp) movq %rcx,8(%rsp) movq %r8,0(%rsp) leaq 40(%rsp),%rdi call __mulq_384 leaq 48(%rbx),%rbx leaq 48(%rsi),%rsi leaq 40+96(%rsp),%rdi call __mulq_384 movq 8(%rsp),%rcx leaq -48(%rsi),%rdx leaq 40+192+48(%rsp),%rdi call __addq_mod_384 movq 16(%rsp),%rsi leaq 48(%rsi),%rdx leaq -48(%rdi),%rdi call __addq_mod_384 leaq (%rdi),%rbx leaq 48(%rdi),%rsi call __mulq_384 leaq (%rdi),%rsi leaq 40(%rsp),%rdx movq 8(%rsp),%rcx call __subq_mod_384x384 leaq (%rdi),%rsi leaq -96(%rdi),%rdx call __subq_mod_384x384 leaq 40(%rsp),%rsi leaq 40+96(%rsp),%rdx leaq 40(%rsp),%rdi call __subq_mod_384x384 movq %rcx,%rbx leaq 40(%rsp),%rsi movq 0(%rsp),%rcx movq 32(%rsp),%rdi call __mulq_by_1_mont_384 call __redq_tail_mont_384 leaq 40+192(%rsp),%rsi movq 0(%rsp),%rcx leaq 48(%rdi),%rdi call __mulq_by_1_mont_384 call __redq_tail_mont_384 leaq 328(%rsp),%r8 movq 0(%r8),%r15 .cfi_restore %r15 movq 8(%r8),%r14 .cfi_restore %r14 movq 16(%r8),%r13 .cfi_restore %r13 movq 24(%r8),%r12 .cfi_restore %r12 movq 32(%r8),%rbx .cfi_restore %rbx movq 40(%r8),%rbp .cfi_restore %rbp leaq 48(%r8),%rsp .cfi_adjust_cfa_offset -328-8*6 #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .globl _sqr_mont_384x .private_extern _sqr_mont_384x .p2align 5 _sqr_mont_384x: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa #ifdef __BLST_PORTABLE__ testl $1,___blst_platform_cap(%rip) jnz L$sqr_mont_384x$1 #endif pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 pushq %rbx .cfi_adjust_cfa_offset 8 .cfi_offset %rbx,-24 pushq %r12 .cfi_adjust_cfa_offset 8 .cfi_offset %r12,-32 pushq %r13 .cfi_adjust_cfa_offset 8 .cfi_offset %r13,-40 pushq %r14 .cfi_adjust_cfa_offset 8 .cfi_offset %r14,-48 pushq %r15 .cfi_adjust_cfa_offset 8 .cfi_offset %r15,-56 subq $136,%rsp .cfi_adjust_cfa_offset 136 movq %rcx,0(%rsp) movq %rdx,%rcx movq %rdi,8(%rsp) movq %rsi,16(%rsp) leaq 48(%rsi),%rdx leaq 32(%rsp),%rdi call __addq_mod_384 movq 16(%rsp),%rsi leaq 48(%rsi),%rdx leaq 32+48(%rsp),%rdi call __subq_mod_384 movq 16(%rsp),%rsi leaq 48(%rsi),%rbx movq 48(%rsi),%rax movq 0(%rsi),%r14 movq 8(%rsi),%r15 movq 16(%rsi),%r12 movq 24(%rsi),%r13 call __mulq_mont_384 addq %r14,%r14 adcq %r15,%r15 adcq %r8,%r8 movq %r14,%r12 adcq %r9,%r9 movq %r15,%r13 adcq %r10,%r10 movq %r8,%rax adcq %r11,%r11 movq %r9,%rbx sbbq %rdx,%rdx subq 0(%rcx),%r14 sbbq 8(%rcx),%r15 movq %r10,%rbp sbbq 16(%rcx),%r8 sbbq 24(%rcx),%r9 sbbq 32(%rcx),%r10 movq %r11,%rsi sbbq 40(%rcx),%r11 sbbq $0,%rdx cmovcq %r12,%r14 cmovcq %r13,%r15 cmovcq %rax,%r8 movq %r14,48(%rdi) cmovcq %rbx,%r9 movq %r15,56(%rdi) cmovcq %rbp,%r10 movq %r8,64(%rdi) cmovcq %rsi,%r11 movq %r9,72(%rdi) movq %r10,80(%rdi) movq %r11,88(%rdi) leaq 32(%rsp),%rsi leaq 32+48(%rsp),%rbx movq 32+48(%rsp),%rax movq 32+0(%rsp),%r14 movq 32+8(%rsp),%r15 movq 32+16(%rsp),%r12 movq 32+24(%rsp),%r13 call __mulq_mont_384 leaq 136(%rsp),%r8 movq 0(%r8),%r15 .cfi_restore %r15 movq 8(%r8),%r14 .cfi_restore %r14 movq 16(%r8),%r13 .cfi_restore %r13 movq 24(%r8),%r12 .cfi_restore %r12 movq 32(%r8),%rbx .cfi_restore %rbx movq 40(%r8),%rbp .cfi_restore %rbp leaq 48(%r8),%rsp .cfi_adjust_cfa_offset -136-8*6 #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .globl _mul_382x .private_extern 
_mul_382x .p2align 5 _mul_382x: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa #ifdef __BLST_PORTABLE__ testl $1,___blst_platform_cap(%rip) jnz L$mul_382x$1 #endif pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 pushq %rbx .cfi_adjust_cfa_offset 8 .cfi_offset %rbx,-24 pushq %r12 .cfi_adjust_cfa_offset 8 .cfi_offset %r12,-32 pushq %r13 .cfi_adjust_cfa_offset 8 .cfi_offset %r13,-40 pushq %r14 .cfi_adjust_cfa_offset 8 .cfi_offset %r14,-48 pushq %r15 .cfi_adjust_cfa_offset 8 .cfi_offset %r15,-56 subq $136,%rsp .cfi_adjust_cfa_offset 136 leaq 96(%rdi),%rdi movq %rsi,0(%rsp) movq %rdx,8(%rsp) movq %rdi,16(%rsp) movq %rcx,24(%rsp) movq 0(%rsi),%r8 movq 8(%rsi),%r9 movq 16(%rsi),%r10 movq 24(%rsi),%r11 movq 32(%rsi),%r12 movq 40(%rsi),%r13 addq 48(%rsi),%r8 adcq 56(%rsi),%r9 adcq 64(%rsi),%r10 adcq 72(%rsi),%r11 adcq 80(%rsi),%r12 adcq 88(%rsi),%r13 movq %r8,32+0(%rsp) movq %r9,32+8(%rsp) movq %r10,32+16(%rsp) movq %r11,32+24(%rsp) movq %r12,32+32(%rsp) movq %r13,32+40(%rsp) movq 0(%rdx),%r8 movq 8(%rdx),%r9 movq 16(%rdx),%r10 movq 24(%rdx),%r11 movq 32(%rdx),%r12 movq 40(%rdx),%r13 addq 48(%rdx),%r8 adcq 56(%rdx),%r9 adcq 64(%rdx),%r10 adcq 72(%rdx),%r11 adcq 80(%rdx),%r12 adcq 88(%rdx),%r13 movq %r8,32+48(%rsp) movq %r9,32+56(%rsp) movq %r10,32+64(%rsp) movq %r11,32+72(%rsp) movq %r12,32+80(%rsp) movq %r13,32+88(%rsp) leaq 32+0(%rsp),%rsi leaq 32+48(%rsp),%rbx call __mulq_384 movq 0(%rsp),%rsi movq 8(%rsp),%rbx leaq -96(%rdi),%rdi call __mulq_384 leaq 48(%rsi),%rsi leaq 48(%rbx),%rbx leaq 32(%rsp),%rdi call __mulq_384 movq 16(%rsp),%rsi leaq 32(%rsp),%rdx movq 24(%rsp),%rcx movq %rsi,%rdi call __subq_mod_384x384 leaq 0(%rdi),%rsi leaq -96(%rdi),%rdx call __subq_mod_384x384 leaq -96(%rdi),%rsi leaq 32(%rsp),%rdx leaq -96(%rdi),%rdi call __subq_mod_384x384 leaq 136(%rsp),%r8 movq 0(%r8),%r15 .cfi_restore %r15 movq 8(%r8),%r14 .cfi_restore %r14 movq 16(%r8),%r13 .cfi_restore %r13 movq 24(%r8),%r12 .cfi_restore %r12 movq 32(%r8),%rbx .cfi_restore %rbx movq 40(%r8),%rbp .cfi_restore %rbp leaq 48(%r8),%rsp .cfi_adjust_cfa_offset -136-8*6 #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .globl _sqr_382x .private_extern _sqr_382x .p2align 5 _sqr_382x: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa #ifdef __BLST_PORTABLE__ testl $1,___blst_platform_cap(%rip) jnz L$sqr_382x$1 #endif pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 pushq %rbx .cfi_adjust_cfa_offset 8 .cfi_offset %rbx,-24 pushq %r12 .cfi_adjust_cfa_offset 8 .cfi_offset %r12,-32 pushq %r13 .cfi_adjust_cfa_offset 8 .cfi_offset %r13,-40 pushq %r14 .cfi_adjust_cfa_offset 8 .cfi_offset %r14,-48 pushq %r15 .cfi_adjust_cfa_offset 8 .cfi_offset %r15,-56 pushq %rsi .cfi_adjust_cfa_offset 8 movq %rdx,%rcx movq 0(%rsi),%r14 movq 8(%rsi),%r15 movq 16(%rsi),%rax movq 24(%rsi),%rbx movq 32(%rsi),%rbp movq 40(%rsi),%rdx movq %r14,%r8 addq 48(%rsi),%r14 movq %r15,%r9 adcq 56(%rsi),%r15 movq %rax,%r10 adcq 64(%rsi),%rax movq %rbx,%r11 adcq 72(%rsi),%rbx movq %rbp,%r12 adcq 80(%rsi),%rbp movq %rdx,%r13 adcq 88(%rsi),%rdx movq %r14,0(%rdi) movq %r15,8(%rdi) movq %rax,16(%rdi) movq %rbx,24(%rdi) movq %rbp,32(%rdi) movq %rdx,40(%rdi) leaq 48(%rsi),%rdx leaq 48(%rdi),%rdi call __subq_mod_384_a_is_loaded leaq (%rdi),%rsi leaq -48(%rdi),%rbx leaq -48(%rdi),%rdi call __mulq_384 movq (%rsp),%rsi leaq 48(%rsi),%rbx leaq 96(%rdi),%rdi call __mulq_384 movq 0(%rdi),%r8 movq 8(%rdi),%r9 movq 16(%rdi),%r10 movq 24(%rdi),%r11 movq 32(%rdi),%r12 movq 40(%rdi),%r13 movq 48(%rdi),%r14 movq 56(%rdi),%r15 movq 
64(%rdi),%rax movq 72(%rdi),%rbx movq 80(%rdi),%rbp addq %r8,%r8 movq 88(%rdi),%rdx adcq %r9,%r9 movq %r8,0(%rdi) adcq %r10,%r10 movq %r9,8(%rdi) adcq %r11,%r11 movq %r10,16(%rdi) adcq %r12,%r12 movq %r11,24(%rdi) adcq %r13,%r13 movq %r12,32(%rdi) adcq %r14,%r14 movq %r13,40(%rdi) adcq %r15,%r15 movq %r14,48(%rdi) adcq %rax,%rax movq %r15,56(%rdi) adcq %rbx,%rbx movq %rax,64(%rdi) adcq %rbp,%rbp movq %rbx,72(%rdi) adcq %rdx,%rdx movq %rbp,80(%rdi) movq %rdx,88(%rdi) movq 8(%rsp),%r15 .cfi_restore %r15 movq 16(%rsp),%r14 .cfi_restore %r14 movq 24(%rsp),%r13 .cfi_restore %r13 movq 32(%rsp),%r12 .cfi_restore %r12 movq 40(%rsp),%rbx .cfi_restore %rbx movq 48(%rsp),%rbp .cfi_restore %rbp leaq 56(%rsp),%rsp .cfi_adjust_cfa_offset -8*7 #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .globl _mul_384 .private_extern _mul_384 .p2align 5 _mul_384: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa #ifdef __BLST_PORTABLE__ testl $1,___blst_platform_cap(%rip) jnz L$mul_384$1 #endif pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 pushq %rbx .cfi_adjust_cfa_offset 8 .cfi_offset %rbx,-24 pushq %r12 .cfi_adjust_cfa_offset 8 .cfi_offset %r12,-32 movq %rdx,%rbx call __mulq_384 movq 0(%rsp),%r12 .cfi_restore %r12 movq 8(%rsp),%rbx .cfi_restore %rbx movq 16(%rsp),%rbp .cfi_restore %rbp leaq 24(%rsp),%rsp .cfi_adjust_cfa_offset -24 #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .p2align 5 __mulq_384: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa movq 0(%rbx),%rax movq %rax,%rbp mulq 0(%rsi) movq %rax,0(%rdi) movq %rbp,%rax movq %rdx,%rcx mulq 8(%rsi) addq %rax,%rcx movq %rbp,%rax adcq $0,%rdx movq %rdx,%r8 mulq 16(%rsi) addq %rax,%r8 movq %rbp,%rax adcq $0,%rdx movq %rdx,%r9 mulq 24(%rsi) addq %rax,%r9 movq %rbp,%rax adcq $0,%rdx movq %rdx,%r10 mulq 32(%rsi) addq %rax,%r10 movq %rbp,%rax adcq $0,%rdx movq %rdx,%r11 mulq 40(%rsi) addq %rax,%r11 movq 8(%rbx),%rax adcq $0,%rdx movq %rdx,%r12 movq %rax,%rbp mulq 0(%rsi) addq %rax,%rcx movq %rbp,%rax adcq $0,%rdx movq %rcx,8(%rdi) movq %rdx,%rcx mulq 8(%rsi) addq %rax,%r8 movq %rbp,%rax adcq $0,%rdx addq %r8,%rcx adcq $0,%rdx movq %rdx,%r8 mulq 16(%rsi) addq %rax,%r9 movq %rbp,%rax adcq $0,%rdx addq %r9,%r8 adcq $0,%rdx movq %rdx,%r9 mulq 24(%rsi) addq %rax,%r10 movq %rbp,%rax adcq $0,%rdx addq %r10,%r9 adcq $0,%rdx movq %rdx,%r10 mulq 32(%rsi) addq %rax,%r11 movq %rbp,%rax adcq $0,%rdx addq %r11,%r10 adcq $0,%rdx movq %rdx,%r11 mulq 40(%rsi) addq %rax,%r12 movq 16(%rbx),%rax adcq $0,%rdx addq %r12,%r11 adcq $0,%rdx movq %rdx,%r12 movq %rax,%rbp mulq 0(%rsi) addq %rax,%rcx movq %rbp,%rax adcq $0,%rdx movq %rcx,16(%rdi) movq %rdx,%rcx mulq 8(%rsi) addq %rax,%r8 movq %rbp,%rax adcq $0,%rdx addq %r8,%rcx adcq $0,%rdx movq %rdx,%r8 mulq 16(%rsi) addq %rax,%r9 movq %rbp,%rax adcq $0,%rdx addq %r9,%r8 adcq $0,%rdx movq %rdx,%r9 mulq 24(%rsi) addq %rax,%r10 movq %rbp,%rax adcq $0,%rdx addq %r10,%r9 adcq $0,%rdx movq %rdx,%r10 mulq 32(%rsi) addq %rax,%r11 movq %rbp,%rax adcq $0,%rdx addq %r11,%r10 adcq $0,%rdx movq %rdx,%r11 mulq 40(%rsi) addq %rax,%r12 movq 24(%rbx),%rax adcq $0,%rdx addq %r12,%r11 adcq $0,%rdx movq %rdx,%r12 movq %rax,%rbp mulq 0(%rsi) addq %rax,%rcx movq %rbp,%rax adcq $0,%rdx movq %rcx,24(%rdi) movq %rdx,%rcx mulq 8(%rsi) addq %rax,%r8 movq %rbp,%rax adcq $0,%rdx addq %r8,%rcx adcq $0,%rdx movq %rdx,%r8 mulq 16(%rsi) addq %rax,%r9 movq %rbp,%rax adcq $0,%rdx addq %r9,%r8 adcq $0,%rdx movq %rdx,%r9 mulq 24(%rsi) addq %rax,%r10 movq %rbp,%rax adcq 
$0,%rdx addq %r10,%r9 adcq $0,%rdx movq %rdx,%r10 mulq 32(%rsi) addq %rax,%r11 movq %rbp,%rax adcq $0,%rdx addq %r11,%r10 adcq $0,%rdx movq %rdx,%r11 mulq 40(%rsi) addq %rax,%r12 movq 32(%rbx),%rax adcq $0,%rdx addq %r12,%r11 adcq $0,%rdx movq %rdx,%r12 movq %rax,%rbp mulq 0(%rsi) addq %rax,%rcx movq %rbp,%rax adcq $0,%rdx movq %rcx,32(%rdi) movq %rdx,%rcx mulq 8(%rsi) addq %rax,%r8 movq %rbp,%rax adcq $0,%rdx addq %r8,%rcx adcq $0,%rdx movq %rdx,%r8 mulq 16(%rsi) addq %rax,%r9 movq %rbp,%rax adcq $0,%rdx addq %r9,%r8 adcq $0,%rdx movq %rdx,%r9 mulq 24(%rsi) addq %rax,%r10 movq %rbp,%rax adcq $0,%rdx addq %r10,%r9 adcq $0,%rdx movq %rdx,%r10 mulq 32(%rsi) addq %rax,%r11 movq %rbp,%rax adcq $0,%rdx addq %r11,%r10 adcq $0,%rdx movq %rdx,%r11 mulq 40(%rsi) addq %rax,%r12 movq 40(%rbx),%rax adcq $0,%rdx addq %r12,%r11 adcq $0,%rdx movq %rdx,%r12 movq %rax,%rbp mulq 0(%rsi) addq %rax,%rcx movq %rbp,%rax adcq $0,%rdx movq %rcx,40(%rdi) movq %rdx,%rcx mulq 8(%rsi) addq %rax,%r8 movq %rbp,%rax adcq $0,%rdx addq %r8,%rcx adcq $0,%rdx movq %rdx,%r8 mulq 16(%rsi) addq %rax,%r9 movq %rbp,%rax adcq $0,%rdx addq %r9,%r8 adcq $0,%rdx movq %rdx,%r9 mulq 24(%rsi) addq %rax,%r10 movq %rbp,%rax adcq $0,%rdx addq %r10,%r9 adcq $0,%rdx movq %rdx,%r10 mulq 32(%rsi) addq %rax,%r11 movq %rbp,%rax adcq $0,%rdx addq %r11,%r10 adcq $0,%rdx movq %rdx,%r11 mulq 40(%rsi) addq %rax,%r12 movq %rax,%rax adcq $0,%rdx addq %r12,%r11 adcq $0,%rdx movq %rdx,%r12 movq %rcx,48(%rdi) movq %r8,56(%rdi) movq %r9,64(%rdi) movq %r10,72(%rdi) movq %r11,80(%rdi) movq %r12,88(%rdi) #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .globl _sqr_384 .private_extern _sqr_384 .p2align 5 _sqr_384: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa #ifdef __BLST_PORTABLE__ testl $1,___blst_platform_cap(%rip) jnz L$sqr_384$1 #endif pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 pushq %rbx .cfi_adjust_cfa_offset 8 .cfi_offset %rbx,-24 pushq %r12 .cfi_adjust_cfa_offset 8 .cfi_offset %r12,-32 pushq %r13 .cfi_adjust_cfa_offset 8 .cfi_offset %r13,-40 pushq %r14 .cfi_adjust_cfa_offset 8 .cfi_offset %r14,-48 pushq %r15 .cfi_adjust_cfa_offset 8 .cfi_offset %r15,-56 subq $8,%rsp .cfi_adjust_cfa_offset 8 call __sqrq_384 movq 8(%rsp),%r15 .cfi_restore %r15 movq 16(%rsp),%r14 .cfi_restore %r14 movq 24(%rsp),%r13 .cfi_restore %r13 movq 32(%rsp),%r12 .cfi_restore %r12 movq 40(%rsp),%rbx .cfi_restore %rbx movq 48(%rsp),%rbp .cfi_restore %rbp leaq 56(%rsp),%rsp .cfi_adjust_cfa_offset -56 #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .p2align 5 __sqrq_384: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa movq 0(%rsi),%rax movq 8(%rsi),%r15 movq 16(%rsi),%rcx movq 24(%rsi),%rbx movq %rax,%r14 mulq %r15 movq %rax,%r9 movq %r14,%rax movq 32(%rsi),%rbp movq %rdx,%r10 mulq %rcx addq %rax,%r10 movq %r14,%rax adcq $0,%rdx movq 40(%rsi),%rsi movq %rdx,%r11 mulq %rbx addq %rax,%r11 movq %r14,%rax adcq $0,%rdx movq %rdx,%r12 mulq %rbp addq %rax,%r12 movq %r14,%rax adcq $0,%rdx movq %rdx,%r13 mulq %rsi addq %rax,%r13 movq %r14,%rax adcq $0,%rdx movq %rdx,%r14 mulq %rax xorq %r8,%r8 movq %rax,0(%rdi) movq %r15,%rax addq %r9,%r9 adcq $0,%r8 addq %rdx,%r9 adcq $0,%r8 movq %r9,8(%rdi) mulq %rcx addq %rax,%r11 movq %r15,%rax adcq $0,%rdx movq %rdx,%r9 mulq %rbx addq %rax,%r12 movq %r15,%rax adcq $0,%rdx addq %r9,%r12 adcq $0,%rdx movq %rdx,%r9 mulq %rbp addq %rax,%r13 movq %r15,%rax adcq $0,%rdx addq %r9,%r13 adcq $0,%rdx movq %rdx,%r9 mulq %rsi addq %rax,%r14 movq %r15,%rax 
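/* in __sqrq_384 the cross-products involving a[1] are now complete; the
   mulq %rax below squares a[1] itself, which the doubled column sums
   then absorb */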
adcq $0,%rdx addq %r9,%r14 adcq $0,%rdx movq %rdx,%r15 mulq %rax xorq %r9,%r9 addq %rax,%r8 movq %rcx,%rax addq %r10,%r10 adcq %r11,%r11 adcq $0,%r9 addq %r8,%r10 adcq %rdx,%r11 adcq $0,%r9 movq %r10,16(%rdi) mulq %rbx addq %rax,%r13 movq %rcx,%rax adcq $0,%rdx movq %r11,24(%rdi) movq %rdx,%r8 mulq %rbp addq %rax,%r14 movq %rcx,%rax adcq $0,%rdx addq %r8,%r14 adcq $0,%rdx movq %rdx,%r8 mulq %rsi addq %rax,%r15 movq %rcx,%rax adcq $0,%rdx addq %r8,%r15 adcq $0,%rdx movq %rdx,%rcx mulq %rax xorq %r11,%r11 addq %rax,%r9 movq %rbx,%rax addq %r12,%r12 adcq %r13,%r13 adcq $0,%r11 addq %r9,%r12 adcq %rdx,%r13 adcq $0,%r11 movq %r12,32(%rdi) mulq %rbp addq %rax,%r15 movq %rbx,%rax adcq $0,%rdx movq %r13,40(%rdi) movq %rdx,%r8 mulq %rsi addq %rax,%rcx movq %rbx,%rax adcq $0,%rdx addq %r8,%rcx adcq $0,%rdx movq %rdx,%rbx mulq %rax xorq %r12,%r12 addq %rax,%r11 movq %rbp,%rax addq %r14,%r14 adcq %r15,%r15 adcq $0,%r12 addq %r11,%r14 adcq %rdx,%r15 movq %r14,48(%rdi) adcq $0,%r12 movq %r15,56(%rdi) mulq %rsi addq %rax,%rbx movq %rbp,%rax adcq $0,%rdx movq %rdx,%rbp mulq %rax xorq %r13,%r13 addq %rax,%r12 movq %rsi,%rax addq %rcx,%rcx adcq %rbx,%rbx adcq $0,%r13 addq %r12,%rcx adcq %rdx,%rbx movq %rcx,64(%rdi) adcq $0,%r13 movq %rbx,72(%rdi) mulq %rax addq %r13,%rax addq %rbp,%rbp adcq $0,%rdx addq %rbp,%rax adcq $0,%rdx movq %rax,80(%rdi) movq %rdx,88(%rdi) #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .globl _sqr_mont_384 .private_extern _sqr_mont_384 .p2align 5 _sqr_mont_384: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa #ifdef __BLST_PORTABLE__ testl $1,___blst_platform_cap(%rip) jnz L$sqr_mont_384$1 #endif pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 pushq %rbx .cfi_adjust_cfa_offset 8 .cfi_offset %rbx,-24 pushq %r12 .cfi_adjust_cfa_offset 8 .cfi_offset %r12,-32 pushq %r13 .cfi_adjust_cfa_offset 8 .cfi_offset %r13,-40 pushq %r14 .cfi_adjust_cfa_offset 8 .cfi_offset %r14,-48 pushq %r15 .cfi_adjust_cfa_offset 8 .cfi_offset %r15,-56 subq $120,%rsp .cfi_adjust_cfa_offset 8*15 movq %rcx,96(%rsp) movq %rdx,104(%rsp) movq %rdi,112(%rsp) movq %rsp,%rdi call __sqrq_384 leaq 0(%rsp),%rsi movq 96(%rsp),%rcx movq 104(%rsp),%rbx movq 112(%rsp),%rdi call __mulq_by_1_mont_384 call __redq_tail_mont_384 leaq 120(%rsp),%r8 movq 120(%rsp),%r15 .cfi_restore %r15 movq 8(%r8),%r14 .cfi_restore %r14 movq 16(%r8),%r13 .cfi_restore %r13 movq 24(%r8),%r12 .cfi_restore %r12 movq 32(%r8),%rbx .cfi_restore %rbx movq 40(%r8),%rbp .cfi_restore %rbp leaq 48(%r8),%rsp .cfi_adjust_cfa_offset -8*21 #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .globl _redc_mont_384 .private_extern _redc_mont_384 .p2align 5 _redc_mont_384: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa #ifdef __BLST_PORTABLE__ testl $1,___blst_platform_cap(%rip) jnz L$redc_mont_384$1 #endif pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 pushq %rbx .cfi_adjust_cfa_offset 8 .cfi_offset %rbx,-24 pushq %r12 .cfi_adjust_cfa_offset 8 .cfi_offset %r12,-32 pushq %r13 .cfi_adjust_cfa_offset 8 .cfi_offset %r13,-40 pushq %r14 .cfi_adjust_cfa_offset 8 .cfi_offset %r14,-48 pushq %r15 .cfi_adjust_cfa_offset 8 .cfi_offset %r15,-56 subq $8,%rsp .cfi_adjust_cfa_offset 8 movq %rdx,%rbx call __mulq_by_1_mont_384 call __redq_tail_mont_384 movq 8(%rsp),%r15 .cfi_restore %r15 movq 16(%rsp),%r14 .cfi_restore %r14 movq 24(%rsp),%r13 .cfi_restore %r13 movq 32(%rsp),%r12 .cfi_restore %r12 movq 40(%rsp),%rbx .cfi_restore %rbx movq 48(%rsp),%rbp .cfi_restore %rbp leaq 
56(%rsp),%rsp .cfi_adjust_cfa_offset -56 #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .globl _from_mont_384 .private_extern _from_mont_384 .p2align 5 _from_mont_384: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa #ifdef __BLST_PORTABLE__ testl $1,___blst_platform_cap(%rip) jnz L$from_mont_384$1 #endif pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 pushq %rbx .cfi_adjust_cfa_offset 8 .cfi_offset %rbx,-24 pushq %r12 .cfi_adjust_cfa_offset 8 .cfi_offset %r12,-32 pushq %r13 .cfi_adjust_cfa_offset 8 .cfi_offset %r13,-40 pushq %r14 .cfi_adjust_cfa_offset 8 .cfi_offset %r14,-48 pushq %r15 .cfi_adjust_cfa_offset 8 .cfi_offset %r15,-56 subq $8,%rsp .cfi_adjust_cfa_offset 8 movq %rdx,%rbx call __mulq_by_1_mont_384 movq %r15,%rcx movq %r8,%rdx movq %r9,%rbp subq 0(%rbx),%r14 sbbq 8(%rbx),%r15 movq %r10,%r13 sbbq 16(%rbx),%r8 sbbq 24(%rbx),%r9 sbbq 32(%rbx),%r10 movq %r11,%rsi sbbq 40(%rbx),%r11 cmovcq %rax,%r14 cmovcq %rcx,%r15 cmovcq %rdx,%r8 movq %r14,0(%rdi) cmovcq %rbp,%r9 movq %r15,8(%rdi) cmovcq %r13,%r10 movq %r8,16(%rdi) cmovcq %rsi,%r11 movq %r9,24(%rdi) movq %r10,32(%rdi) movq %r11,40(%rdi) movq 8(%rsp),%r15 .cfi_restore %r15 movq 16(%rsp),%r14 .cfi_restore %r14 movq 24(%rsp),%r13 .cfi_restore %r13 movq 32(%rsp),%r12 .cfi_restore %r12 movq 40(%rsp),%rbx .cfi_restore %rbx movq 48(%rsp),%rbp .cfi_restore %rbp leaq 56(%rsp),%rsp .cfi_adjust_cfa_offset -56 #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .p2align 5 __mulq_by_1_mont_384: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa movq 0(%rsi),%rax movq 8(%rsi),%r9 movq 16(%rsi),%r10 movq 24(%rsi),%r11 movq 32(%rsi),%r12 movq 40(%rsi),%r13 movq %rax,%r14 imulq %rcx,%rax movq %rax,%r8 mulq 0(%rbx) addq %rax,%r14 movq %r8,%rax adcq %rdx,%r14 mulq 8(%rbx) addq %rax,%r9 movq %r8,%rax adcq $0,%rdx addq %r14,%r9 adcq $0,%rdx movq %rdx,%r14 mulq 16(%rbx) addq %rax,%r10 movq %r8,%rax adcq $0,%rdx addq %r14,%r10 adcq $0,%rdx movq %rdx,%r14 mulq 24(%rbx) addq %rax,%r11 movq %r8,%rax adcq $0,%rdx movq %r9,%r15 imulq %rcx,%r9 addq %r14,%r11 adcq $0,%rdx movq %rdx,%r14 mulq 32(%rbx) addq %rax,%r12 movq %r8,%rax adcq $0,%rdx addq %r14,%r12 adcq $0,%rdx movq %rdx,%r14 mulq 40(%rbx) addq %rax,%r13 movq %r9,%rax adcq $0,%rdx addq %r14,%r13 adcq $0,%rdx movq %rdx,%r14 mulq 0(%rbx) addq %rax,%r15 movq %r9,%rax adcq %rdx,%r15 mulq 8(%rbx) addq %rax,%r10 movq %r9,%rax adcq $0,%rdx addq %r15,%r10 adcq $0,%rdx movq %rdx,%r15 mulq 16(%rbx) addq %rax,%r11 movq %r9,%rax adcq $0,%rdx addq %r15,%r11 adcq $0,%rdx movq %rdx,%r15 mulq 24(%rbx) addq %rax,%r12 movq %r9,%rax adcq $0,%rdx movq %r10,%r8 imulq %rcx,%r10 addq %r15,%r12 adcq $0,%rdx movq %rdx,%r15 mulq 32(%rbx) addq %rax,%r13 movq %r9,%rax adcq $0,%rdx addq %r15,%r13 adcq $0,%rdx movq %rdx,%r15 mulq 40(%rbx) addq %rax,%r14 movq %r10,%rax adcq $0,%rdx addq %r15,%r14 adcq $0,%rdx movq %rdx,%r15 mulq 0(%rbx) addq %rax,%r8 movq %r10,%rax adcq %rdx,%r8 mulq 8(%rbx) addq %rax,%r11 movq %r10,%rax adcq $0,%rdx addq %r8,%r11 adcq $0,%rdx movq %rdx,%r8 mulq 16(%rbx) addq %rax,%r12 movq %r10,%rax adcq $0,%rdx addq %r8,%r12 adcq $0,%rdx movq %rdx,%r8 mulq 24(%rbx) addq %rax,%r13 movq %r10,%rax adcq $0,%rdx movq %r11,%r9 imulq %rcx,%r11 addq %r8,%r13 adcq $0,%rdx movq %rdx,%r8 mulq 32(%rbx) addq %rax,%r14 movq %r10,%rax adcq $0,%rdx addq %r8,%r14 adcq $0,%rdx movq %rdx,%r8 mulq 40(%rbx) addq %rax,%r15 movq %r11,%rax adcq $0,%rdx addq %r8,%r15 adcq $0,%rdx movq %rdx,%r8 mulq 0(%rbx) addq %rax,%r9 movq %r11,%rax adcq 
%rdx,%r9 mulq 8(%rbx) addq %rax,%r12 movq %r11,%rax adcq $0,%rdx addq %r9,%r12 adcq $0,%rdx movq %rdx,%r9 mulq 16(%rbx) addq %rax,%r13 movq %r11,%rax adcq $0,%rdx addq %r9,%r13 adcq $0,%rdx movq %rdx,%r9 mulq 24(%rbx) addq %rax,%r14 movq %r11,%rax adcq $0,%rdx movq %r12,%r10 imulq %rcx,%r12 addq %r9,%r14 adcq $0,%rdx movq %rdx,%r9 mulq 32(%rbx) addq %rax,%r15 movq %r11,%rax adcq $0,%rdx addq %r9,%r15 adcq $0,%rdx movq %rdx,%r9 mulq 40(%rbx) addq %rax,%r8 movq %r12,%rax adcq $0,%rdx addq %r9,%r8 adcq $0,%rdx movq %rdx,%r9 mulq 0(%rbx) addq %rax,%r10 movq %r12,%rax adcq %rdx,%r10 mulq 8(%rbx) addq %rax,%r13 movq %r12,%rax adcq $0,%rdx addq %r10,%r13 adcq $0,%rdx movq %rdx,%r10 mulq 16(%rbx) addq %rax,%r14 movq %r12,%rax adcq $0,%rdx addq %r10,%r14 adcq $0,%rdx movq %rdx,%r10 mulq 24(%rbx) addq %rax,%r15 movq %r12,%rax adcq $0,%rdx movq %r13,%r11 imulq %rcx,%r13 addq %r10,%r15 adcq $0,%rdx movq %rdx,%r10 mulq 32(%rbx) addq %rax,%r8 movq %r12,%rax adcq $0,%rdx addq %r10,%r8 adcq $0,%rdx movq %rdx,%r10 mulq 40(%rbx) addq %rax,%r9 movq %r13,%rax adcq $0,%rdx addq %r10,%r9 adcq $0,%rdx movq %rdx,%r10 mulq 0(%rbx) addq %rax,%r11 movq %r13,%rax adcq %rdx,%r11 mulq 8(%rbx) addq %rax,%r14 movq %r13,%rax adcq $0,%rdx addq %r11,%r14 adcq $0,%rdx movq %rdx,%r11 mulq 16(%rbx) addq %rax,%r15 movq %r13,%rax adcq $0,%rdx addq %r11,%r15 adcq $0,%rdx movq %rdx,%r11 mulq 24(%rbx) addq %rax,%r8 movq %r13,%rax adcq $0,%rdx addq %r11,%r8 adcq $0,%rdx movq %rdx,%r11 mulq 32(%rbx) addq %rax,%r9 movq %r13,%rax adcq $0,%rdx addq %r11,%r9 adcq $0,%rdx movq %rdx,%r11 mulq 40(%rbx) addq %rax,%r10 movq %r14,%rax adcq $0,%rdx addq %r11,%r10 adcq $0,%rdx movq %rdx,%r11 #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .p2align 5 __redq_tail_mont_384: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa addq 48(%rsi),%r14 movq %r14,%rax adcq 56(%rsi),%r15 adcq 64(%rsi),%r8 adcq 72(%rsi),%r9 movq %r15,%rcx adcq 80(%rsi),%r10 adcq 88(%rsi),%r11 sbbq %r12,%r12 movq %r8,%rdx movq %r9,%rbp subq 0(%rbx),%r14 sbbq 8(%rbx),%r15 movq %r10,%r13 sbbq 16(%rbx),%r8 sbbq 24(%rbx),%r9 sbbq 32(%rbx),%r10 movq %r11,%rsi sbbq 40(%rbx),%r11 sbbq $0,%r12 cmovcq %rax,%r14 cmovcq %rcx,%r15 cmovcq %rdx,%r8 movq %r14,0(%rdi) cmovcq %rbp,%r9 movq %r15,8(%rdi) cmovcq %r13,%r10 movq %r8,16(%rdi) cmovcq %rsi,%r11 movq %r9,24(%rdi) movq %r10,32(%rdi) movq %r11,40(%rdi) #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .globl _sgn0_pty_mont_384 .private_extern _sgn0_pty_mont_384 .p2align 5 _sgn0_pty_mont_384: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa #ifdef __BLST_PORTABLE__ testl $1,___blst_platform_cap(%rip) jnz L$sgn0_pty_mont_384$1 #endif pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 pushq %rbx .cfi_adjust_cfa_offset 8 .cfi_offset %rbx,-24 pushq %r12 .cfi_adjust_cfa_offset 8 .cfi_offset %r12,-32 pushq %r13 .cfi_adjust_cfa_offset 8 .cfi_offset %r13,-40 pushq %r14 .cfi_adjust_cfa_offset 8 .cfi_offset %r14,-48 pushq %r15 .cfi_adjust_cfa_offset 8 .cfi_offset %r15,-56 subq $8,%rsp .cfi_adjust_cfa_offset 8 movq %rsi,%rbx leaq 0(%rdi),%rsi movq %rdx,%rcx call __mulq_by_1_mont_384 xorq %rax,%rax movq %r14,%r13 addq %r14,%r14 adcq %r15,%r15 adcq %r8,%r8 adcq %r9,%r9 adcq %r10,%r10 adcq %r11,%r11 adcq $0,%rax subq 0(%rbx),%r14 sbbq 8(%rbx),%r15 sbbq 16(%rbx),%r8 sbbq 24(%rbx),%r9 sbbq 32(%rbx),%r10 sbbq 40(%rbx),%r11 sbbq $0,%rax notq %rax andq $1,%r13 andq $2,%rax orq %r13,%rax movq 8(%rsp),%r15 .cfi_restore %r15 movq 16(%rsp),%r14 .cfi_restore %r14 movq 
24(%rsp),%r13 .cfi_restore %r13 movq 32(%rsp),%r12 .cfi_restore %r12 movq 40(%rsp),%rbx .cfi_restore %rbx movq 48(%rsp),%rbp .cfi_restore %rbp leaq 56(%rsp),%rsp .cfi_adjust_cfa_offset -56 #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .globl _sgn0_pty_mont_384x .private_extern _sgn0_pty_mont_384x .p2align 5 _sgn0_pty_mont_384x: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa #ifdef __BLST_PORTABLE__ testl $1,___blst_platform_cap(%rip) jnz L$sgn0_pty_mont_384x$1 #endif pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 pushq %rbx .cfi_adjust_cfa_offset 8 .cfi_offset %rbx,-24 pushq %r12 .cfi_adjust_cfa_offset 8 .cfi_offset %r12,-32 pushq %r13 .cfi_adjust_cfa_offset 8 .cfi_offset %r13,-40 pushq %r14 .cfi_adjust_cfa_offset 8 .cfi_offset %r14,-48 pushq %r15 .cfi_adjust_cfa_offset 8 .cfi_offset %r15,-56 subq $8,%rsp .cfi_adjust_cfa_offset 8 movq %rsi,%rbx leaq 48(%rdi),%rsi movq %rdx,%rcx call __mulq_by_1_mont_384 movq %r14,%r12 orq %r15,%r14 orq %r8,%r14 orq %r9,%r14 orq %r10,%r14 orq %r11,%r14 leaq 0(%rdi),%rsi xorq %rdi,%rdi movq %r12,%r13 addq %r12,%r12 adcq %r15,%r15 adcq %r8,%r8 adcq %r9,%r9 adcq %r10,%r10 adcq %r11,%r11 adcq $0,%rdi subq 0(%rbx),%r12 sbbq 8(%rbx),%r15 sbbq 16(%rbx),%r8 sbbq 24(%rbx),%r9 sbbq 32(%rbx),%r10 sbbq 40(%rbx),%r11 sbbq $0,%rdi movq %r14,0(%rsp) notq %rdi andq $1,%r13 andq $2,%rdi orq %r13,%rdi call __mulq_by_1_mont_384 movq %r14,%r12 orq %r15,%r14 orq %r8,%r14 orq %r9,%r14 orq %r10,%r14 orq %r11,%r14 xorq %rax,%rax movq %r12,%r13 addq %r12,%r12 adcq %r15,%r15 adcq %r8,%r8 adcq %r9,%r9 adcq %r10,%r10 adcq %r11,%r11 adcq $0,%rax subq 0(%rbx),%r12 sbbq 8(%rbx),%r15 sbbq 16(%rbx),%r8 sbbq 24(%rbx),%r9 sbbq 32(%rbx),%r10 sbbq 40(%rbx),%r11 sbbq $0,%rax movq 0(%rsp),%r12 notq %rax testq %r14,%r14 cmovzq %rdi,%r13 testq %r12,%r12 cmovnzq %rdi,%rax andq $1,%r13 andq $2,%rax orq %r13,%rax movq 8(%rsp),%r15 .cfi_restore %r15 movq 16(%rsp),%r14 .cfi_restore %r14 movq 24(%rsp),%r13 .cfi_restore %r13 movq 32(%rsp),%r12 .cfi_restore %r12 movq 40(%rsp),%rbx .cfi_restore %rbx movq 48(%rsp),%rbp .cfi_restore %rbp leaq 56(%rsp),%rsp .cfi_adjust_cfa_offset -56 #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .globl _mul_mont_384 .private_extern _mul_mont_384 .p2align 5 _mul_mont_384: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa #ifdef __BLST_PORTABLE__ testl $1,___blst_platform_cap(%rip) jnz L$mul_mont_384$1 #endif pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 pushq %rbx .cfi_adjust_cfa_offset 8 .cfi_offset %rbx,-24 pushq %r12 .cfi_adjust_cfa_offset 8 .cfi_offset %r12,-32 pushq %r13 .cfi_adjust_cfa_offset 8 .cfi_offset %r13,-40 pushq %r14 .cfi_adjust_cfa_offset 8 .cfi_offset %r14,-48 pushq %r15 .cfi_adjust_cfa_offset 8 .cfi_offset %r15,-56 subq $24,%rsp .cfi_adjust_cfa_offset 8*3 movq 0(%rdx),%rax movq 0(%rsi),%r14 movq 8(%rsi),%r15 movq 16(%rsi),%r12 movq 24(%rsi),%r13 movq %rdx,%rbx movq %r8,0(%rsp) movq %rdi,8(%rsp) call __mulq_mont_384 movq 24(%rsp),%r15 .cfi_restore %r15 movq 32(%rsp),%r14 .cfi_restore %r14 movq 40(%rsp),%r13 .cfi_restore %r13 movq 48(%rsp),%r12 .cfi_restore %r12 movq 56(%rsp),%rbx .cfi_restore %rbx movq 64(%rsp),%rbp .cfi_restore %rbp leaq 72(%rsp),%rsp .cfi_adjust_cfa_offset -72 #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .p2align 5 __mulq_mont_384: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa movq %rax,%rdi mulq %r14 movq %rax,%r8 movq %rdi,%rax movq %rdx,%r9 mulq %r15 addq %rax,%r9 movq 
%rdi,%rax adcq $0,%rdx movq %rdx,%r10 mulq %r12 addq %rax,%r10 movq %rdi,%rax adcq $0,%rdx movq %rdx,%r11 movq %r8,%rbp imulq 8(%rsp),%r8 mulq %r13 addq %rax,%r11 movq %rdi,%rax adcq $0,%rdx movq %rdx,%r12 mulq 32(%rsi) addq %rax,%r12 movq %rdi,%rax adcq $0,%rdx movq %rdx,%r13 mulq 40(%rsi) addq %rax,%r13 movq %r8,%rax adcq $0,%rdx xorq %r15,%r15 movq %rdx,%r14 mulq 0(%rcx) addq %rax,%rbp movq %r8,%rax adcq %rdx,%rbp mulq 8(%rcx) addq %rax,%r9 movq %r8,%rax adcq $0,%rdx addq %rbp,%r9 adcq $0,%rdx movq %rdx,%rbp mulq 16(%rcx) addq %rax,%r10 movq %r8,%rax adcq $0,%rdx addq %rbp,%r10 adcq $0,%rdx movq %rdx,%rbp mulq 24(%rcx) addq %rbp,%r11 adcq $0,%rdx addq %rax,%r11 movq %r8,%rax adcq $0,%rdx movq %rdx,%rbp mulq 32(%rcx) addq %rax,%r12 movq %r8,%rax adcq $0,%rdx addq %rbp,%r12 adcq $0,%rdx movq %rdx,%rbp mulq 40(%rcx) addq %rax,%r13 movq 8(%rbx),%rax adcq $0,%rdx addq %rbp,%r13 adcq %rdx,%r14 adcq $0,%r15 movq %rax,%rdi mulq 0(%rsi) addq %rax,%r9 movq %rdi,%rax adcq $0,%rdx movq %rdx,%r8 mulq 8(%rsi) addq %rax,%r10 movq %rdi,%rax adcq $0,%rdx addq %r8,%r10 adcq $0,%rdx movq %rdx,%r8 mulq 16(%rsi) addq %rax,%r11 movq %rdi,%rax adcq $0,%rdx addq %r8,%r11 adcq $0,%rdx movq %rdx,%r8 movq %r9,%rbp imulq 8(%rsp),%r9 mulq 24(%rsi) addq %rax,%r12 movq %rdi,%rax adcq $0,%rdx addq %r8,%r12 adcq $0,%rdx movq %rdx,%r8 mulq 32(%rsi) addq %rax,%r13 movq %rdi,%rax adcq $0,%rdx addq %r8,%r13 adcq $0,%rdx movq %rdx,%r8 mulq 40(%rsi) addq %r8,%r14 adcq $0,%rdx xorq %r8,%r8 addq %rax,%r14 movq %r9,%rax adcq %rdx,%r15 adcq $0,%r8 mulq 0(%rcx) addq %rax,%rbp movq %r9,%rax adcq %rdx,%rbp mulq 8(%rcx) addq %rax,%r10 movq %r9,%rax adcq $0,%rdx addq %rbp,%r10 adcq $0,%rdx movq %rdx,%rbp mulq 16(%rcx) addq %rax,%r11 movq %r9,%rax adcq $0,%rdx addq %rbp,%r11 adcq $0,%rdx movq %rdx,%rbp mulq 24(%rcx) addq %rbp,%r12 adcq $0,%rdx addq %rax,%r12 movq %r9,%rax adcq $0,%rdx movq %rdx,%rbp mulq 32(%rcx) addq %rax,%r13 movq %r9,%rax adcq $0,%rdx addq %rbp,%r13 adcq $0,%rdx movq %rdx,%rbp mulq 40(%rcx) addq %rax,%r14 movq 16(%rbx),%rax adcq $0,%rdx addq %rbp,%r14 adcq %rdx,%r15 adcq $0,%r8 movq %rax,%rdi mulq 0(%rsi) addq %rax,%r10 movq %rdi,%rax adcq $0,%rdx movq %rdx,%r9 mulq 8(%rsi) addq %rax,%r11 movq %rdi,%rax adcq $0,%rdx addq %r9,%r11 adcq $0,%rdx movq %rdx,%r9 mulq 16(%rsi) addq %rax,%r12 movq %rdi,%rax adcq $0,%rdx addq %r9,%r12 adcq $0,%rdx movq %rdx,%r9 movq %r10,%rbp imulq 8(%rsp),%r10 mulq 24(%rsi) addq %rax,%r13 movq %rdi,%rax adcq $0,%rdx addq %r9,%r13 adcq $0,%rdx movq %rdx,%r9 mulq 32(%rsi) addq %rax,%r14 movq %rdi,%rax adcq $0,%rdx addq %r9,%r14 adcq $0,%rdx movq %rdx,%r9 mulq 40(%rsi) addq %r9,%r15 adcq $0,%rdx xorq %r9,%r9 addq %rax,%r15 movq %r10,%rax adcq %rdx,%r8 adcq $0,%r9 mulq 0(%rcx) addq %rax,%rbp movq %r10,%rax adcq %rdx,%rbp mulq 8(%rcx) addq %rax,%r11 movq %r10,%rax adcq $0,%rdx addq %rbp,%r11 adcq $0,%rdx movq %rdx,%rbp mulq 16(%rcx) addq %rax,%r12 movq %r10,%rax adcq $0,%rdx addq %rbp,%r12 adcq $0,%rdx movq %rdx,%rbp mulq 24(%rcx) addq %rbp,%r13 adcq $0,%rdx addq %rax,%r13 movq %r10,%rax adcq $0,%rdx movq %rdx,%rbp mulq 32(%rcx) addq %rax,%r14 movq %r10,%rax adcq $0,%rdx addq %rbp,%r14 adcq $0,%rdx movq %rdx,%rbp mulq 40(%rcx) addq %rax,%r15 movq 24(%rbx),%rax adcq $0,%rdx addq %rbp,%r15 adcq %rdx,%r8 adcq $0,%r9 movq %rax,%rdi mulq 0(%rsi) addq %rax,%r11 movq %rdi,%rax adcq $0,%rdx movq %rdx,%r10 mulq 8(%rsi) addq %rax,%r12 movq %rdi,%rax adcq $0,%rdx addq %r10,%r12 adcq $0,%rdx movq %rdx,%r10 mulq 16(%rsi) addq %rax,%r13 movq %rdi,%rax adcq $0,%rdx addq %r10,%r13 adcq $0,%rdx movq %rdx,%r10 
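/*
 * A sketch of what __mulq_mont_384 is doing: it interleaves schoolbook
 * multiplication with Montgomery reduction (the CIOS pattern).  Each of
 * the six iterations folds in one 64-bit word of the multiplicand b and
 * immediately cancels the low limb of the accumulator.  In outline, with
 * n0 = -p^-1 mod 2^64 held at 8(%rsp):
 *
 *     for i in 0..5:
 *         acc += a * b[i]                  # 6x1 limb products via mulq
 *         m    = (acc[0] * n0) mod 2^64    # the imulq 8(%rsp) steps
 *         acc  = (acc + m * p) >> 64       # low limb becomes zero
 *     if acc >= p: acc -= p                # branchless, via sbb/cmovc
 *
 * The sbb/cmovc tail keeps the final subtraction constant-time.
 */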
movq %r11,%rbp imulq 8(%rsp),%r11 mulq 24(%rsi) addq %rax,%r14 movq %rdi,%rax adcq $0,%rdx addq %r10,%r14 adcq $0,%rdx movq %rdx,%r10 mulq 32(%rsi) addq %rax,%r15 movq %rdi,%rax adcq $0,%rdx addq %r10,%r15 adcq $0,%rdx movq %rdx,%r10 mulq 40(%rsi) addq %r10,%r8 adcq $0,%rdx xorq %r10,%r10 addq %rax,%r8 movq %r11,%rax adcq %rdx,%r9 adcq $0,%r10 mulq 0(%rcx) addq %rax,%rbp movq %r11,%rax adcq %rdx,%rbp mulq 8(%rcx) addq %rax,%r12 movq %r11,%rax adcq $0,%rdx addq %rbp,%r12 adcq $0,%rdx movq %rdx,%rbp mulq 16(%rcx) addq %rax,%r13 movq %r11,%rax adcq $0,%rdx addq %rbp,%r13 adcq $0,%rdx movq %rdx,%rbp mulq 24(%rcx) addq %rbp,%r14 adcq $0,%rdx addq %rax,%r14 movq %r11,%rax adcq $0,%rdx movq %rdx,%rbp mulq 32(%rcx) addq %rax,%r15 movq %r11,%rax adcq $0,%rdx addq %rbp,%r15 adcq $0,%rdx movq %rdx,%rbp mulq 40(%rcx) addq %rax,%r8 movq 32(%rbx),%rax adcq $0,%rdx addq %rbp,%r8 adcq %rdx,%r9 adcq $0,%r10 movq %rax,%rdi mulq 0(%rsi) addq %rax,%r12 movq %rdi,%rax adcq $0,%rdx movq %rdx,%r11 mulq 8(%rsi) addq %rax,%r13 movq %rdi,%rax adcq $0,%rdx addq %r11,%r13 adcq $0,%rdx movq %rdx,%r11 mulq 16(%rsi) addq %rax,%r14 movq %rdi,%rax adcq $0,%rdx addq %r11,%r14 adcq $0,%rdx movq %rdx,%r11 movq %r12,%rbp imulq 8(%rsp),%r12 mulq 24(%rsi) addq %rax,%r15 movq %rdi,%rax adcq $0,%rdx addq %r11,%r15 adcq $0,%rdx movq %rdx,%r11 mulq 32(%rsi) addq %rax,%r8 movq %rdi,%rax adcq $0,%rdx addq %r11,%r8 adcq $0,%rdx movq %rdx,%r11 mulq 40(%rsi) addq %r11,%r9 adcq $0,%rdx xorq %r11,%r11 addq %rax,%r9 movq %r12,%rax adcq %rdx,%r10 adcq $0,%r11 mulq 0(%rcx) addq %rax,%rbp movq %r12,%rax adcq %rdx,%rbp mulq 8(%rcx) addq %rax,%r13 movq %r12,%rax adcq $0,%rdx addq %rbp,%r13 adcq $0,%rdx movq %rdx,%rbp mulq 16(%rcx) addq %rax,%r14 movq %r12,%rax adcq $0,%rdx addq %rbp,%r14 adcq $0,%rdx movq %rdx,%rbp mulq 24(%rcx) addq %rbp,%r15 adcq $0,%rdx addq %rax,%r15 movq %r12,%rax adcq $0,%rdx movq %rdx,%rbp mulq 32(%rcx) addq %rax,%r8 movq %r12,%rax adcq $0,%rdx addq %rbp,%r8 adcq $0,%rdx movq %rdx,%rbp mulq 40(%rcx) addq %rax,%r9 movq 40(%rbx),%rax adcq $0,%rdx addq %rbp,%r9 adcq %rdx,%r10 adcq $0,%r11 movq %rax,%rdi mulq 0(%rsi) addq %rax,%r13 movq %rdi,%rax adcq $0,%rdx movq %rdx,%r12 mulq 8(%rsi) addq %rax,%r14 movq %rdi,%rax adcq $0,%rdx addq %r12,%r14 adcq $0,%rdx movq %rdx,%r12 mulq 16(%rsi) addq %rax,%r15 movq %rdi,%rax adcq $0,%rdx addq %r12,%r15 adcq $0,%rdx movq %rdx,%r12 movq %r13,%rbp imulq 8(%rsp),%r13 mulq 24(%rsi) addq %rax,%r8 movq %rdi,%rax adcq $0,%rdx addq %r12,%r8 adcq $0,%rdx movq %rdx,%r12 mulq 32(%rsi) addq %rax,%r9 movq %rdi,%rax adcq $0,%rdx addq %r12,%r9 adcq $0,%rdx movq %rdx,%r12 mulq 40(%rsi) addq %r12,%r10 adcq $0,%rdx xorq %r12,%r12 addq %rax,%r10 movq %r13,%rax adcq %rdx,%r11 adcq $0,%r12 mulq 0(%rcx) addq %rax,%rbp movq %r13,%rax adcq %rdx,%rbp mulq 8(%rcx) addq %rax,%r14 movq %r13,%rax adcq $0,%rdx addq %rbp,%r14 adcq $0,%rdx movq %rdx,%rbp mulq 16(%rcx) addq %rax,%r15 movq %r13,%rax adcq $0,%rdx addq %rbp,%r15 adcq $0,%rdx movq %rdx,%rbp mulq 24(%rcx) addq %rbp,%r8 adcq $0,%rdx addq %rax,%r8 movq %r13,%rax adcq $0,%rdx movq %rdx,%rbp mulq 32(%rcx) addq %rax,%r9 movq %r13,%rax adcq $0,%rdx addq %rbp,%r9 adcq $0,%rdx movq %rdx,%rbp mulq 40(%rcx) addq %rax,%r10 movq %r14,%rax adcq $0,%rdx addq %rbp,%r10 adcq %rdx,%r11 adcq $0,%r12 movq 16(%rsp),%rdi subq 0(%rcx),%r14 movq %r15,%rdx sbbq 8(%rcx),%r15 movq %r8,%rbx sbbq 16(%rcx),%r8 movq %r9,%rsi sbbq 24(%rcx),%r9 movq %r10,%rbp sbbq 32(%rcx),%r10 movq %r11,%r13 sbbq 40(%rcx),%r11 sbbq $0,%r12 cmovcq %rax,%r14 cmovcq %rdx,%r15 cmovcq %rbx,%r8 movq 
%r14,0(%rdi) cmovcq %rsi,%r9 movq %r15,8(%rdi) cmovcq %rbp,%r10 movq %r8,16(%rdi) cmovcq %r13,%r11 movq %r9,24(%rdi) movq %r10,32(%rdi) movq %r11,40(%rdi) #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .globl _sqr_n_mul_mont_384 .private_extern _sqr_n_mul_mont_384 .p2align 5 _sqr_n_mul_mont_384: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa #ifdef __BLST_PORTABLE__ testl $1,___blst_platform_cap(%rip) jnz L$sqr_n_mul_mont_384$1 #endif pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 pushq %rbx .cfi_adjust_cfa_offset 8 .cfi_offset %rbx,-24 pushq %r12 .cfi_adjust_cfa_offset 8 .cfi_offset %r12,-32 pushq %r13 .cfi_adjust_cfa_offset 8 .cfi_offset %r13,-40 pushq %r14 .cfi_adjust_cfa_offset 8 .cfi_offset %r14,-48 pushq %r15 .cfi_adjust_cfa_offset 8 .cfi_offset %r15,-56 subq $136,%rsp .cfi_adjust_cfa_offset 8*17 movq %r8,0(%rsp) movq %rdi,8(%rsp) movq %rcx,16(%rsp) leaq 32(%rsp),%rdi movq %r9,24(%rsp) movq (%r9),%xmm2 L$oop_sqr_384: movd %edx,%xmm1 call __sqrq_384 leaq 0(%rdi),%rsi movq 0(%rsp),%rcx movq 16(%rsp),%rbx call __mulq_by_1_mont_384 call __redq_tail_mont_384 movd %xmm1,%edx leaq 0(%rdi),%rsi decl %edx jnz L$oop_sqr_384 .byte 102,72,15,126,208 movq %rbx,%rcx movq 24(%rsp),%rbx movq %r8,%r12 movq %r9,%r13 call __mulq_mont_384 leaq 136(%rsp),%r8 movq 136(%rsp),%r15 .cfi_restore %r15 movq 8(%r8),%r14 .cfi_restore %r14 movq 16(%r8),%r13 .cfi_restore %r13 movq 24(%r8),%r12 .cfi_restore %r12 movq 32(%r8),%rbx .cfi_restore %rbx movq 40(%r8),%rbp .cfi_restore %rbp leaq 48(%r8),%rsp .cfi_adjust_cfa_offset -8*23 #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .globl _sqr_n_mul_mont_383 .private_extern _sqr_n_mul_mont_383 .p2align 5 _sqr_n_mul_mont_383: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa #ifdef __BLST_PORTABLE__ testl $1,___blst_platform_cap(%rip) jnz L$sqr_n_mul_mont_383$1 #endif pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 pushq %rbx .cfi_adjust_cfa_offset 8 .cfi_offset %rbx,-24 pushq %r12 .cfi_adjust_cfa_offset 8 .cfi_offset %r12,-32 pushq %r13 .cfi_adjust_cfa_offset 8 .cfi_offset %r13,-40 pushq %r14 .cfi_adjust_cfa_offset 8 .cfi_offset %r14,-48 pushq %r15 .cfi_adjust_cfa_offset 8 .cfi_offset %r15,-56 subq $136,%rsp .cfi_adjust_cfa_offset 8*17 movq %r8,0(%rsp) movq %rdi,8(%rsp) movq %rcx,16(%rsp) leaq 32(%rsp),%rdi movq %r9,24(%rsp) movq (%r9),%xmm2 L$oop_sqr_383: movd %edx,%xmm1 call __sqrq_384 leaq 0(%rdi),%rsi movq 0(%rsp),%rcx movq 16(%rsp),%rbx call __mulq_by_1_mont_384 movd %xmm1,%edx addq 48(%rsi),%r14 adcq 56(%rsi),%r15 adcq 64(%rsi),%r8 adcq 72(%rsi),%r9 adcq 80(%rsi),%r10 adcq 88(%rsi),%r11 leaq 0(%rdi),%rsi movq %r14,0(%rdi) movq %r15,8(%rdi) movq %r8,16(%rdi) movq %r9,24(%rdi) movq %r10,32(%rdi) movq %r11,40(%rdi) decl %edx jnz L$oop_sqr_383 .byte 102,72,15,126,208 movq %rbx,%rcx movq 24(%rsp),%rbx movq %r8,%r12 movq %r9,%r13 call __mulq_mont_384 leaq 136(%rsp),%r8 movq 136(%rsp),%r15 .cfi_restore %r15 movq 8(%r8),%r14 .cfi_restore %r14 movq 16(%r8),%r13 .cfi_restore %r13 movq 24(%r8),%r12 .cfi_restore %r12 movq 32(%r8),%rbx .cfi_restore %rbx movq 40(%r8),%rbp .cfi_restore %rbp leaq 48(%r8),%rsp .cfi_adjust_cfa_offset -8*23 #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .p2align 5 __mulq_mont_383_nonred: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa movq %rax,%rbp mulq %r14 movq %rax,%r8 movq %rbp,%rax movq %rdx,%r9 mulq %r15 addq %rax,%r9 movq %rbp,%rax adcq $0,%rdx movq %rdx,%r10 mulq %r12 addq 
%rax,%r10 movq %rbp,%rax adcq $0,%rdx movq %rdx,%r11 movq %r8,%r15 imulq 8(%rsp),%r8 mulq %r13 addq %rax,%r11 movq %rbp,%rax adcq $0,%rdx movq %rdx,%r12 mulq 32(%rsi) addq %rax,%r12 movq %rbp,%rax adcq $0,%rdx movq %rdx,%r13 mulq 40(%rsi) addq %rax,%r13 movq %r8,%rax adcq $0,%rdx movq %rdx,%r14 mulq 0(%rcx) addq %rax,%r15 movq %r8,%rax adcq %rdx,%r15 mulq 8(%rcx) addq %rax,%r9 movq %r8,%rax adcq $0,%rdx addq %r15,%r9 adcq $0,%rdx movq %rdx,%r15 mulq 16(%rcx) addq %rax,%r10 movq %r8,%rax adcq $0,%rdx addq %r15,%r10 adcq $0,%rdx movq %rdx,%r15 mulq 24(%rcx) addq %r15,%r11 adcq $0,%rdx addq %rax,%r11 movq %r8,%rax adcq $0,%rdx movq %rdx,%r15 mulq 32(%rcx) addq %rax,%r12 movq %r8,%rax adcq $0,%rdx addq %r15,%r12 adcq $0,%rdx movq %rdx,%r15 mulq 40(%rcx) addq %rax,%r13 movq 8(%rbx),%rax adcq $0,%rdx addq %r15,%r13 adcq %rdx,%r14 movq %rax,%rbp mulq 0(%rsi) addq %rax,%r9 movq %rbp,%rax adcq $0,%rdx movq %rdx,%r15 mulq 8(%rsi) addq %rax,%r10 movq %rbp,%rax adcq $0,%rdx addq %r15,%r10 adcq $0,%rdx movq %rdx,%r15 mulq 16(%rsi) addq %rax,%r11 movq %rbp,%rax adcq $0,%rdx addq %r15,%r11 adcq $0,%rdx movq %rdx,%r15 movq %r9,%r8 imulq 8(%rsp),%r9 mulq 24(%rsi) addq %rax,%r12 movq %rbp,%rax adcq $0,%rdx addq %r15,%r12 adcq $0,%rdx movq %rdx,%r15 mulq 32(%rsi) addq %rax,%r13 movq %rbp,%rax adcq $0,%rdx addq %r15,%r13 adcq $0,%rdx movq %rdx,%r15 mulq 40(%rsi) addq %r15,%r14 adcq $0,%rdx addq %rax,%r14 movq %r9,%rax adcq $0,%rdx movq %rdx,%r15 mulq 0(%rcx) addq %rax,%r8 movq %r9,%rax adcq %rdx,%r8 mulq 8(%rcx) addq %rax,%r10 movq %r9,%rax adcq $0,%rdx addq %r8,%r10 adcq $0,%rdx movq %rdx,%r8 mulq 16(%rcx) addq %rax,%r11 movq %r9,%rax adcq $0,%rdx addq %r8,%r11 adcq $0,%rdx movq %rdx,%r8 mulq 24(%rcx) addq %r8,%r12 adcq $0,%rdx addq %rax,%r12 movq %r9,%rax adcq $0,%rdx movq %rdx,%r8 mulq 32(%rcx) addq %rax,%r13 movq %r9,%rax adcq $0,%rdx addq %r8,%r13 adcq $0,%rdx movq %rdx,%r8 mulq 40(%rcx) addq %rax,%r14 movq 16(%rbx),%rax adcq $0,%rdx addq %r8,%r14 adcq %rdx,%r15 movq %rax,%rbp mulq 0(%rsi) addq %rax,%r10 movq %rbp,%rax adcq $0,%rdx movq %rdx,%r8 mulq 8(%rsi) addq %rax,%r11 movq %rbp,%rax adcq $0,%rdx addq %r8,%r11 adcq $0,%rdx movq %rdx,%r8 mulq 16(%rsi) addq %rax,%r12 movq %rbp,%rax adcq $0,%rdx addq %r8,%r12 adcq $0,%rdx movq %rdx,%r8 movq %r10,%r9 imulq 8(%rsp),%r10 mulq 24(%rsi) addq %rax,%r13 movq %rbp,%rax adcq $0,%rdx addq %r8,%r13 adcq $0,%rdx movq %rdx,%r8 mulq 32(%rsi) addq %rax,%r14 movq %rbp,%rax adcq $0,%rdx addq %r8,%r14 adcq $0,%rdx movq %rdx,%r8 mulq 40(%rsi) addq %r8,%r15 adcq $0,%rdx addq %rax,%r15 movq %r10,%rax adcq $0,%rdx movq %rdx,%r8 mulq 0(%rcx) addq %rax,%r9 movq %r10,%rax adcq %rdx,%r9 mulq 8(%rcx) addq %rax,%r11 movq %r10,%rax adcq $0,%rdx addq %r9,%r11 adcq $0,%rdx movq %rdx,%r9 mulq 16(%rcx) addq %rax,%r12 movq %r10,%rax adcq $0,%rdx addq %r9,%r12 adcq $0,%rdx movq %rdx,%r9 mulq 24(%rcx) addq %r9,%r13 adcq $0,%rdx addq %rax,%r13 movq %r10,%rax adcq $0,%rdx movq %rdx,%r9 mulq 32(%rcx) addq %rax,%r14 movq %r10,%rax adcq $0,%rdx addq %r9,%r14 adcq $0,%rdx movq %rdx,%r9 mulq 40(%rcx) addq %rax,%r15 movq 24(%rbx),%rax adcq $0,%rdx addq %r9,%r15 adcq %rdx,%r8 movq %rax,%rbp mulq 0(%rsi) addq %rax,%r11 movq %rbp,%rax adcq $0,%rdx movq %rdx,%r9 mulq 8(%rsi) addq %rax,%r12 movq %rbp,%rax adcq $0,%rdx addq %r9,%r12 adcq $0,%rdx movq %rdx,%r9 mulq 16(%rsi) addq %rax,%r13 movq %rbp,%rax adcq $0,%rdx addq %r9,%r13 adcq $0,%rdx movq %rdx,%r9 movq %r11,%r10 imulq 8(%rsp),%r11 mulq 24(%rsi) addq %rax,%r14 movq %rbp,%rax adcq $0,%rdx addq %r9,%r14 adcq $0,%rdx movq %rdx,%r9 mulq 32(%rsi) 
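/*
 * __mulq_mont_383_nonred follows the same interleaved Montgomery
 * multiplication pattern as __mulq_mont_384 above, but omits both the
 * extra top carry word and the final conditional subtraction of the
 * modulus ("nonred" = not fully reduced).  That appears to be sound for
 * moduli comfortably below 384 bits, where the accumulator cannot
 * overflow six limbs; callers such as sqr_mont_382x below perform their
 * own lazy reduction on the combined result.
 */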
addq %rax,%r15 movq %rbp,%rax adcq $0,%rdx addq %r9,%r15 adcq $0,%rdx movq %rdx,%r9 mulq 40(%rsi) addq %r9,%r8 adcq $0,%rdx addq %rax,%r8 movq %r11,%rax adcq $0,%rdx movq %rdx,%r9 mulq 0(%rcx) addq %rax,%r10 movq %r11,%rax adcq %rdx,%r10 mulq 8(%rcx) addq %rax,%r12 movq %r11,%rax adcq $0,%rdx addq %r10,%r12 adcq $0,%rdx movq %rdx,%r10 mulq 16(%rcx) addq %rax,%r13 movq %r11,%rax adcq $0,%rdx addq %r10,%r13 adcq $0,%rdx movq %rdx,%r10 mulq 24(%rcx) addq %r10,%r14 adcq $0,%rdx addq %rax,%r14 movq %r11,%rax adcq $0,%rdx movq %rdx,%r10 mulq 32(%rcx) addq %rax,%r15 movq %r11,%rax adcq $0,%rdx addq %r10,%r15 adcq $0,%rdx movq %rdx,%r10 mulq 40(%rcx) addq %rax,%r8 movq 32(%rbx),%rax adcq $0,%rdx addq %r10,%r8 adcq %rdx,%r9 movq %rax,%rbp mulq 0(%rsi) addq %rax,%r12 movq %rbp,%rax adcq $0,%rdx movq %rdx,%r10 mulq 8(%rsi) addq %rax,%r13 movq %rbp,%rax adcq $0,%rdx addq %r10,%r13 adcq $0,%rdx movq %rdx,%r10 mulq 16(%rsi) addq %rax,%r14 movq %rbp,%rax adcq $0,%rdx addq %r10,%r14 adcq $0,%rdx movq %rdx,%r10 movq %r12,%r11 imulq 8(%rsp),%r12 mulq 24(%rsi) addq %rax,%r15 movq %rbp,%rax adcq $0,%rdx addq %r10,%r15 adcq $0,%rdx movq %rdx,%r10 mulq 32(%rsi) addq %rax,%r8 movq %rbp,%rax adcq $0,%rdx addq %r10,%r8 adcq $0,%rdx movq %rdx,%r10 mulq 40(%rsi) addq %r10,%r9 adcq $0,%rdx addq %rax,%r9 movq %r12,%rax adcq $0,%rdx movq %rdx,%r10 mulq 0(%rcx) addq %rax,%r11 movq %r12,%rax adcq %rdx,%r11 mulq 8(%rcx) addq %rax,%r13 movq %r12,%rax adcq $0,%rdx addq %r11,%r13 adcq $0,%rdx movq %rdx,%r11 mulq 16(%rcx) addq %rax,%r14 movq %r12,%rax adcq $0,%rdx addq %r11,%r14 adcq $0,%rdx movq %rdx,%r11 mulq 24(%rcx) addq %r11,%r15 adcq $0,%rdx addq %rax,%r15 movq %r12,%rax adcq $0,%rdx movq %rdx,%r11 mulq 32(%rcx) addq %rax,%r8 movq %r12,%rax adcq $0,%rdx addq %r11,%r8 adcq $0,%rdx movq %rdx,%r11 mulq 40(%rcx) addq %rax,%r9 movq 40(%rbx),%rax adcq $0,%rdx addq %r11,%r9 adcq %rdx,%r10 movq %rax,%rbp mulq 0(%rsi) addq %rax,%r13 movq %rbp,%rax adcq $0,%rdx movq %rdx,%r11 mulq 8(%rsi) addq %rax,%r14 movq %rbp,%rax adcq $0,%rdx addq %r11,%r14 adcq $0,%rdx movq %rdx,%r11 mulq 16(%rsi) addq %rax,%r15 movq %rbp,%rax adcq $0,%rdx addq %r11,%r15 adcq $0,%rdx movq %rdx,%r11 movq %r13,%r12 imulq 8(%rsp),%r13 mulq 24(%rsi) addq %rax,%r8 movq %rbp,%rax adcq $0,%rdx addq %r11,%r8 adcq $0,%rdx movq %rdx,%r11 mulq 32(%rsi) addq %rax,%r9 movq %rbp,%rax adcq $0,%rdx addq %r11,%r9 adcq $0,%rdx movq %rdx,%r11 mulq 40(%rsi) addq %r11,%r10 adcq $0,%rdx addq %rax,%r10 movq %r13,%rax adcq $0,%rdx movq %rdx,%r11 mulq 0(%rcx) addq %rax,%r12 movq %r13,%rax adcq %rdx,%r12 mulq 8(%rcx) addq %rax,%r14 movq %r13,%rax adcq $0,%rdx addq %r12,%r14 adcq $0,%rdx movq %rdx,%r12 mulq 16(%rcx) addq %rax,%r15 movq %r13,%rax adcq $0,%rdx addq %r12,%r15 adcq $0,%rdx movq %rdx,%r12 mulq 24(%rcx) addq %r12,%r8 adcq $0,%rdx addq %rax,%r8 movq %r13,%rax adcq $0,%rdx movq %rdx,%r12 mulq 32(%rcx) addq %rax,%r9 movq %r13,%rax adcq $0,%rdx addq %r12,%r9 adcq $0,%rdx movq %rdx,%r12 mulq 40(%rcx) addq %rax,%r10 movq %r14,%rax adcq $0,%rdx addq %r12,%r10 adcq %rdx,%r11 #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .globl _sqr_mont_382x .private_extern _sqr_mont_382x .p2align 5 _sqr_mont_382x: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa #ifdef __BLST_PORTABLE__ testl $1,___blst_platform_cap(%rip) jnz L$sqr_mont_382x$1 #endif pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 pushq %rbx .cfi_adjust_cfa_offset 8 .cfi_offset %rbx,-24 pushq %r12 .cfi_adjust_cfa_offset 8 .cfi_offset %r12,-32 pushq %r13 
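/*
 * Fp2 squaring in outline: for z = a0 + a1*u with u^2 = -1,
 *
 *     z^2 = (a0 + a1)*(a0 - a1) + 2*a0*a1*u
 *
 * so one squaring costs two base-field multiplications instead of three.
 * The code below forms a0+a1 and a0-a1 (keeping the borrow as a mask on
 * the stack), computes 2*a0*a1 for the imaginary half, then multiplies
 * the sum by the difference.  When a0 < a1 the difference wrapped mod
 * 2^384; after the Montgomery division by 2^384 that wrap contributes
 * exactly a0+a1, so a masked subtraction of a0+a1 (with a modulus
 * add-back on borrow) corrects the real half.
 */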
.cfi_adjust_cfa_offset 8 .cfi_offset %r13,-40 pushq %r14 .cfi_adjust_cfa_offset 8 .cfi_offset %r14,-48 pushq %r15 .cfi_adjust_cfa_offset 8 .cfi_offset %r15,-56 subq $136,%rsp .cfi_adjust_cfa_offset 136 movq %rcx,0(%rsp) movq %rdx,%rcx movq %rsi,16(%rsp) movq %rdi,24(%rsp) movq 0(%rsi),%r8 movq 8(%rsi),%r9 movq 16(%rsi),%r10 movq 24(%rsi),%r11 movq 32(%rsi),%r12 movq 40(%rsi),%r13 movq %r8,%r14 addq 48(%rsi),%r8 movq %r9,%r15 adcq 56(%rsi),%r9 movq %r10,%rax adcq 64(%rsi),%r10 movq %r11,%rdx adcq 72(%rsi),%r11 movq %r12,%rbx adcq 80(%rsi),%r12 movq %r13,%rbp adcq 88(%rsi),%r13 subq 48(%rsi),%r14 sbbq 56(%rsi),%r15 sbbq 64(%rsi),%rax sbbq 72(%rsi),%rdx sbbq 80(%rsi),%rbx sbbq 88(%rsi),%rbp sbbq %rdi,%rdi movq %r8,32+0(%rsp) movq %r9,32+8(%rsp) movq %r10,32+16(%rsp) movq %r11,32+24(%rsp) movq %r12,32+32(%rsp) movq %r13,32+40(%rsp) movq %r14,32+48(%rsp) movq %r15,32+56(%rsp) movq %rax,32+64(%rsp) movq %rdx,32+72(%rsp) movq %rbx,32+80(%rsp) movq %rbp,32+88(%rsp) movq %rdi,32+96(%rsp) leaq 48(%rsi),%rbx movq 48(%rsi),%rax movq 0(%rsi),%r14 movq 8(%rsi),%r15 movq 16(%rsi),%r12 movq 24(%rsi),%r13 movq 24(%rsp),%rdi call __mulq_mont_383_nonred addq %r14,%r14 adcq %r15,%r15 adcq %r8,%r8 adcq %r9,%r9 adcq %r10,%r10 adcq %r11,%r11 movq %r14,48(%rdi) movq %r15,56(%rdi) movq %r8,64(%rdi) movq %r9,72(%rdi) movq %r10,80(%rdi) movq %r11,88(%rdi) leaq 32(%rsp),%rsi leaq 32+48(%rsp),%rbx movq 32+48(%rsp),%rax movq 32+0(%rsp),%r14 movq 32+8(%rsp),%r15 movq 32+16(%rsp),%r12 movq 32+24(%rsp),%r13 call __mulq_mont_383_nonred movq 32+96(%rsp),%rsi movq 32+0(%rsp),%r12 movq 32+8(%rsp),%r13 andq %rsi,%r12 movq 32+16(%rsp),%rax andq %rsi,%r13 movq 32+24(%rsp),%rbx andq %rsi,%rax movq 32+32(%rsp),%rbp andq %rsi,%rbx andq %rsi,%rbp andq 32+40(%rsp),%rsi subq %r12,%r14 movq 0(%rcx),%r12 sbbq %r13,%r15 movq 8(%rcx),%r13 sbbq %rax,%r8 movq 16(%rcx),%rax sbbq %rbx,%r9 movq 24(%rcx),%rbx sbbq %rbp,%r10 movq 32(%rcx),%rbp sbbq %rsi,%r11 sbbq %rsi,%rsi andq %rsi,%r12 andq %rsi,%r13 andq %rsi,%rax andq %rsi,%rbx andq %rsi,%rbp andq 40(%rcx),%rsi addq %r12,%r14 adcq %r13,%r15 adcq %rax,%r8 adcq %rbx,%r9 adcq %rbp,%r10 adcq %rsi,%r11 movq %r14,0(%rdi) movq %r15,8(%rdi) movq %r8,16(%rdi) movq %r9,24(%rdi) movq %r10,32(%rdi) movq %r11,40(%rdi) leaq 136(%rsp),%r8 movq 0(%r8),%r15 .cfi_restore %r15 movq 8(%r8),%r14 .cfi_restore %r14 movq 16(%r8),%r13 .cfi_restore %r13 movq 24(%r8),%r12 .cfi_restore %r12 movq 32(%r8),%rbx .cfi_restore %rbx movq 40(%r8),%rbp .cfi_restore %rbp leaq 48(%r8),%rsp .cfi_adjust_cfa_offset -136-8*6 #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc ================================================ FILE: build/mach-o/mulx_mont_256-x86_64.s ================================================ .text .globl _mulx_mont_sparse_256 .private_extern _mulx_mont_sparse_256 .p2align 5 _mulx_mont_sparse_256: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa L$mul_mont_sparse_256$1: pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 pushq %rbx .cfi_adjust_cfa_offset 8 .cfi_offset %rbx,-24 pushq %r12 .cfi_adjust_cfa_offset 8 .cfi_offset %r12,-32 pushq %r13 .cfi_adjust_cfa_offset 8 .cfi_offset %r13,-40 pushq %r14 .cfi_adjust_cfa_offset 8 .cfi_offset %r14,-48 pushq %r15 .cfi_adjust_cfa_offset 8 .cfi_offset %r15,-56 subq $8,%rsp .cfi_adjust_cfa_offset 8 movq %rdx,%rbx #ifdef __SGX_LVI_HARDENING__ lfence #endif movq 0(%rdx),%rdx movq 0(%rsi),%r14 movq 8(%rsi),%r15 movq 16(%rsi),%rbp movq 24(%rsi),%r9 leaq -128(%rsi),%rsi leaq -128(%rcx),%rcx mulxq %r14,%rax,%r11 call 
__mulx_mont_sparse_256 movq 8(%rsp),%r15 .cfi_restore %r15 movq 16(%rsp),%r14 .cfi_restore %r14 movq 24(%rsp),%r13 .cfi_restore %r13 movq 32(%rsp),%r12 .cfi_restore %r12 movq 40(%rsp),%rbx .cfi_restore %rbx movq 48(%rsp),%rbp .cfi_restore %rbp leaq 56(%rsp),%rsp .cfi_adjust_cfa_offset -56 #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .globl _sqrx_mont_sparse_256 .private_extern _sqrx_mont_sparse_256 .p2align 5 _sqrx_mont_sparse_256: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa L$sqr_mont_sparse_256$1: pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 pushq %rbx .cfi_adjust_cfa_offset 8 .cfi_offset %rbx,-24 pushq %r12 .cfi_adjust_cfa_offset 8 .cfi_offset %r12,-32 pushq %r13 .cfi_adjust_cfa_offset 8 .cfi_offset %r13,-40 pushq %r14 .cfi_adjust_cfa_offset 8 .cfi_offset %r14,-48 pushq %r15 .cfi_adjust_cfa_offset 8 .cfi_offset %r15,-56 subq $8,%rsp .cfi_adjust_cfa_offset 8 movq %rsi,%rbx movq %rcx,%r8 movq %rdx,%rcx #ifdef __SGX_LVI_HARDENING__ lfence #endif movq 0(%rsi),%rdx movq 8(%rsi),%r15 movq 16(%rsi),%rbp movq 24(%rsi),%r9 leaq -128(%rbx),%rsi leaq -128(%rcx),%rcx mulxq %rdx,%rax,%r11 call __mulx_mont_sparse_256 movq 8(%rsp),%r15 .cfi_restore %r15 movq 16(%rsp),%r14 .cfi_restore %r14 movq 24(%rsp),%r13 .cfi_restore %r13 movq 32(%rsp),%r12 .cfi_restore %r12 movq 40(%rsp),%rbx .cfi_restore %rbx movq 48(%rsp),%rbp .cfi_restore %rbp leaq 56(%rsp),%rsp .cfi_adjust_cfa_offset -56 #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .p2align 5 __mulx_mont_sparse_256: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa mulxq %r15,%r15,%r12 mulxq %rbp,%rbp,%r13 addq %r15,%r11 mulxq %r9,%r9,%r14 movq 8(%rbx),%rdx adcq %rbp,%r12 adcq %r9,%r13 adcq $0,%r14 movq %rax,%r10 imulq %r8,%rax xorq %r15,%r15 mulxq 0+128(%rsi),%rbp,%r9 adoxq %rbp,%r11 adcxq %r9,%r12 mulxq 8+128(%rsi),%rbp,%r9 adoxq %rbp,%r12 adcxq %r9,%r13 mulxq 16+128(%rsi),%rbp,%r9 adoxq %rbp,%r13 adcxq %r9,%r14 mulxq 24+128(%rsi),%rbp,%r9 movq %rax,%rdx adoxq %rbp,%r14 adcxq %r15,%r9 adoxq %r9,%r15 mulxq 0+128(%rcx),%rbp,%rax adcxq %rbp,%r10 adoxq %r11,%rax mulxq 8+128(%rcx),%rbp,%r9 adcxq %rbp,%rax adoxq %r9,%r12 mulxq 16+128(%rcx),%rbp,%r9 adcxq %rbp,%r12 adoxq %r9,%r13 mulxq 24+128(%rcx),%rbp,%r9 movq 16(%rbx),%rdx adcxq %rbp,%r13 adoxq %r9,%r14 adcxq %r10,%r14 adoxq %r10,%r15 adcxq %r10,%r15 adoxq %r10,%r10 adcq $0,%r10 movq %rax,%r11 imulq %r8,%rax xorq %rbp,%rbp mulxq 0+128(%rsi),%rbp,%r9 adoxq %rbp,%r12 adcxq %r9,%r13 mulxq 8+128(%rsi),%rbp,%r9 adoxq %rbp,%r13 adcxq %r9,%r14 mulxq 16+128(%rsi),%rbp,%r9 adoxq %rbp,%r14 adcxq %r9,%r15 mulxq 24+128(%rsi),%rbp,%r9 movq %rax,%rdx adoxq %rbp,%r15 adcxq %r10,%r9 adoxq %r9,%r10 mulxq 0+128(%rcx),%rbp,%rax adcxq %rbp,%r11 adoxq %r12,%rax mulxq 8+128(%rcx),%rbp,%r9 adcxq %rbp,%rax adoxq %r9,%r13 mulxq 16+128(%rcx),%rbp,%r9 adcxq %rbp,%r13 adoxq %r9,%r14 mulxq 24+128(%rcx),%rbp,%r9 movq 24(%rbx),%rdx adcxq %rbp,%r14 adoxq %r9,%r15 adcxq %r11,%r15 adoxq %r11,%r10 adcxq %r11,%r10 adoxq %r11,%r11 adcq $0,%r11 movq %rax,%r12 imulq %r8,%rax xorq %rbp,%rbp mulxq 0+128(%rsi),%rbp,%r9 adoxq %rbp,%r13 adcxq %r9,%r14 mulxq 8+128(%rsi),%rbp,%r9 adoxq %rbp,%r14 adcxq %r9,%r15 mulxq 16+128(%rsi),%rbp,%r9 adoxq %rbp,%r15 adcxq %r9,%r10 mulxq 24+128(%rsi),%rbp,%r9 movq %rax,%rdx adoxq %rbp,%r10 adcxq %r11,%r9 adoxq %r9,%r11 mulxq 0+128(%rcx),%rbp,%rax adcxq %rbp,%r12 adoxq %r13,%rax mulxq 8+128(%rcx),%rbp,%r9 adcxq %rbp,%rax adoxq %r9,%r14 mulxq 16+128(%rcx),%rbp,%r9 adcxq %rbp,%r14 adoxq %r9,%r15 mulxq 
24+128(%rcx),%rbp,%r9 movq %rax,%rdx adcxq %rbp,%r15 adoxq %r9,%r10 adcxq %r12,%r10 adoxq %r12,%r11 adcxq %r12,%r11 adoxq %r12,%r12 adcq $0,%r12 imulq %r8,%rdx xorq %rbp,%rbp mulxq 0+128(%rcx),%r13,%r9 adcxq %rax,%r13 adoxq %r9,%r14 mulxq 8+128(%rcx),%rbp,%r9 adcxq %rbp,%r14 adoxq %r9,%r15 mulxq 16+128(%rcx),%rbp,%r9 adcxq %rbp,%r15 adoxq %r9,%r10 mulxq 24+128(%rcx),%rbp,%r9 movq %r14,%rdx leaq 128(%rcx),%rcx adcxq %rbp,%r10 adoxq %r9,%r11 movq %r15,%rax adcxq %r13,%r11 adoxq %r13,%r12 adcq $0,%r12 movq %r10,%rbp subq 0(%rcx),%r14 sbbq 8(%rcx),%r15 sbbq 16(%rcx),%r10 movq %r11,%r9 sbbq 24(%rcx),%r11 sbbq $0,%r12 cmovcq %rdx,%r14 cmovcq %rax,%r15 cmovcq %rbp,%r10 movq %r14,0(%rdi) cmovcq %r9,%r11 movq %r15,8(%rdi) movq %r10,16(%rdi) movq %r11,24(%rdi) #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .globl _fromx_mont_256 .private_extern _fromx_mont_256 .p2align 5 _fromx_mont_256: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa L$from_mont_256$1: pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 pushq %rbx .cfi_adjust_cfa_offset 8 .cfi_offset %rbx,-24 pushq %r12 .cfi_adjust_cfa_offset 8 .cfi_offset %r12,-32 pushq %r13 .cfi_adjust_cfa_offset 8 .cfi_offset %r13,-40 pushq %r14 .cfi_adjust_cfa_offset 8 .cfi_offset %r14,-48 pushq %r15 .cfi_adjust_cfa_offset 8 .cfi_offset %r15,-56 subq $8,%rsp .cfi_adjust_cfa_offset 8 movq %rdx,%rbx call __mulx_by_1_mont_256 movq %r15,%rdx movq %r10,%r12 movq %r11,%r13 subq 0(%rbx),%r14 sbbq 8(%rbx),%r15 sbbq 16(%rbx),%r10 sbbq 24(%rbx),%r11 cmovncq %r14,%rax cmovncq %r15,%rdx cmovncq %r10,%r12 movq %rax,0(%rdi) cmovncq %r11,%r13 movq %rdx,8(%rdi) movq %r12,16(%rdi) movq %r13,24(%rdi) movq 8(%rsp),%r15 .cfi_restore %r15 movq 16(%rsp),%r14 .cfi_restore %r14 movq 24(%rsp),%r13 .cfi_restore %r13 movq 32(%rsp),%r12 .cfi_restore %r12 movq 40(%rsp),%rbx .cfi_restore %rbx movq 48(%rsp),%rbp .cfi_restore %rbp leaq 56(%rsp),%rsp .cfi_adjust_cfa_offset -56 #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .globl _redcx_mont_256 .private_extern _redcx_mont_256 .p2align 5 _redcx_mont_256: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa L$redc_mont_256$1: pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 pushq %rbx .cfi_adjust_cfa_offset 8 .cfi_offset %rbx,-24 pushq %r12 .cfi_adjust_cfa_offset 8 .cfi_offset %r12,-32 pushq %r13 .cfi_adjust_cfa_offset 8 .cfi_offset %r13,-40 pushq %r14 .cfi_adjust_cfa_offset 8 .cfi_offset %r14,-48 pushq %r15 .cfi_adjust_cfa_offset 8 .cfi_offset %r15,-56 subq $8,%rsp .cfi_adjust_cfa_offset 8 movq %rdx,%rbx call __mulx_by_1_mont_256 addq 32(%rsi),%r14 adcq 40(%rsi),%r15 movq %r14,%rax adcq 48(%rsi),%r10 movq %r15,%rdx adcq 56(%rsi),%r11 sbbq %rsi,%rsi movq %r10,%r12 subq 0(%rbx),%r14 sbbq 8(%rbx),%r15 sbbq 16(%rbx),%r10 movq %r11,%r13 sbbq 24(%rbx),%r11 sbbq $0,%rsi cmovncq %r14,%rax cmovncq %r15,%rdx cmovncq %r10,%r12 movq %rax,0(%rdi) cmovncq %r11,%r13 movq %rdx,8(%rdi) movq %r12,16(%rdi) movq %r13,24(%rdi) movq 8(%rsp),%r15 .cfi_restore %r15 movq 16(%rsp),%r14 .cfi_restore %r14 movq 24(%rsp),%r13 .cfi_restore %r13 movq 32(%rsp),%r12 .cfi_restore %r12 movq 40(%rsp),%rbx .cfi_restore %rbx movq 48(%rsp),%rbp .cfi_restore %rbp leaq 56(%rsp),%rsp .cfi_adjust_cfa_offset -56 #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .p2align 5 __mulx_by_1_mont_256: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa #ifdef __SGX_LVI_HARDENING__ lfence #endif movq 0(%rsi),%rax movq 8(%rsi),%r11 movq 16(%rsi),%r12 
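/*
 * __mulx_by_1_mont_256 is a Montgomery multiplication by 1, i.e. pure
 * REDC: four rounds of
 *
 *     m = (t[0] * n0) mod 2^64;  t = (t + m * p) >> 64
 *
 * with n0 in %rcx and the modulus at (%rbx).  It leaves t * 2^-256 mod p
 * (not yet fully reduced) in registers for the callers above, which
 * either subtract p once (fromx_mont_256) or first add the upper half of
 * a 512-bit input before reducing (redcx_mont_256).
 */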
movq 24(%rsi),%r13 movq %rax,%r14 imulq %rcx,%rax movq %rax,%r10 mulq 0(%rbx) addq %rax,%r14 movq %r10,%rax adcq %rdx,%r14 mulq 8(%rbx) addq %rax,%r11 movq %r10,%rax adcq $0,%rdx addq %r14,%r11 adcq $0,%rdx movq %rdx,%r14 mulq 16(%rbx) movq %r11,%r15 imulq %rcx,%r11 addq %rax,%r12 movq %r10,%rax adcq $0,%rdx addq %r14,%r12 adcq $0,%rdx movq %rdx,%r14 mulq 24(%rbx) addq %rax,%r13 movq %r11,%rax adcq $0,%rdx addq %r14,%r13 adcq $0,%rdx movq %rdx,%r14 mulq 0(%rbx) addq %rax,%r15 movq %r11,%rax adcq %rdx,%r15 mulq 8(%rbx) addq %rax,%r12 movq %r11,%rax adcq $0,%rdx addq %r15,%r12 adcq $0,%rdx movq %rdx,%r15 mulq 16(%rbx) movq %r12,%r10 imulq %rcx,%r12 addq %rax,%r13 movq %r11,%rax adcq $0,%rdx addq %r15,%r13 adcq $0,%rdx movq %rdx,%r15 mulq 24(%rbx) addq %rax,%r14 movq %r12,%rax adcq $0,%rdx addq %r15,%r14 adcq $0,%rdx movq %rdx,%r15 mulq 0(%rbx) addq %rax,%r10 movq %r12,%rax adcq %rdx,%r10 mulq 8(%rbx) addq %rax,%r13 movq %r12,%rax adcq $0,%rdx addq %r10,%r13 adcq $0,%rdx movq %rdx,%r10 mulq 16(%rbx) movq %r13,%r11 imulq %rcx,%r13 addq %rax,%r14 movq %r12,%rax adcq $0,%rdx addq %r10,%r14 adcq $0,%rdx movq %rdx,%r10 mulq 24(%rbx) addq %rax,%r15 movq %r13,%rax adcq $0,%rdx addq %r10,%r15 adcq $0,%rdx movq %rdx,%r10 mulq 0(%rbx) addq %rax,%r11 movq %r13,%rax adcq %rdx,%r11 mulq 8(%rbx) addq %rax,%r14 movq %r13,%rax adcq $0,%rdx addq %r11,%r14 adcq $0,%rdx movq %rdx,%r11 mulq 16(%rbx) addq %rax,%r15 movq %r13,%rax adcq $0,%rdx addq %r11,%r15 adcq $0,%rdx movq %rdx,%r11 mulq 24(%rbx) addq %rax,%r10 movq %r14,%rax adcq $0,%rdx addq %r11,%r10 adcq $0,%rdx movq %rdx,%r11 #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc ================================================ FILE: build/mach-o/mulx_mont_384-x86_64.s ================================================ .text .p2align 5 __subx_mod_384x384: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa movq 0(%rsi),%r8 movq 8(%rsi),%r9 movq 16(%rsi),%r10 movq 24(%rsi),%r11 movq 32(%rsi),%r12 movq 40(%rsi),%r13 movq 48(%rsi),%r14 subq 0(%rdx),%r8 movq 56(%rsi),%r15 sbbq 8(%rdx),%r9 movq 64(%rsi),%rax sbbq 16(%rdx),%r10 movq 72(%rsi),%rbx sbbq 24(%rdx),%r11 movq 80(%rsi),%rbp sbbq 32(%rdx),%r12 movq 88(%rsi),%rsi sbbq 40(%rdx),%r13 movq %r8,0(%rdi) sbbq 48(%rdx),%r14 movq 0(%rcx),%r8 movq %r9,8(%rdi) sbbq 56(%rdx),%r15 movq 8(%rcx),%r9 movq %r10,16(%rdi) sbbq 64(%rdx),%rax movq 16(%rcx),%r10 movq %r11,24(%rdi) sbbq 72(%rdx),%rbx movq 24(%rcx),%r11 movq %r12,32(%rdi) sbbq 80(%rdx),%rbp movq 32(%rcx),%r12 movq %r13,40(%rdi) sbbq 88(%rdx),%rsi movq 40(%rcx),%r13 sbbq %rdx,%rdx andq %rdx,%r8 andq %rdx,%r9 andq %rdx,%r10 andq %rdx,%r11 andq %rdx,%r12 andq %rdx,%r13 addq %r8,%r14 adcq %r9,%r15 movq %r14,48(%rdi) adcq %r10,%rax movq %r15,56(%rdi) adcq %r11,%rbx movq %rax,64(%rdi) adcq %r12,%rbp movq %rbx,72(%rdi) adcq %r13,%rsi movq %rbp,80(%rdi) movq %rsi,88(%rdi) #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .p2align 5 __addx_mod_384: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa #ifdef __SGX_LVI_HARDENING__ lfence #endif movq 0(%rsi),%r8 movq 8(%rsi),%r9 movq 16(%rsi),%r10 movq 24(%rsi),%r11 movq 32(%rsi),%r12 movq 40(%rsi),%r13 addq 0(%rdx),%r8 adcq 8(%rdx),%r9 adcq 16(%rdx),%r10 movq %r8,%r14 adcq 24(%rdx),%r11 movq %r9,%r15 adcq 32(%rdx),%r12 movq %r10,%rax adcq 40(%rdx),%r13 movq %r11,%rbx sbbq %rdx,%rdx subq 0(%rcx),%r8 sbbq 8(%rcx),%r9 movq %r12,%rbp sbbq 16(%rcx),%r10 sbbq 24(%rcx),%r11 sbbq 32(%rcx),%r12 movq %r13,%rsi sbbq 40(%rcx),%r13 sbbq $0,%rdx cmovcq 
%r14,%r8 cmovcq %r15,%r9 cmovcq %rax,%r10 movq %r8,0(%rdi) cmovcq %rbx,%r11 movq %r9,8(%rdi) cmovcq %rbp,%r12 movq %r10,16(%rdi) cmovcq %rsi,%r13 movq %r11,24(%rdi) movq %r12,32(%rdi) movq %r13,40(%rdi) #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .p2align 5 __subx_mod_384: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa #ifdef __SGX_LVI_HARDENING__ lfence #endif movq 0(%rsi),%r8 movq 8(%rsi),%r9 movq 16(%rsi),%r10 movq 24(%rsi),%r11 movq 32(%rsi),%r12 movq 40(%rsi),%r13 __subx_mod_384_a_is_loaded: subq 0(%rdx),%r8 movq 0(%rcx),%r14 sbbq 8(%rdx),%r9 movq 8(%rcx),%r15 sbbq 16(%rdx),%r10 movq 16(%rcx),%rax sbbq 24(%rdx),%r11 movq 24(%rcx),%rbx sbbq 32(%rdx),%r12 movq 32(%rcx),%rbp sbbq 40(%rdx),%r13 movq 40(%rcx),%rsi sbbq %rdx,%rdx andq %rdx,%r14 andq %rdx,%r15 andq %rdx,%rax andq %rdx,%rbx andq %rdx,%rbp andq %rdx,%rsi addq %r14,%r8 adcq %r15,%r9 movq %r8,0(%rdi) adcq %rax,%r10 movq %r9,8(%rdi) adcq %rbx,%r11 movq %r10,16(%rdi) adcq %rbp,%r12 movq %r11,24(%rdi) adcq %rsi,%r13 movq %r12,32(%rdi) movq %r13,40(%rdi) #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .globl _mulx_mont_384x .private_extern _mulx_mont_384x .p2align 5 _mulx_mont_384x: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa L$mul_mont_384x$1: pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 pushq %rbx .cfi_adjust_cfa_offset 8 .cfi_offset %rbx,-24 pushq %r12 .cfi_adjust_cfa_offset 8 .cfi_offset %r12,-32 pushq %r13 .cfi_adjust_cfa_offset 8 .cfi_offset %r13,-40 pushq %r14 .cfi_adjust_cfa_offset 8 .cfi_offset %r14,-48 pushq %r15 .cfi_adjust_cfa_offset 8 .cfi_offset %r15,-56 subq $328,%rsp .cfi_adjust_cfa_offset 328 movq %rdx,%rbx movq %rdi,32(%rsp) movq %rsi,24(%rsp) movq %rdx,16(%rsp) movq %rcx,8(%rsp) movq %r8,0(%rsp) leaq 40(%rsp),%rdi #ifdef __SGX_LVI_HARDENING__ lfence #endif call __mulx_384 leaq 48(%rbx),%rbx leaq 128+48(%rsi),%rsi leaq 96(%rdi),%rdi call __mulx_384 movq 8(%rsp),%rcx leaq (%rbx),%rsi leaq -48(%rbx),%rdx leaq 40+192+48(%rsp),%rdi #ifdef __SGX_LVI_HARDENING__ lfence #endif call __addx_mod_384 movq 24(%rsp),%rsi leaq 48(%rsi),%rdx leaq -48(%rdi),%rdi #ifdef __SGX_LVI_HARDENING__ lfence #endif call __addx_mod_384 leaq (%rdi),%rbx leaq 48(%rdi),%rsi call __mulx_384 leaq (%rdi),%rsi leaq 40(%rsp),%rdx movq 8(%rsp),%rcx #ifdef __SGX_LVI_HARDENING__ lfence #endif call __subx_mod_384x384 leaq (%rdi),%rsi leaq -96(%rdi),%rdx call __subx_mod_384x384 leaq 40(%rsp),%rsi leaq 40+96(%rsp),%rdx leaq 40(%rsp),%rdi call __subx_mod_384x384 leaq (%rcx),%rbx leaq 40(%rsp),%rsi movq 0(%rsp),%rcx movq 32(%rsp),%rdi call __mulx_by_1_mont_384 call __redx_tail_mont_384 leaq 40+192(%rsp),%rsi movq 0(%rsp),%rcx leaq 48(%rdi),%rdi call __mulx_by_1_mont_384 call __redx_tail_mont_384 leaq 328(%rsp),%r8 movq 0(%r8),%r15 .cfi_restore %r15 movq 8(%r8),%r14 .cfi_restore %r14 movq 16(%r8),%r13 .cfi_restore %r13 movq 24(%r8),%r12 .cfi_restore %r12 movq 32(%r8),%rbx .cfi_restore %rbx movq 40(%r8),%rbp .cfi_restore %rbp leaq 48(%r8),%rsp .cfi_adjust_cfa_offset -328-8*6 #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .globl _sqrx_mont_384x .private_extern _sqrx_mont_384x .p2align 5 _sqrx_mont_384x: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa L$sqr_mont_384x$1: pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 pushq %rbx .cfi_adjust_cfa_offset 8 .cfi_offset %rbx,-24 pushq %r12 .cfi_adjust_cfa_offset 8 .cfi_offset %r12,-32 pushq %r13 .cfi_adjust_cfa_offset 8 .cfi_offset %r13,-40 pushq 
%r14 .cfi_adjust_cfa_offset 8 .cfi_offset %r14,-48 pushq %r15 .cfi_adjust_cfa_offset 8 .cfi_offset %r15,-56 subq $136,%rsp .cfi_adjust_cfa_offset 136 movq %rcx,0(%rsp) movq %rdx,%rcx movq %rdi,16(%rsp) movq %rsi,24(%rsp) leaq 48(%rsi),%rdx leaq 32(%rsp),%rdi call __addx_mod_384 movq 24(%rsp),%rsi leaq 48(%rsi),%rdx leaq 32+48(%rsp),%rdi call __subx_mod_384 movq 24(%rsp),%rsi leaq 48(%rsi),%rbx #ifdef __SGX_LVI_HARDENING__ lfence #endif movq 48(%rsi),%rdx movq 0(%rsi),%r14 movq 8(%rsi),%r15 movq 16(%rsi),%rax movq 24(%rsi),%r12 movq 32(%rsi),%rdi movq 40(%rsi),%rbp leaq -128(%rsi),%rsi leaq -128(%rcx),%rcx mulxq %r14,%r8,%r9 call __mulx_mont_384 addq %rdx,%rdx adcq %r15,%r15 adcq %rax,%rax movq %rdx,%r8 adcq %r12,%r12 movq %r15,%r9 adcq %rdi,%rdi movq %rax,%r10 adcq %rbp,%rbp movq %r12,%r11 sbbq %rsi,%rsi subq 0(%rcx),%rdx sbbq 8(%rcx),%r15 movq %rdi,%r13 sbbq 16(%rcx),%rax sbbq 24(%rcx),%r12 sbbq 32(%rcx),%rdi movq %rbp,%r14 sbbq 40(%rcx),%rbp sbbq $0,%rsi cmovcq %r8,%rdx cmovcq %r9,%r15 cmovcq %r10,%rax movq %rdx,48(%rbx) cmovcq %r11,%r12 movq %r15,56(%rbx) cmovcq %r13,%rdi movq %rax,64(%rbx) cmovcq %r14,%rbp movq %r12,72(%rbx) movq %rdi,80(%rbx) movq %rbp,88(%rbx) leaq 32(%rsp),%rsi leaq 32+48(%rsp),%rbx movq 32+48(%rsp),%rdx movq 32+0(%rsp),%r14 movq 32+8(%rsp),%r15 movq 32+16(%rsp),%rax movq 32+24(%rsp),%r12 movq 32+32(%rsp),%rdi movq 32+40(%rsp),%rbp leaq -128(%rsi),%rsi leaq -128(%rcx),%rcx mulxq %r14,%r8,%r9 call __mulx_mont_384 leaq 136(%rsp),%r8 movq 0(%r8),%r15 .cfi_restore %r15 movq 8(%r8),%r14 .cfi_restore %r14 movq 16(%r8),%r13 .cfi_restore %r13 movq 24(%r8),%r12 .cfi_restore %r12 movq 32(%r8),%rbx .cfi_restore %rbx movq 40(%r8),%rbp .cfi_restore %rbp leaq 48(%r8),%rsp .cfi_adjust_cfa_offset -136-8*6 #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .globl _mulx_382x .private_extern _mulx_382x .p2align 5 _mulx_382x: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa L$mul_382x$1: pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 pushq %rbx .cfi_adjust_cfa_offset 8 .cfi_offset %rbx,-24 pushq %r12 .cfi_adjust_cfa_offset 8 .cfi_offset %r12,-32 pushq %r13 .cfi_adjust_cfa_offset 8 .cfi_offset %r13,-40 pushq %r14 .cfi_adjust_cfa_offset 8 .cfi_offset %r14,-48 pushq %r15 .cfi_adjust_cfa_offset 8 .cfi_offset %r15,-56 subq $136,%rsp .cfi_adjust_cfa_offset 136 leaq 96(%rdi),%rdi movq %rsi,0(%rsp) movq %rdx,8(%rsp) movq %rdi,16(%rsp) movq %rcx,24(%rsp) #ifdef __SGX_LVI_HARDENING__ lfence #endif movq 0(%rsi),%r8 movq 8(%rsi),%r9 movq 16(%rsi),%r10 movq 24(%rsi),%r11 movq 32(%rsi),%r12 movq 40(%rsi),%r13 addq 48(%rsi),%r8 adcq 56(%rsi),%r9 adcq 64(%rsi),%r10 adcq 72(%rsi),%r11 adcq 80(%rsi),%r12 adcq 88(%rsi),%r13 movq %r8,32+0(%rsp) movq %r9,32+8(%rsp) movq %r10,32+16(%rsp) movq %r11,32+24(%rsp) movq %r12,32+32(%rsp) movq %r13,32+40(%rsp) movq 0(%rdx),%r8 movq 8(%rdx),%r9 movq 16(%rdx),%r10 movq 24(%rdx),%r11 movq 32(%rdx),%r12 movq 40(%rdx),%r13 addq 48(%rdx),%r8 adcq 56(%rdx),%r9 adcq 64(%rdx),%r10 adcq 72(%rdx),%r11 adcq 80(%rdx),%r12 adcq 88(%rdx),%r13 movq %r8,32+48(%rsp) movq %r9,32+56(%rsp) movq %r10,32+64(%rsp) movq %r11,32+72(%rsp) movq %r12,32+80(%rsp) movq %r13,32+88(%rsp) leaq 32+0(%rsp),%rsi leaq 32+48(%rsp),%rbx call __mulx_384 movq 0(%rsp),%rsi movq 8(%rsp),%rbx leaq -96(%rdi),%rdi #ifdef __SGX_LVI_HARDENING__ lfence #endif call __mulx_384 leaq 48+128(%rsi),%rsi leaq 48(%rbx),%rbx leaq 32(%rsp),%rdi call __mulx_384 movq 16(%rsp),%rsi leaq 32(%rsp),%rdx movq 24(%rsp),%rcx movq %rsi,%rdi #ifdef __SGX_LVI_HARDENING__ lfence 
#endif call __subx_mod_384x384 leaq 0(%rdi),%rsi leaq -96(%rdi),%rdx call __subx_mod_384x384 leaq -96(%rdi),%rsi leaq 32(%rsp),%rdx leaq -96(%rdi),%rdi call __subx_mod_384x384 leaq 136(%rsp),%r8 movq 0(%r8),%r15 .cfi_restore %r15 movq 8(%r8),%r14 .cfi_restore %r14 movq 16(%r8),%r13 .cfi_restore %r13 movq 24(%r8),%r12 .cfi_restore %r12 movq 32(%r8),%rbx .cfi_restore %rbx movq 40(%r8),%rbp .cfi_restore %rbp leaq 48(%r8),%rsp .cfi_adjust_cfa_offset -136-8*6 #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .globl _sqrx_382x .private_extern _sqrx_382x .p2align 5 _sqrx_382x: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa L$sqr_382x$1: pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 pushq %rbx .cfi_adjust_cfa_offset 8 .cfi_offset %rbx,-24 pushq %r12 .cfi_adjust_cfa_offset 8 .cfi_offset %r12,-32 pushq %r13 .cfi_adjust_cfa_offset 8 .cfi_offset %r13,-40 pushq %r14 .cfi_adjust_cfa_offset 8 .cfi_offset %r14,-48 pushq %r15 .cfi_adjust_cfa_offset 8 .cfi_offset %r15,-56 pushq %rsi .cfi_adjust_cfa_offset 8 movq %rdx,%rcx #ifdef __SGX_LVI_HARDENING__ lfence #endif movq 0(%rsi),%r14 movq 8(%rsi),%r15 movq 16(%rsi),%rax movq 24(%rsi),%rbx movq 32(%rsi),%rbp movq 40(%rsi),%rdx movq %r14,%r8 addq 48(%rsi),%r14 movq %r15,%r9 adcq 56(%rsi),%r15 movq %rax,%r10 adcq 64(%rsi),%rax movq %rbx,%r11 adcq 72(%rsi),%rbx movq %rbp,%r12 adcq 80(%rsi),%rbp movq %rdx,%r13 adcq 88(%rsi),%rdx movq %r14,0(%rdi) movq %r15,8(%rdi) movq %rax,16(%rdi) movq %rbx,24(%rdi) movq %rbp,32(%rdi) movq %rdx,40(%rdi) leaq 48(%rsi),%rdx leaq 48(%rdi),%rdi call __subx_mod_384_a_is_loaded leaq (%rdi),%rsi leaq -48(%rdi),%rbx leaq -48(%rdi),%rdi call __mulx_384 movq (%rsp),%rsi leaq 48(%rsi),%rbx leaq 96(%rdi),%rdi #ifdef __SGX_LVI_HARDENING__ lfence #endif call __mulx_384 movq 0(%rdi),%r8 movq 8(%rdi),%r9 movq 16(%rdi),%r10 movq 24(%rdi),%r11 movq 32(%rdi),%r12 movq 40(%rdi),%r13 movq 48(%rdi),%r14 movq 56(%rdi),%r15 movq 64(%rdi),%rax movq 72(%rdi),%rbx movq 80(%rdi),%rbp addq %r8,%r8 movq 88(%rdi),%rdx adcq %r9,%r9 movq %r8,0(%rdi) adcq %r10,%r10 movq %r9,8(%rdi) adcq %r11,%r11 movq %r10,16(%rdi) adcq %r12,%r12 movq %r11,24(%rdi) adcq %r13,%r13 movq %r12,32(%rdi) adcq %r14,%r14 movq %r13,40(%rdi) adcq %r15,%r15 movq %r14,48(%rdi) adcq %rax,%rax movq %r15,56(%rdi) adcq %rbx,%rbx movq %rax,64(%rdi) adcq %rbp,%rbp movq %rbx,72(%rdi) adcq %rdx,%rdx movq %rbp,80(%rdi) movq %rdx,88(%rdi) movq 8(%rsp),%r15 .cfi_restore %r15 movq 16(%rsp),%r14 .cfi_restore %r14 movq 24(%rsp),%r13 .cfi_restore %r13 movq 32(%rsp),%r12 .cfi_restore %r12 movq 40(%rsp),%rbx .cfi_restore %rbx movq 48(%rsp),%rbp .cfi_restore %rbp leaq 56(%rsp),%rsp .cfi_adjust_cfa_offset -8*7 #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .globl _mulx_384 .private_extern _mulx_384 .p2align 5 _mulx_384: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa L$mul_384$1: pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 pushq %rbx .cfi_adjust_cfa_offset 8 .cfi_offset %rbx,-24 pushq %r12 .cfi_adjust_cfa_offset 8 .cfi_offset %r12,-32 pushq %r13 .cfi_adjust_cfa_offset 8 .cfi_offset %r13,-40 pushq %r14 .cfi_adjust_cfa_offset 8 .cfi_offset %r14,-48 pushq %r15 .cfi_adjust_cfa_offset 8 .cfi_offset %r15,-56 movq %rdx,%rbx #ifdef __SGX_LVI_HARDENING__ lfence #endif call __mulx_384 movq 0(%rsp),%r15 .cfi_restore %r15 movq 8(%rsp),%r14 .cfi_restore %r14 movq 16(%rsp),%r13 .cfi_restore %r13 movq 24(%rsp),%r12 .cfi_restore %r12 movq 32(%rsp),%rbx .cfi_restore %rbx movq 40(%rsp),%rbp .cfi_restore %rbp 
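/*
 * __mulx_384 below computes a full 384x384 -> 768-bit schoolbook product
 * using BMI2/ADX: mulx yields each 64x64 partial product without
 * clobbering flags, and adcx/adox advance two independent carry chains
 * (CF and OF) side by side, so the six-limb inner rows pipeline without
 * carry-chain stalls.
 */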
leaq 48(%rsp),%rsp .cfi_adjust_cfa_offset -48 #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .p2align 5 __mulx_384: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa movq 0(%rbx),%rdx movq 0(%rsi),%r14 movq 8(%rsi),%r15 movq 16(%rsi),%r10 movq 24(%rsi),%r11 movq 32(%rsi),%r12 movq 40(%rsi),%r13 leaq -128(%rsi),%rsi mulxq %r14,%r9,%rcx xorq %rbp,%rbp mulxq %r15,%r8,%rax adcxq %rcx,%r8 movq %r9,0(%rdi) mulxq %r10,%r9,%rcx adcxq %rax,%r9 mulxq %r11,%r10,%rax adcxq %rcx,%r10 mulxq %r12,%r11,%rcx adcxq %rax,%r11 mulxq %r13,%r12,%r13 movq 8(%rbx),%rdx adcxq %rcx,%r12 adcxq %rbp,%r13 mulxq %r14,%rax,%rcx adcxq %r8,%rax adoxq %rcx,%r9 movq %rax,8(%rdi) mulxq %r15,%r8,%rcx adcxq %r9,%r8 adoxq %rcx,%r10 mulxq 128+16(%rsi),%r9,%rax adcxq %r10,%r9 adoxq %rax,%r11 mulxq 128+24(%rsi),%r10,%rcx adcxq %r11,%r10 adoxq %rcx,%r12 mulxq 128+32(%rsi),%r11,%rax adcxq %r12,%r11 adoxq %r13,%rax mulxq 128+40(%rsi),%r12,%r13 movq 16(%rbx),%rdx adcxq %rax,%r12 adoxq %rbp,%r13 adcxq %rbp,%r13 mulxq %r14,%rax,%rcx adcxq %r8,%rax adoxq %rcx,%r9 movq %rax,16(%rdi) mulxq %r15,%r8,%rcx adcxq %r9,%r8 adoxq %rcx,%r10 mulxq 128+16(%rsi),%r9,%rax adcxq %r10,%r9 adoxq %rax,%r11 mulxq 128+24(%rsi),%r10,%rcx adcxq %r11,%r10 adoxq %rcx,%r12 mulxq 128+32(%rsi),%r11,%rax adcxq %r12,%r11 adoxq %r13,%rax mulxq 128+40(%rsi),%r12,%r13 movq 24(%rbx),%rdx adcxq %rax,%r12 adoxq %rbp,%r13 adcxq %rbp,%r13 mulxq %r14,%rax,%rcx adcxq %r8,%rax adoxq %rcx,%r9 movq %rax,24(%rdi) mulxq %r15,%r8,%rcx adcxq %r9,%r8 adoxq %rcx,%r10 mulxq 128+16(%rsi),%r9,%rax adcxq %r10,%r9 adoxq %rax,%r11 mulxq 128+24(%rsi),%r10,%rcx adcxq %r11,%r10 adoxq %rcx,%r12 mulxq 128+32(%rsi),%r11,%rax adcxq %r12,%r11 adoxq %r13,%rax mulxq 128+40(%rsi),%r12,%r13 movq 32(%rbx),%rdx adcxq %rax,%r12 adoxq %rbp,%r13 adcxq %rbp,%r13 mulxq %r14,%rax,%rcx adcxq %r8,%rax adoxq %rcx,%r9 movq %rax,32(%rdi) mulxq %r15,%r8,%rcx adcxq %r9,%r8 adoxq %rcx,%r10 mulxq 128+16(%rsi),%r9,%rax adcxq %r10,%r9 adoxq %rax,%r11 mulxq 128+24(%rsi),%r10,%rcx adcxq %r11,%r10 adoxq %rcx,%r12 mulxq 128+32(%rsi),%r11,%rax adcxq %r12,%r11 adoxq %r13,%rax mulxq 128+40(%rsi),%r12,%r13 movq 40(%rbx),%rdx adcxq %rax,%r12 adoxq %rbp,%r13 adcxq %rbp,%r13 mulxq %r14,%rax,%rcx adcxq %r8,%rax adoxq %rcx,%r9 movq %rax,40(%rdi) mulxq %r15,%r8,%rcx adcxq %r9,%r8 adoxq %rcx,%r10 mulxq 128+16(%rsi),%r9,%rax adcxq %r10,%r9 adoxq %rax,%r11 mulxq 128+24(%rsi),%r10,%rcx adcxq %r11,%r10 adoxq %rcx,%r12 mulxq 128+32(%rsi),%r11,%rax adcxq %r12,%r11 adoxq %r13,%rax mulxq 128+40(%rsi),%r12,%r13 movq %rax,%rdx adcxq %rax,%r12 adoxq %rbp,%r13 adcxq %rbp,%r13 movq %r8,48(%rdi) movq %r9,56(%rdi) movq %r10,64(%rdi) movq %r11,72(%rdi) movq %r12,80(%rdi) movq %r13,88(%rdi) #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .globl _sqrx_384 .private_extern _sqrx_384 .p2align 5 _sqrx_384: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa L$sqr_384$1: pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 pushq %rbx .cfi_adjust_cfa_offset 8 .cfi_offset %rbx,-24 pushq %r12 .cfi_adjust_cfa_offset 8 .cfi_offset %r12,-32 pushq %r13 .cfi_adjust_cfa_offset 8 .cfi_offset %r13,-40 pushq %r14 .cfi_adjust_cfa_offset 8 .cfi_offset %r14,-48 pushq %r15 .cfi_adjust_cfa_offset 8 .cfi_offset %r15,-56 pushq %rdi .cfi_adjust_cfa_offset 8 #ifdef __SGX_LVI_HARDENING__ lfence #endif call __sqrx_384 movq 8(%rsp),%r15 .cfi_restore %r15 movq 16(%rsp),%r14 .cfi_restore %r14 movq 24(%rsp),%r13 .cfi_restore %r13 movq 32(%rsp),%r12 .cfi_restore %r12 movq 40(%rsp),%rbx 
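/*
 * __sqrx_384 below exploits the symmetry of squaring: the fifteen
 * off-diagonal products a[i]*a[j] (i < j) are computed once and doubled
 * with an adcx chain, then the six diagonal squares a[i]^2 are folded in
 * on the adox chain, roughly:
 *
 *     acc = sum_{i<j} a[i]*a[j] << 64*(i+j)
 *     acc = 2*acc + sum_i a[i]^2 << 128*i
 *
 * cutting the multiplication count from 36 to 21 versus a generic
 * 384x384 product.
 */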
.cfi_restore %rbx movq 48(%rsp),%rbp .cfi_restore %rbp leaq 56(%rsp),%rsp .cfi_adjust_cfa_offset -56 #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .p2align 5 __sqrx_384: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa movq 0(%rsi),%rdx movq 8(%rsi),%r14 movq 16(%rsi),%r15 movq 24(%rsi),%rcx movq 32(%rsi),%rbx mulxq %r14,%r8,%rdi movq 40(%rsi),%rbp mulxq %r15,%r9,%rax addq %rdi,%r9 mulxq %rcx,%r10,%rdi adcq %rax,%r10 mulxq %rbx,%r11,%rax adcq %rdi,%r11 mulxq %rbp,%r12,%r13 movq %r14,%rdx adcq %rax,%r12 adcq $0,%r13 xorq %r14,%r14 mulxq %r15,%rdi,%rax adcxq %rdi,%r10 adoxq %rax,%r11 mulxq %rcx,%rdi,%rax adcxq %rdi,%r11 adoxq %rax,%r12 mulxq %rbx,%rdi,%rax adcxq %rdi,%r12 adoxq %rax,%r13 mulxq %rbp,%rdi,%rax movq %r15,%rdx adcxq %rdi,%r13 adoxq %r14,%rax adcxq %rax,%r14 xorq %r15,%r15 mulxq %rcx,%rdi,%rax adcxq %rdi,%r12 adoxq %rax,%r13 mulxq %rbx,%rdi,%rax adcxq %rdi,%r13 adoxq %rax,%r14 mulxq %rbp,%rdi,%rax movq %rcx,%rdx adcxq %rdi,%r14 adoxq %r15,%rax adcxq %rax,%r15 xorq %rcx,%rcx mulxq %rbx,%rdi,%rax adcxq %rdi,%r14 adoxq %rax,%r15 mulxq %rbp,%rdi,%rax movq %rbx,%rdx adcxq %rdi,%r15 adoxq %rcx,%rax adcxq %rax,%rcx mulxq %rbp,%rdi,%rbx movq 0(%rsi),%rdx addq %rdi,%rcx movq 8(%rsp),%rdi adcq $0,%rbx xorq %rbp,%rbp adcxq %r8,%r8 adcxq %r9,%r9 adcxq %r10,%r10 adcxq %r11,%r11 adcxq %r12,%r12 mulxq %rdx,%rdx,%rax movq %rdx,0(%rdi) movq 8(%rsi),%rdx adoxq %rax,%r8 movq %r8,8(%rdi) mulxq %rdx,%r8,%rax movq 16(%rsi),%rdx adoxq %r8,%r9 adoxq %rax,%r10 movq %r9,16(%rdi) movq %r10,24(%rdi) mulxq %rdx,%r8,%r9 movq 24(%rsi),%rdx adoxq %r8,%r11 adoxq %r9,%r12 adcxq %r13,%r13 adcxq %r14,%r14 movq %r11,32(%rdi) movq %r12,40(%rdi) mulxq %rdx,%r8,%r9 movq 32(%rsi),%rdx adoxq %r8,%r13 adoxq %r9,%r14 adcxq %r15,%r15 adcxq %rcx,%rcx movq %r13,48(%rdi) movq %r14,56(%rdi) mulxq %rdx,%r8,%r9 movq 40(%rsi),%rdx adoxq %r8,%r15 adoxq %r9,%rcx adcxq %rbx,%rbx adcxq %rbp,%rbp movq %r15,64(%rdi) movq %rcx,72(%rdi) mulxq %rdx,%r8,%r9 adoxq %r8,%rbx adoxq %r9,%rbp movq %rbx,80(%rdi) movq %rbp,88(%rdi) #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .globl _redcx_mont_384 .private_extern _redcx_mont_384 .p2align 5 _redcx_mont_384: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa L$redc_mont_384$1: pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 pushq %rbx .cfi_adjust_cfa_offset 8 .cfi_offset %rbx,-24 pushq %r12 .cfi_adjust_cfa_offset 8 .cfi_offset %r12,-32 pushq %r13 .cfi_adjust_cfa_offset 8 .cfi_offset %r13,-40 pushq %r14 .cfi_adjust_cfa_offset 8 .cfi_offset %r14,-48 pushq %r15 .cfi_adjust_cfa_offset 8 .cfi_offset %r15,-56 subq $8,%rsp .cfi_adjust_cfa_offset 8 movq %rdx,%rbx #ifdef __SGX_LVI_HARDENING__ lfence #endif call __mulx_by_1_mont_384 call __redx_tail_mont_384 movq 8(%rsp),%r15 .cfi_restore %r15 movq 16(%rsp),%r14 .cfi_restore %r14 movq 24(%rsp),%r13 .cfi_restore %r13 movq 32(%rsp),%r12 .cfi_restore %r12 movq 40(%rsp),%rbx .cfi_restore %rbx movq 48(%rsp),%rbp .cfi_restore %rbp leaq 56(%rsp),%rsp .cfi_adjust_cfa_offset -56 #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .globl _fromx_mont_384 .private_extern _fromx_mont_384 .p2align 5 _fromx_mont_384: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa L$from_mont_384$1: pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 pushq %rbx .cfi_adjust_cfa_offset 8 .cfi_offset %rbx,-24 pushq %r12 .cfi_adjust_cfa_offset 8 .cfi_offset %r12,-32 pushq %r13 .cfi_adjust_cfa_offset 8 .cfi_offset %r13,-40 pushq %r14 
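/*
 * fromx_mont_384 converts out of the Montgomery domain:
 * __mulx_by_1_mont_384 performs six REDC rounds (multiplication by 1
 * followed by division by 2^384 modulo p), and the sbb/cmovc sequence
 * that follows subtracts p exactly once if needed, keeping the
 * conversion branch-free.
 */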
.cfi_adjust_cfa_offset 8 .cfi_offset %r14,-48 pushq %r15 .cfi_adjust_cfa_offset 8 .cfi_offset %r15,-56 subq $8,%rsp .cfi_adjust_cfa_offset 8 movq %rdx,%rbx #ifdef __SGX_LVI_HARDENING__ lfence #endif call __mulx_by_1_mont_384 movq %r14,%rax movq %r15,%rcx movq %r8,%rdx movq %r9,%rbp subq 0(%rbx),%r14 sbbq 8(%rbx),%r15 movq %r10,%r13 sbbq 16(%rbx),%r8 sbbq 24(%rbx),%r9 sbbq 32(%rbx),%r10 movq %r11,%rsi sbbq 40(%rbx),%r11 cmovcq %rax,%r14 cmovcq %rcx,%r15 cmovcq %rdx,%r8 movq %r14,0(%rdi) cmovcq %rbp,%r9 movq %r15,8(%rdi) cmovcq %r13,%r10 movq %r8,16(%rdi) cmovcq %rsi,%r11 movq %r9,24(%rdi) movq %r10,32(%rdi) movq %r11,40(%rdi) movq 8(%rsp),%r15 .cfi_restore %r15 movq 16(%rsp),%r14 .cfi_restore %r14 movq 24(%rsp),%r13 .cfi_restore %r13 movq 32(%rsp),%r12 .cfi_restore %r12 movq 40(%rsp),%rbx .cfi_restore %rbx movq 48(%rsp),%rbp .cfi_restore %rbp leaq 56(%rsp),%rsp .cfi_adjust_cfa_offset -56 #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .p2align 5 __mulx_by_1_mont_384: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa movq 0(%rsi),%r8 movq %rcx,%rdx movq 8(%rsi),%r9 movq 16(%rsi),%r10 movq 24(%rsi),%r11 movq 32(%rsi),%r12 movq 40(%rsi),%r13 imulq %r8,%rdx xorq %r14,%r14 mulxq 0(%rbx),%rax,%rbp adcxq %rax,%r8 adoxq %rbp,%r9 mulxq 8(%rbx),%rax,%rbp adcxq %rax,%r9 adoxq %rbp,%r10 mulxq 16(%rbx),%rax,%rbp adcxq %rax,%r10 adoxq %rbp,%r11 mulxq 24(%rbx),%rax,%rbp adcxq %rax,%r11 adoxq %rbp,%r12 mulxq 32(%rbx),%rax,%rbp adcxq %rax,%r12 adoxq %rbp,%r13 mulxq 40(%rbx),%rax,%rbp movq %rcx,%rdx adcxq %rax,%r13 adoxq %r14,%rbp adcxq %rbp,%r14 imulq %r9,%rdx xorq %r15,%r15 mulxq 0(%rbx),%rax,%rbp adcxq %rax,%r9 adoxq %rbp,%r10 mulxq 8(%rbx),%rax,%rbp adcxq %rax,%r10 adoxq %rbp,%r11 mulxq 16(%rbx),%rax,%rbp adcxq %rax,%r11 adoxq %rbp,%r12 mulxq 24(%rbx),%rax,%rbp adcxq %rax,%r12 adoxq %rbp,%r13 mulxq 32(%rbx),%rax,%rbp adcxq %rax,%r13 adoxq %rbp,%r14 mulxq 40(%rbx),%rax,%rbp movq %rcx,%rdx adcxq %rax,%r14 adoxq %r15,%rbp adcxq %rbp,%r15 imulq %r10,%rdx xorq %r8,%r8 mulxq 0(%rbx),%rax,%rbp adcxq %rax,%r10 adoxq %rbp,%r11 mulxq 8(%rbx),%rax,%rbp adcxq %rax,%r11 adoxq %rbp,%r12 mulxq 16(%rbx),%rax,%rbp adcxq %rax,%r12 adoxq %rbp,%r13 mulxq 24(%rbx),%rax,%rbp adcxq %rax,%r13 adoxq %rbp,%r14 mulxq 32(%rbx),%rax,%rbp adcxq %rax,%r14 adoxq %rbp,%r15 mulxq 40(%rbx),%rax,%rbp movq %rcx,%rdx adcxq %rax,%r15 adoxq %r8,%rbp adcxq %rbp,%r8 imulq %r11,%rdx xorq %r9,%r9 mulxq 0(%rbx),%rax,%rbp adcxq %rax,%r11 adoxq %rbp,%r12 mulxq 8(%rbx),%rax,%rbp adcxq %rax,%r12 adoxq %rbp,%r13 mulxq 16(%rbx),%rax,%rbp adcxq %rax,%r13 adoxq %rbp,%r14 mulxq 24(%rbx),%rax,%rbp adcxq %rax,%r14 adoxq %rbp,%r15 mulxq 32(%rbx),%rax,%rbp adcxq %rax,%r15 adoxq %rbp,%r8 mulxq 40(%rbx),%rax,%rbp movq %rcx,%rdx adcxq %rax,%r8 adoxq %r9,%rbp adcxq %rbp,%r9 imulq %r12,%rdx xorq %r10,%r10 mulxq 0(%rbx),%rax,%rbp adcxq %rax,%r12 adoxq %rbp,%r13 mulxq 8(%rbx),%rax,%rbp adcxq %rax,%r13 adoxq %rbp,%r14 mulxq 16(%rbx),%rax,%rbp adcxq %rax,%r14 adoxq %rbp,%r15 mulxq 24(%rbx),%rax,%rbp adcxq %rax,%r15 adoxq %rbp,%r8 mulxq 32(%rbx),%rax,%rbp adcxq %rax,%r8 adoxq %rbp,%r9 mulxq 40(%rbx),%rax,%rbp movq %rcx,%rdx adcxq %rax,%r9 adoxq %r10,%rbp adcxq %rbp,%r10 imulq %r13,%rdx xorq %r11,%r11 mulxq 0(%rbx),%rax,%rbp adcxq %rax,%r13 adoxq %rbp,%r14 mulxq 8(%rbx),%rax,%rbp adcxq %rax,%r14 adoxq %rbp,%r15 mulxq 16(%rbx),%rax,%rbp adcxq %rax,%r15 adoxq %rbp,%r8 mulxq 24(%rbx),%rax,%rbp adcxq %rax,%r8 adoxq %rbp,%r9 mulxq 32(%rbx),%rax,%rbp adcxq %rax,%r9 adoxq %rbp,%r10 mulxq 40(%rbx),%rax,%rbp movq %rcx,%rdx adcxq 
%rax,%r10 adoxq %r11,%rbp adcxq %rbp,%r11 #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .p2align 5 __redx_tail_mont_384: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa addq 48(%rsi),%r14 movq %r14,%rax adcq 56(%rsi),%r15 adcq 64(%rsi),%r8 adcq 72(%rsi),%r9 movq %r15,%rcx adcq 80(%rsi),%r10 adcq 88(%rsi),%r11 sbbq %r12,%r12 movq %r8,%rdx movq %r9,%rbp subq 0(%rbx),%r14 sbbq 8(%rbx),%r15 movq %r10,%r13 sbbq 16(%rbx),%r8 sbbq 24(%rbx),%r9 sbbq 32(%rbx),%r10 movq %r11,%rsi sbbq 40(%rbx),%r11 sbbq $0,%r12 cmovcq %rax,%r14 cmovcq %rcx,%r15 cmovcq %rdx,%r8 movq %r14,0(%rdi) cmovcq %rbp,%r9 movq %r15,8(%rdi) cmovcq %r13,%r10 movq %r8,16(%rdi) cmovcq %rsi,%r11 movq %r9,24(%rdi) movq %r10,32(%rdi) movq %r11,40(%rdi) #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .globl _sgn0x_pty_mont_384 .private_extern _sgn0x_pty_mont_384 .p2align 5 _sgn0x_pty_mont_384: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa L$sgn0_pty_mont_384$1: pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 pushq %rbx .cfi_adjust_cfa_offset 8 .cfi_offset %rbx,-24 pushq %r12 .cfi_adjust_cfa_offset 8 .cfi_offset %r12,-32 pushq %r13 .cfi_adjust_cfa_offset 8 .cfi_offset %r13,-40 pushq %r14 .cfi_adjust_cfa_offset 8 .cfi_offset %r14,-48 pushq %r15 .cfi_adjust_cfa_offset 8 .cfi_offset %r15,-56 subq $8,%rsp .cfi_adjust_cfa_offset 8 movq %rsi,%rbx leaq 0(%rdi),%rsi movq %rdx,%rcx #ifdef __SGX_LVI_HARDENING__ lfence #endif call __mulx_by_1_mont_384 xorq %rax,%rax movq %r14,%r13 addq %r14,%r14 adcq %r15,%r15 adcq %r8,%r8 adcq %r9,%r9 adcq %r10,%r10 adcq %r11,%r11 adcq $0,%rax subq 0(%rbx),%r14 sbbq 8(%rbx),%r15 sbbq 16(%rbx),%r8 sbbq 24(%rbx),%r9 sbbq 32(%rbx),%r10 sbbq 40(%rbx),%r11 sbbq $0,%rax notq %rax andq $1,%r13 andq $2,%rax orq %r13,%rax movq 8(%rsp),%r15 .cfi_restore %r15 movq 16(%rsp),%r14 .cfi_restore %r14 movq 24(%rsp),%r13 .cfi_restore %r13 movq 32(%rsp),%r12 .cfi_restore %r12 movq 40(%rsp),%rbx .cfi_restore %rbx movq 48(%rsp),%rbp .cfi_restore %rbp leaq 56(%rsp),%rsp .cfi_adjust_cfa_offset -56 #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .globl _sgn0x_pty_mont_384x .private_extern _sgn0x_pty_mont_384x .p2align 5 _sgn0x_pty_mont_384x: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa L$sgn0_pty_mont_384x$1: pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 pushq %rbx .cfi_adjust_cfa_offset 8 .cfi_offset %rbx,-24 pushq %r12 .cfi_adjust_cfa_offset 8 .cfi_offset %r12,-32 pushq %r13 .cfi_adjust_cfa_offset 8 .cfi_offset %r13,-40 pushq %r14 .cfi_adjust_cfa_offset 8 .cfi_offset %r14,-48 pushq %r15 .cfi_adjust_cfa_offset 8 .cfi_offset %r15,-56 subq $8,%rsp .cfi_adjust_cfa_offset 8 movq %rsi,%rbx leaq 48(%rdi),%rsi movq %rdx,%rcx #ifdef __SGX_LVI_HARDENING__ lfence #endif call __mulx_by_1_mont_384 movq %r14,%r12 orq %r15,%r14 orq %r8,%r14 orq %r9,%r14 orq %r10,%r14 orq %r11,%r14 leaq 0(%rdi),%rsi xorq %rdi,%rdi movq %r12,%r13 addq %r12,%r12 adcq %r15,%r15 adcq %r8,%r8 adcq %r9,%r9 adcq %r10,%r10 adcq %r11,%r11 adcq $0,%rdi subq 0(%rbx),%r12 sbbq 8(%rbx),%r15 sbbq 16(%rbx),%r8 sbbq 24(%rbx),%r9 sbbq 32(%rbx),%r10 sbbq 40(%rbx),%r11 sbbq $0,%rdi movq %r14,0(%rsp) notq %rdi andq $1,%r13 andq $2,%rdi orq %r13,%rdi call __mulx_by_1_mont_384 movq %r14,%r12 orq %r15,%r14 orq %r8,%r14 orq %r9,%r14 orq %r10,%r14 orq %r11,%r14 xorq %rax,%rax movq %r12,%r13 addq %r12,%r12 adcq %r15,%r15 adcq %r8,%r8 adcq %r9,%r9 adcq %r10,%r10 adcq %r11,%r11 adcq $0,%rax subq 0(%rbx),%r12 sbbq 8(%rbx),%r15 
sbbq 16(%rbx),%r8 sbbq 24(%rbx),%r9 sbbq 32(%rbx),%r10 sbbq 40(%rbx),%r11 sbbq $0,%rax movq 0(%rsp),%r12 notq %rax testq %r14,%r14 cmovzq %rdi,%r13 testq %r12,%r12 cmovnzq %rdi,%rax andq $1,%r13 andq $2,%rax orq %r13,%rax movq 8(%rsp),%r15 .cfi_restore %r15 movq 16(%rsp),%r14 .cfi_restore %r14 movq 24(%rsp),%r13 .cfi_restore %r13 movq 32(%rsp),%r12 .cfi_restore %r12 movq 40(%rsp),%rbx .cfi_restore %rbx movq 48(%rsp),%rbp .cfi_restore %rbp leaq 56(%rsp),%rsp .cfi_adjust_cfa_offset -56 #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .globl _mulx_mont_384 .private_extern _mulx_mont_384 .p2align 5 _mulx_mont_384: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa L$mul_mont_384$1: pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 pushq %rbx .cfi_adjust_cfa_offset 8 .cfi_offset %rbx,-24 pushq %r12 .cfi_adjust_cfa_offset 8 .cfi_offset %r12,-32 pushq %r13 .cfi_adjust_cfa_offset 8 .cfi_offset %r13,-40 pushq %r14 .cfi_adjust_cfa_offset 8 .cfi_offset %r14,-48 pushq %r15 .cfi_adjust_cfa_offset 8 .cfi_offset %r15,-56 leaq -24(%rsp),%rsp .cfi_adjust_cfa_offset 8*3 movq %rdx,%rbx #ifdef __SGX_LVI_HARDENING__ lfence #endif movq 0(%rdx),%rdx movq 0(%rsi),%r14 movq 8(%rsi),%r15 movq 16(%rsi),%rax movq 24(%rsi),%r12 movq %rdi,16(%rsp) movq 32(%rsi),%rdi movq 40(%rsi),%rbp leaq -128(%rsi),%rsi leaq -128(%rcx),%rcx movq %r8,(%rsp) mulxq %r14,%r8,%r9 call __mulx_mont_384 movq 24(%rsp),%r15 .cfi_restore %r15 movq 32(%rsp),%r14 .cfi_restore %r14 movq 40(%rsp),%r13 .cfi_restore %r13 movq 48(%rsp),%r12 .cfi_restore %r12 movq 56(%rsp),%rbx .cfi_restore %rbx movq 64(%rsp),%rbp .cfi_restore %rbp leaq 72(%rsp),%rsp .cfi_adjust_cfa_offset -8*9 #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .p2align 5 __mulx_mont_384: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa mulxq %r15,%r14,%r10 mulxq %rax,%r15,%r11 addq %r14,%r9 mulxq %r12,%rax,%r12 adcq %r15,%r10 mulxq %rdi,%rdi,%r13 adcq %rax,%r11 mulxq %rbp,%rbp,%r14 movq 8(%rbx),%rdx adcq %rdi,%r12 adcq %rbp,%r13 adcq $0,%r14 xorq %r15,%r15 movq %r8,16(%rsp) imulq 8(%rsp),%r8 xorq %rax,%rax mulxq 0+128(%rsi),%rdi,%rbp adoxq %rdi,%r9 adcxq %rbp,%r10 mulxq 8+128(%rsi),%rdi,%rbp adoxq %rdi,%r10 adcxq %rbp,%r11 mulxq 16+128(%rsi),%rdi,%rbp adoxq %rdi,%r11 adcxq %rbp,%r12 mulxq 24+128(%rsi),%rdi,%rbp adoxq %rdi,%r12 adcxq %rbp,%r13 mulxq 32+128(%rsi),%rdi,%rbp adoxq %rdi,%r13 adcxq %rbp,%r14 mulxq 40+128(%rsi),%rdi,%rbp movq %r8,%rdx adoxq %rdi,%r14 adcxq %rbp,%r15 adoxq %rax,%r15 adoxq %rax,%rax xorq %r8,%r8 mulxq 0+128(%rcx),%rdi,%rbp adcxq 16(%rsp),%rdi adoxq %rbp,%r9 mulxq 8+128(%rcx),%rdi,%rbp adcxq %rdi,%r9 adoxq %rbp,%r10 mulxq 16+128(%rcx),%rdi,%rbp adcxq %rdi,%r10 adoxq %rbp,%r11 mulxq 24+128(%rcx),%rdi,%rbp adcxq %rdi,%r11 adoxq %rbp,%r12 mulxq 32+128(%rcx),%rdi,%rbp adcxq %rdi,%r12 adoxq %rbp,%r13 mulxq 40+128(%rcx),%rdi,%rbp movq 16(%rbx),%rdx adcxq %rdi,%r13 adoxq %rbp,%r14 adcxq %r8,%r14 adoxq %r8,%r15 adcxq %r8,%r15 adoxq %r8,%rax adcxq %r8,%rax movq %r9,16(%rsp) imulq 8(%rsp),%r9 xorq %r8,%r8 mulxq 0+128(%rsi),%rdi,%rbp adoxq %rdi,%r10 adcxq %rbp,%r11 mulxq 8+128(%rsi),%rdi,%rbp adoxq %rdi,%r11 adcxq %rbp,%r12 mulxq 16+128(%rsi),%rdi,%rbp adoxq %rdi,%r12 adcxq %rbp,%r13 mulxq 24+128(%rsi),%rdi,%rbp adoxq %rdi,%r13 adcxq %rbp,%r14 mulxq 32+128(%rsi),%rdi,%rbp adoxq %rdi,%r14 adcxq %rbp,%r15 mulxq 40+128(%rsi),%rdi,%rbp movq %r9,%rdx adoxq %rdi,%r15 adcxq %rbp,%rax adoxq %r8,%rax adoxq %r8,%r8 xorq %r9,%r9 mulxq 0+128(%rcx),%rdi,%rbp adcxq 16(%rsp),%rdi adoxq 
%rbp,%r10 mulxq 8+128(%rcx),%rdi,%rbp adcxq %rdi,%r10 adoxq %rbp,%r11 mulxq 16+128(%rcx),%rdi,%rbp adcxq %rdi,%r11 adoxq %rbp,%r12 mulxq 24+128(%rcx),%rdi,%rbp adcxq %rdi,%r12 adoxq %rbp,%r13 mulxq 32+128(%rcx),%rdi,%rbp adcxq %rdi,%r13 adoxq %rbp,%r14 mulxq 40+128(%rcx),%rdi,%rbp movq 24(%rbx),%rdx adcxq %rdi,%r14 adoxq %rbp,%r15 adcxq %r9,%r15 adoxq %r9,%rax adcxq %r9,%rax adoxq %r9,%r8 adcxq %r9,%r8 movq %r10,16(%rsp) imulq 8(%rsp),%r10 xorq %r9,%r9 mulxq 0+128(%rsi),%rdi,%rbp adoxq %rdi,%r11 adcxq %rbp,%r12 mulxq 8+128(%rsi),%rdi,%rbp adoxq %rdi,%r12 adcxq %rbp,%r13 mulxq 16+128(%rsi),%rdi,%rbp adoxq %rdi,%r13 adcxq %rbp,%r14 mulxq 24+128(%rsi),%rdi,%rbp adoxq %rdi,%r14 adcxq %rbp,%r15 mulxq 32+128(%rsi),%rdi,%rbp adoxq %rdi,%r15 adcxq %rbp,%rax mulxq 40+128(%rsi),%rdi,%rbp movq %r10,%rdx adoxq %rdi,%rax adcxq %rbp,%r8 adoxq %r9,%r8 adoxq %r9,%r9 xorq %r10,%r10 mulxq 0+128(%rcx),%rdi,%rbp adcxq 16(%rsp),%rdi adoxq %rbp,%r11 mulxq 8+128(%rcx),%rdi,%rbp adcxq %rdi,%r11 adoxq %rbp,%r12 mulxq 16+128(%rcx),%rdi,%rbp adcxq %rdi,%r12 adoxq %rbp,%r13 mulxq 24+128(%rcx),%rdi,%rbp adcxq %rdi,%r13 adoxq %rbp,%r14 mulxq 32+128(%rcx),%rdi,%rbp adcxq %rdi,%r14 adoxq %rbp,%r15 mulxq 40+128(%rcx),%rdi,%rbp movq 32(%rbx),%rdx adcxq %rdi,%r15 adoxq %rbp,%rax adcxq %r10,%rax adoxq %r10,%r8 adcxq %r10,%r8 adoxq %r10,%r9 adcxq %r10,%r9 movq %r11,16(%rsp) imulq 8(%rsp),%r11 xorq %r10,%r10 mulxq 0+128(%rsi),%rdi,%rbp adoxq %rdi,%r12 adcxq %rbp,%r13 mulxq 8+128(%rsi),%rdi,%rbp adoxq %rdi,%r13 adcxq %rbp,%r14 mulxq 16+128(%rsi),%rdi,%rbp adoxq %rdi,%r14 adcxq %rbp,%r15 mulxq 24+128(%rsi),%rdi,%rbp adoxq %rdi,%r15 adcxq %rbp,%rax mulxq 32+128(%rsi),%rdi,%rbp adoxq %rdi,%rax adcxq %rbp,%r8 mulxq 40+128(%rsi),%rdi,%rbp movq %r11,%rdx adoxq %rdi,%r8 adcxq %rbp,%r9 adoxq %r10,%r9 adoxq %r10,%r10 xorq %r11,%r11 mulxq 0+128(%rcx),%rdi,%rbp adcxq 16(%rsp),%rdi adoxq %rbp,%r12 mulxq 8+128(%rcx),%rdi,%rbp adcxq %rdi,%r12 adoxq %rbp,%r13 mulxq 16+128(%rcx),%rdi,%rbp adcxq %rdi,%r13 adoxq %rbp,%r14 mulxq 24+128(%rcx),%rdi,%rbp adcxq %rdi,%r14 adoxq %rbp,%r15 mulxq 32+128(%rcx),%rdi,%rbp adcxq %rdi,%r15 adoxq %rbp,%rax mulxq 40+128(%rcx),%rdi,%rbp movq 40(%rbx),%rdx adcxq %rdi,%rax adoxq %rbp,%r8 adcxq %r11,%r8 adoxq %r11,%r9 adcxq %r11,%r9 adoxq %r11,%r10 adcxq %r11,%r10 movq %r12,16(%rsp) imulq 8(%rsp),%r12 xorq %r11,%r11 mulxq 0+128(%rsi),%rdi,%rbp adoxq %rdi,%r13 adcxq %rbp,%r14 mulxq 8+128(%rsi),%rdi,%rbp adoxq %rdi,%r14 adcxq %rbp,%r15 mulxq 16+128(%rsi),%rdi,%rbp adoxq %rdi,%r15 adcxq %rbp,%rax mulxq 24+128(%rsi),%rdi,%rbp adoxq %rdi,%rax adcxq %rbp,%r8 mulxq 32+128(%rsi),%rdi,%rbp adoxq %rdi,%r8 adcxq %rbp,%r9 mulxq 40+128(%rsi),%rdi,%rbp movq %r12,%rdx adoxq %rdi,%r9 adcxq %rbp,%r10 adoxq %r11,%r10 adoxq %r11,%r11 xorq %r12,%r12 mulxq 0+128(%rcx),%rdi,%rbp adcxq 16(%rsp),%rdi adoxq %rbp,%r13 mulxq 8+128(%rcx),%rdi,%rbp adcxq %rdi,%r13 adoxq %rbp,%r14 mulxq 16+128(%rcx),%rdi,%rbp adcxq %rdi,%r14 adoxq %rbp,%r15 mulxq 24+128(%rcx),%rdi,%rbp adcxq %rdi,%r15 adoxq %rbp,%rax mulxq 32+128(%rcx),%rdi,%rbp adcxq %rdi,%rax adoxq %rbp,%r8 mulxq 40+128(%rcx),%rdi,%rbp movq %r13,%rdx adcxq %rdi,%r8 adoxq %rbp,%r9 adcxq %r12,%r9 adoxq %r12,%r10 adcxq %r12,%r10 adoxq %r12,%r11 adcxq %r12,%r11 imulq 8(%rsp),%rdx movq 24(%rsp),%rbx xorq %r12,%r12 mulxq 0+128(%rcx),%rdi,%rbp adcxq %rdi,%r13 adoxq %rbp,%r14 mulxq 8+128(%rcx),%rdi,%rbp adcxq %rdi,%r14 adoxq %rbp,%r15 mulxq 16+128(%rcx),%rdi,%rbp adcxq %rdi,%r15 adoxq %rbp,%rax mulxq 24+128(%rcx),%rdi,%rbp adcxq %rdi,%rax adoxq %rbp,%r8 movq %r15,%r13 mulxq 32+128(%rcx),%rdi,%rbp 
adcxq %rdi,%r8 adoxq %rbp,%r9 movq %rax,%rsi mulxq 40+128(%rcx),%rdi,%rbp adcxq %rdi,%r9 adoxq %rbp,%r10 movq %r14,%rdx adcxq %r12,%r10 adoxq %r12,%r11 leaq 128(%rcx),%rcx movq %r8,%r12 adcq $0,%r11 subq 0(%rcx),%r14 sbbq 8(%rcx),%r15 movq %r9,%rdi sbbq 16(%rcx),%rax sbbq 24(%rcx),%r8 sbbq 32(%rcx),%r9 movq %r10,%rbp sbbq 40(%rcx),%r10 sbbq $0,%r11 cmovncq %r14,%rdx cmovcq %r13,%r15 cmovcq %rsi,%rax cmovncq %r8,%r12 movq %rdx,0(%rbx) cmovncq %r9,%rdi movq %r15,8(%rbx) cmovncq %r10,%rbp movq %rax,16(%rbx) movq %r12,24(%rbx) movq %rdi,32(%rbx) movq %rbp,40(%rbx) #ifdef __SGX_LVI_HARDENING__ popq %rsi lfence jmpq *%rsi ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .globl _sqrx_mont_384 .private_extern _sqrx_mont_384 .p2align 5 _sqrx_mont_384: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa L$sqr_mont_384$1: pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 pushq %rbx .cfi_adjust_cfa_offset 8 .cfi_offset %rbx,-24 pushq %r12 .cfi_adjust_cfa_offset 8 .cfi_offset %r12,-32 pushq %r13 .cfi_adjust_cfa_offset 8 .cfi_offset %r13,-40 pushq %r14 .cfi_adjust_cfa_offset 8 .cfi_offset %r14,-48 pushq %r15 .cfi_adjust_cfa_offset 8 .cfi_offset %r15,-56 leaq -24(%rsp),%rsp .cfi_adjust_cfa_offset 8*3 movq %rcx,%r8 leaq -128(%rdx),%rcx #ifdef __SGX_LVI_HARDENING__ lfence #endif movq 0(%rsi),%rdx movq 8(%rsi),%r15 movq 16(%rsi),%rax movq 24(%rsi),%r12 movq %rdi,16(%rsp) movq 32(%rsi),%rdi movq 40(%rsi),%rbp leaq (%rsi),%rbx movq %r8,(%rsp) leaq -128(%rsi),%rsi mulxq %rdx,%r8,%r9 call __mulx_mont_384 movq 24(%rsp),%r15 .cfi_restore %r15 movq 32(%rsp),%r14 .cfi_restore %r14 movq 40(%rsp),%r13 .cfi_restore %r13 movq 48(%rsp),%r12 .cfi_restore %r12 movq 56(%rsp),%rbx .cfi_restore %rbx movq 64(%rsp),%rbp .cfi_restore %rbp leaq 72(%rsp),%rsp .cfi_adjust_cfa_offset -8*9 #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .globl _sqrx_n_mul_mont_384 .private_extern _sqrx_n_mul_mont_384 .p2align 5 _sqrx_n_mul_mont_384: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa L$sqr_n_mul_mont_384$1: pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 pushq %rbx .cfi_adjust_cfa_offset 8 .cfi_offset %rbx,-24 pushq %r12 .cfi_adjust_cfa_offset 8 .cfi_offset %r12,-32 pushq %r13 .cfi_adjust_cfa_offset 8 .cfi_offset %r13,-40 pushq %r14 .cfi_adjust_cfa_offset 8 .cfi_offset %r14,-48 pushq %r15 .cfi_adjust_cfa_offset 8 .cfi_offset %r15,-56 leaq -40(%rsp),%rsp .cfi_adjust_cfa_offset 8*5 movq %rdx,%r10 #ifdef __SGX_LVI_HARDENING__ lfence #endif movq 0(%rsi),%rdx movq 8(%rsi),%r15 movq 16(%rsi),%rax movq %rsi,%rbx movq 24(%rsi),%r12 movq %rdi,16(%rsp) movq 32(%rsi),%rdi movq 40(%rsi),%rbp movq %r8,(%rsp) movq %r9,24(%rsp) movq 0(%r9),%xmm2 L$oop_sqrx_384: movd %r10d,%xmm1 leaq -128(%rbx),%rsi leaq -128(%rcx),%rcx mulxq %rdx,%r8,%r9 call __mulx_mont_384 movd %xmm1,%r10d decl %r10d jnz L$oop_sqrx_384 movq %rdx,%r14 .byte 102,72,15,126,210 leaq -128(%rbx),%rsi movq 24(%rsp),%rbx leaq -128(%rcx),%rcx mulxq %r14,%r8,%r9 call __mulx_mont_384 movq 40(%rsp),%r15 .cfi_restore %r15 movq 48(%rsp),%r14 .cfi_restore %r14 movq 56(%rsp),%r13 .cfi_restore %r13 movq 64(%rsp),%r12 .cfi_restore %r12 movq 72(%rsp),%rbx .cfi_restore %rbx movq 80(%rsp),%rbp .cfi_restore %rbp leaq 88(%rsp),%rsp .cfi_adjust_cfa_offset -8*11 #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .globl _sqrx_n_mul_mont_383 .private_extern _sqrx_n_mul_mont_383 .p2align 5 _sqrx_n_mul_mont_383: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa L$sqr_n_mul_mont_383$1: pushq %rbp .cfi_adjust_cfa_offset 8 
.cfi_offset %rbp,-16 pushq %rbx .cfi_adjust_cfa_offset 8 .cfi_offset %rbx,-24 pushq %r12 .cfi_adjust_cfa_offset 8 .cfi_offset %r12,-32 pushq %r13 .cfi_adjust_cfa_offset 8 .cfi_offset %r13,-40 pushq %r14 .cfi_adjust_cfa_offset 8 .cfi_offset %r14,-48 pushq %r15 .cfi_adjust_cfa_offset 8 .cfi_offset %r15,-56 leaq -40(%rsp),%rsp .cfi_adjust_cfa_offset 8*5 movq %rdx,%r10 #ifdef __SGX_LVI_HARDENING__ lfence #endif movq 0(%rsi),%rdx movq 8(%rsi),%r15 movq 16(%rsi),%rax movq %rsi,%rbx movq 24(%rsi),%r12 movq %rdi,16(%rsp) movq 32(%rsi),%rdi movq 40(%rsi),%rbp movq %r8,(%rsp) movq %r9,24(%rsp) movq 0(%r9),%xmm2 leaq -128(%rcx),%rcx L$oop_sqrx_383: movd %r10d,%xmm1 leaq -128(%rbx),%rsi mulxq %rdx,%r8,%r9 call __mulx_mont_383_nonred movd %xmm1,%r10d decl %r10d jnz L$oop_sqrx_383 movq %rdx,%r14 .byte 102,72,15,126,210 leaq -128(%rbx),%rsi movq 24(%rsp),%rbx mulxq %r14,%r8,%r9 call __mulx_mont_384 movq 40(%rsp),%r15 .cfi_restore %r15 movq 48(%rsp),%r14 .cfi_restore %r14 movq 56(%rsp),%r13 .cfi_restore %r13 movq 64(%rsp),%r12 .cfi_restore %r12 movq 72(%rsp),%rbx .cfi_restore %rbx movq 80(%rsp),%rbp .cfi_restore %rbp leaq 88(%rsp),%rsp .cfi_adjust_cfa_offset -8*11 #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .p2align 5 __mulx_mont_383_nonred: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa mulxq %r15,%r14,%r10 mulxq %rax,%r15,%r11 addq %r14,%r9 mulxq %r12,%rax,%r12 adcq %r15,%r10 mulxq %rdi,%rdi,%r13 adcq %rax,%r11 mulxq %rbp,%rbp,%r14 movq 8(%rbx),%rdx adcq %rdi,%r12 adcq %rbp,%r13 adcq $0,%r14 movq %r8,%rax imulq 8(%rsp),%r8 xorq %r15,%r15 mulxq 0+128(%rsi),%rdi,%rbp adoxq %rdi,%r9 adcxq %rbp,%r10 mulxq 8+128(%rsi),%rdi,%rbp adoxq %rdi,%r10 adcxq %rbp,%r11 mulxq 16+128(%rsi),%rdi,%rbp adoxq %rdi,%r11 adcxq %rbp,%r12 mulxq 24+128(%rsi),%rdi,%rbp adoxq %rdi,%r12 adcxq %rbp,%r13 mulxq 32+128(%rsi),%rdi,%rbp adoxq %rdi,%r13 adcxq %rbp,%r14 mulxq 40+128(%rsi),%rdi,%rbp movq %r8,%rdx adoxq %rdi,%r14 adcxq %r15,%rbp adoxq %rbp,%r15 xorq %r8,%r8 mulxq 0+128(%rcx),%rdi,%rbp adcxq %rdi,%rax adoxq %rbp,%r9 mulxq 8+128(%rcx),%rdi,%rbp adcxq %rdi,%r9 adoxq %rbp,%r10 mulxq 16+128(%rcx),%rdi,%rbp adcxq %rdi,%r10 adoxq %rbp,%r11 mulxq 24+128(%rcx),%rdi,%rbp adcxq %rdi,%r11 adoxq %rbp,%r12 mulxq 32+128(%rcx),%rdi,%rbp adcxq %rdi,%r12 adoxq %rbp,%r13 mulxq 40+128(%rcx),%rdi,%rbp movq 16(%rbx),%rdx adcxq %rdi,%r13 adoxq %rbp,%r14 adcxq %rax,%r14 adoxq %rax,%r15 adcxq %rax,%r15 movq %r9,%r8 imulq 8(%rsp),%r9 xorq %rax,%rax mulxq 0+128(%rsi),%rdi,%rbp adoxq %rdi,%r10 adcxq %rbp,%r11 mulxq 8+128(%rsi),%rdi,%rbp adoxq %rdi,%r11 adcxq %rbp,%r12 mulxq 16+128(%rsi),%rdi,%rbp adoxq %rdi,%r12 adcxq %rbp,%r13 mulxq 24+128(%rsi),%rdi,%rbp adoxq %rdi,%r13 adcxq %rbp,%r14 mulxq 32+128(%rsi),%rdi,%rbp adoxq %rdi,%r14 adcxq %rbp,%r15 mulxq 40+128(%rsi),%rdi,%rbp movq %r9,%rdx adoxq %rdi,%r15 adcxq %rax,%rbp adoxq %rbp,%rax xorq %r9,%r9 mulxq 0+128(%rcx),%rdi,%rbp adcxq %rdi,%r8 adoxq %rbp,%r10 mulxq 8+128(%rcx),%rdi,%rbp adcxq %rdi,%r10 adoxq %rbp,%r11 mulxq 16+128(%rcx),%rdi,%rbp adcxq %rdi,%r11 adoxq %rbp,%r12 mulxq 24+128(%rcx),%rdi,%rbp adcxq %rdi,%r12 adoxq %rbp,%r13 mulxq 32+128(%rcx),%rdi,%rbp adcxq %rdi,%r13 adoxq %rbp,%r14 mulxq 40+128(%rcx),%rdi,%rbp movq 24(%rbx),%rdx adcxq %rdi,%r14 adoxq %rbp,%r15 adcxq %r8,%r15 adoxq %r8,%rax adcxq %r8,%rax movq %r10,%r9 imulq 8(%rsp),%r10 xorq %r8,%r8 mulxq 0+128(%rsi),%rdi,%rbp adoxq %rdi,%r11 adcxq %rbp,%r12 mulxq 8+128(%rsi),%rdi,%rbp adoxq %rdi,%r12 adcxq %rbp,%r13 mulxq 16+128(%rsi),%rdi,%rbp adoxq %rdi,%r13 adcxq %rbp,%r14 mulxq 
24+128(%rsi),%rdi,%rbp adoxq %rdi,%r14 adcxq %rbp,%r15 mulxq 32+128(%rsi),%rdi,%rbp adoxq %rdi,%r15 adcxq %rbp,%rax mulxq 40+128(%rsi),%rdi,%rbp movq %r10,%rdx adoxq %rdi,%rax adcxq %r8,%rbp adoxq %rbp,%r8 xorq %r10,%r10 mulxq 0+128(%rcx),%rdi,%rbp adcxq %rdi,%r9 adoxq %rbp,%r11 mulxq 8+128(%rcx),%rdi,%rbp adcxq %rdi,%r11 adoxq %rbp,%r12 mulxq 16+128(%rcx),%rdi,%rbp adcxq %rdi,%r12 adoxq %rbp,%r13 mulxq 24+128(%rcx),%rdi,%rbp adcxq %rdi,%r13 adoxq %rbp,%r14 mulxq 32+128(%rcx),%rdi,%rbp adcxq %rdi,%r14 adoxq %rbp,%r15 mulxq 40+128(%rcx),%rdi,%rbp movq 32(%rbx),%rdx adcxq %rdi,%r15 adoxq %rbp,%rax adcxq %r9,%rax adoxq %r9,%r8 adcxq %r9,%r8 movq %r11,%r10 imulq 8(%rsp),%r11 xorq %r9,%r9 mulxq 0+128(%rsi),%rdi,%rbp adoxq %rdi,%r12 adcxq %rbp,%r13 mulxq 8+128(%rsi),%rdi,%rbp adoxq %rdi,%r13 adcxq %rbp,%r14 mulxq 16+128(%rsi),%rdi,%rbp adoxq %rdi,%r14 adcxq %rbp,%r15 mulxq 24+128(%rsi),%rdi,%rbp adoxq %rdi,%r15 adcxq %rbp,%rax mulxq 32+128(%rsi),%rdi,%rbp adoxq %rdi,%rax adcxq %rbp,%r8 mulxq 40+128(%rsi),%rdi,%rbp movq %r11,%rdx adoxq %rdi,%r8 adcxq %r9,%rbp adoxq %rbp,%r9 xorq %r11,%r11 mulxq 0+128(%rcx),%rdi,%rbp adcxq %rdi,%r10 adoxq %rbp,%r12 mulxq 8+128(%rcx),%rdi,%rbp adcxq %rdi,%r12 adoxq %rbp,%r13 mulxq 16+128(%rcx),%rdi,%rbp adcxq %rdi,%r13 adoxq %rbp,%r14 mulxq 24+128(%rcx),%rdi,%rbp adcxq %rdi,%r14 adoxq %rbp,%r15 mulxq 32+128(%rcx),%rdi,%rbp adcxq %rdi,%r15 adoxq %rbp,%rax mulxq 40+128(%rcx),%rdi,%rbp movq 40(%rbx),%rdx adcxq %rdi,%rax adoxq %rbp,%r8 adcxq %r10,%r8 adoxq %r10,%r9 adcxq %r10,%r9 movq %r12,%r11 imulq 8(%rsp),%r12 xorq %r10,%r10 mulxq 0+128(%rsi),%rdi,%rbp adoxq %rdi,%r13 adcxq %rbp,%r14 mulxq 8+128(%rsi),%rdi,%rbp adoxq %rdi,%r14 adcxq %rbp,%r15 mulxq 16+128(%rsi),%rdi,%rbp adoxq %rdi,%r15 adcxq %rbp,%rax mulxq 24+128(%rsi),%rdi,%rbp adoxq %rdi,%rax adcxq %rbp,%r8 mulxq 32+128(%rsi),%rdi,%rbp adoxq %rdi,%r8 adcxq %rbp,%r9 mulxq 40+128(%rsi),%rdi,%rbp movq %r12,%rdx adoxq %rdi,%r9 adcxq %r10,%rbp adoxq %rbp,%r10 xorq %r12,%r12 mulxq 0+128(%rcx),%rdi,%rbp adcxq %rdi,%r11 adoxq %rbp,%r13 mulxq 8+128(%rcx),%rdi,%rbp adcxq %rdi,%r13 adoxq %rbp,%r14 mulxq 16+128(%rcx),%rdi,%rbp adcxq %rdi,%r14 adoxq %rbp,%r15 mulxq 24+128(%rcx),%rdi,%rbp adcxq %rdi,%r15 adoxq %rbp,%rax mulxq 32+128(%rcx),%rdi,%rbp adcxq %rdi,%rax adoxq %rbp,%r8 mulxq 40+128(%rcx),%rdi,%rbp movq %r13,%rdx adcxq %rdi,%r8 adoxq %rbp,%r9 adcxq %r11,%r9 adoxq %r11,%r10 adcxq %r11,%r10 imulq 8(%rsp),%rdx movq 24(%rsp),%rbx xorq %r12,%r12 mulxq 0+128(%rcx),%rdi,%rbp adcxq %rdi,%r13 adoxq %rbp,%r14 mulxq 8+128(%rcx),%rdi,%rbp adcxq %rdi,%r14 adoxq %rbp,%r15 mulxq 16+128(%rcx),%rdi,%rbp adcxq %rdi,%r15 adoxq %rbp,%rax mulxq 24+128(%rcx),%rdi,%rbp adcxq %rdi,%rax adoxq %rbp,%r8 mulxq 32+128(%rcx),%rdi,%rbp adcxq %rdi,%r8 adoxq %rbp,%r9 mulxq 40+128(%rcx),%rdi,%rbp movq %r14,%rdx adcxq %rdi,%r9 adoxq %rbp,%r10 adcq $0,%r10 movq %r8,%r12 movq %r14,0(%rbx) movq %r15,8(%rbx) movq %rax,16(%rbx) movq %r9,%rdi movq %r8,24(%rbx) movq %r9,32(%rbx) movq %r10,40(%rbx) movq %r10,%rbp #ifdef __SGX_LVI_HARDENING__ popq %rsi lfence jmpq *%rsi ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .globl _sqrx_mont_382x .private_extern _sqrx_mont_382x .p2align 5 _sqrx_mont_382x: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa L$sqr_mont_382x$1: pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 pushq %rbx .cfi_adjust_cfa_offset 8 .cfi_offset %rbx,-24 pushq %r12 .cfi_adjust_cfa_offset 8 .cfi_offset %r12,-32 pushq %r13 .cfi_adjust_cfa_offset 8 .cfi_offset %r13,-40 pushq %r14 .cfi_adjust_cfa_offset 8 .cfi_offset %r14,-48 pushq %r15 
.cfi_adjust_cfa_offset 8 .cfi_offset %r15,-56 subq $136,%rsp .cfi_adjust_cfa_offset 136 movq %rcx,0(%rsp) movq %rdx,%rcx movq %rdi,16(%rsp) movq %rsi,24(%rsp) #ifdef __SGX_LVI_HARDENING__ lfence #endif movq 0(%rsi),%r8 movq 8(%rsi),%r9 movq 16(%rsi),%r10 movq 24(%rsi),%r11 movq 32(%rsi),%r12 movq 40(%rsi),%r13 movq %r8,%r14 addq 48(%rsi),%r8 movq %r9,%r15 adcq 56(%rsi),%r9 movq %r10,%rax adcq 64(%rsi),%r10 movq %r11,%rdx adcq 72(%rsi),%r11 movq %r12,%rbx adcq 80(%rsi),%r12 movq %r13,%rbp adcq 88(%rsi),%r13 subq 48(%rsi),%r14 sbbq 56(%rsi),%r15 sbbq 64(%rsi),%rax sbbq 72(%rsi),%rdx sbbq 80(%rsi),%rbx sbbq 88(%rsi),%rbp sbbq %rdi,%rdi movq %r8,32+0(%rsp) movq %r9,32+8(%rsp) movq %r10,32+16(%rsp) movq %r11,32+24(%rsp) movq %r12,32+32(%rsp) movq %r13,32+40(%rsp) movq %r14,32+48(%rsp) movq %r15,32+56(%rsp) movq %rax,32+64(%rsp) movq %rdx,32+72(%rsp) movq %rbx,32+80(%rsp) movq %rbp,32+88(%rsp) movq %rdi,32+96(%rsp) leaq 48(%rsi),%rbx movq 48(%rsi),%rdx movq 0(%rsi),%r14 movq 8(%rsi),%r15 movq 16(%rsi),%rax movq 24(%rsi),%r12 movq 32(%rsi),%rdi movq 40(%rsi),%rbp leaq -128(%rsi),%rsi leaq -128(%rcx),%rcx mulxq %r14,%r8,%r9 call __mulx_mont_383_nonred addq %rdx,%rdx adcq %r15,%r15 adcq %rax,%rax adcq %r12,%r12 adcq %rdi,%rdi adcq %rbp,%rbp movq %rdx,48(%rbx) movq %r15,56(%rbx) movq %rax,64(%rbx) movq %r12,72(%rbx) movq %rdi,80(%rbx) movq %rbp,88(%rbx) leaq 32-128(%rsp),%rsi leaq 32+48(%rsp),%rbx movq 32+48(%rsp),%rdx movq 32+0(%rsp),%r14 movq 32+8(%rsp),%r15 movq 32+16(%rsp),%rax movq 32+24(%rsp),%r12 movq 32+32(%rsp),%rdi movq 32+40(%rsp),%rbp mulxq %r14,%r8,%r9 call __mulx_mont_383_nonred movq 32+96(%rsp),%r14 leaq 128(%rcx),%rcx movq 32+0(%rsp),%r8 andq %r14,%r8 movq 32+8(%rsp),%r9 andq %r14,%r9 movq 32+16(%rsp),%r10 andq %r14,%r10 movq 32+24(%rsp),%r11 andq %r14,%r11 movq 32+32(%rsp),%r13 andq %r14,%r13 andq 32+40(%rsp),%r14 subq %r8,%rdx movq 0(%rcx),%r8 sbbq %r9,%r15 movq 8(%rcx),%r9 sbbq %r10,%rax movq 16(%rcx),%r10 sbbq %r11,%r12 movq 24(%rcx),%r11 sbbq %r13,%rdi movq 32(%rcx),%r13 sbbq %r14,%rbp sbbq %r14,%r14 andq %r14,%r8 andq %r14,%r9 andq %r14,%r10 andq %r14,%r11 andq %r14,%r13 andq 40(%rcx),%r14 addq %r8,%rdx adcq %r9,%r15 adcq %r10,%rax adcq %r11,%r12 adcq %r13,%rdi adcq %r14,%rbp movq %rdx,0(%rbx) movq %r15,8(%rbx) movq %rax,16(%rbx) movq %r12,24(%rbx) movq %rdi,32(%rbx) movq %rbp,40(%rbx) leaq 136(%rsp),%r8 movq 0(%r8),%r15 .cfi_restore %r15 movq 8(%r8),%r14 .cfi_restore %r14 movq 16(%r8),%r13 .cfi_restore %r13 movq 24(%r8),%r12 .cfi_restore %r12 movq 32(%r8),%rbx .cfi_restore %rbx movq 40(%r8),%rbp .cfi_restore %rbp leaq 48(%r8),%rsp .cfi_adjust_cfa_offset -136-8*6 #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc ================================================ FILE: build/mach-o/sha256-armv8.S ================================================ // // Copyright Supranational LLC // Licensed under the Apache License, Version 2.0, see LICENSE for details. // SPDX-License-Identifier: Apache-2.0 // // ==================================================================== // Written by Andy Polyakov, @dot-asm, initially for the OpenSSL // project. // ==================================================================== // // sha256_block procedure for ARMv8. // // This module is stripped of scalar code paths, with rationale that all // known processors are NEON-capable. // // See original module at CRYPTOGAMS for further details. 
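//
// Editor's note (not part of the generated module): the hardware path
// below drives the ARMv8 crypto extensions, with sha256h/sha256h2/
// sha256su0/sha256su1 emitted as raw .long encodings (see the trailing
// comments on each), presumably so that the file assembles even with
// toolchains that lack the crypto-extension mnemonics. Both paths
// compute the standard FIPS 180-4 round, sketched here in C-like
// pseudo-code for orientation only:
//
//   W[i] = sigma1(W[i-2]) + W[i-7] + sigma0(W[i-15]) + W[i-16];
//   T1   = h + Sigma1(e) + Ch(e,f,g) + K256[i] + W[i];
//   T2   = Sigma0(a) + Maj(a,b,c);
//   h=g; g=f; f=e; e=d+T1; d=c; c=b; b=a; a=T1+T2;
//
// where Sigma0(x) = ror(x,2)^ror(x,13)^ror(x,22),
//       Sigma1(x) = ror(x,6)^ror(x,11)^ror(x,25),
//       sigma0(x) = ror(x,7)^ror(x,18)^(x>>3),
//       sigma1(x) = ror(x,17)^ror(x,19)^(x>>10),
//       Ch(e,f,g) = (e&f)^(~e&g), Maj(a,b,c) = (a&b)^(a&c)^(b&c).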
.comm ___blst_platform_cap,4 .text .align 6 LK256: .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 .long 0 //terminator .byte 83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,64,100,111,116,45,97,115,109,0 .align 2 .align 2 .globl _blst_sha256_block_armv8 .private_extern _blst_sha256_block_armv8 .align 6 _blst_sha256_block_armv8: hint #34 Lv8_entry: stp x29,x30,[sp,#-2*__SIZEOF_POINTER__]! add x29,sp,#0 ld1 {v0.4s,v1.4s},[x0] adr x3,LK256 Loop_hw: ld1 {v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64 sub x2,x2,#1 ld1 {v16.4s},[x3],#16 rev32 v4.16b,v4.16b rev32 v5.16b,v5.16b rev32 v6.16b,v6.16b rev32 v7.16b,v7.16b orr v18.16b,v0.16b,v0.16b // offload orr v19.16b,v1.16b,v1.16b ld1 {v17.4s},[x3],#16 add v16.4s,v16.4s,v4.4s .long 0x5e2828a4 //sha256su0 v4.16b,v5.16b orr v2.16b,v0.16b,v0.16b .long 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s .long 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s .long 0x5e0760c4 //sha256su1 v4.16b,v6.16b,v7.16b ld1 {v16.4s},[x3],#16 add v17.4s,v17.4s,v5.4s .long 0x5e2828c5 //sha256su0 v5.16b,v6.16b orr v2.16b,v0.16b,v0.16b .long 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s .long 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s .long 0x5e0460e5 //sha256su1 v5.16b,v7.16b,v4.16b ld1 {v17.4s},[x3],#16 add v16.4s,v16.4s,v6.4s .long 0x5e2828e6 //sha256su0 v6.16b,v7.16b orr v2.16b,v0.16b,v0.16b .long 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s .long 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s .long 0x5e056086 //sha256su1 v6.16b,v4.16b,v5.16b ld1 {v16.4s},[x3],#16 add v17.4s,v17.4s,v7.4s .long 0x5e282887 //sha256su0 v7.16b,v4.16b orr v2.16b,v0.16b,v0.16b .long 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s .long 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s .long 0x5e0660a7 //sha256su1 v7.16b,v5.16b,v6.16b ld1 {v17.4s},[x3],#16 add v16.4s,v16.4s,v4.4s .long 0x5e2828a4 //sha256su0 v4.16b,v5.16b orr v2.16b,v0.16b,v0.16b .long 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s .long 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s .long 0x5e0760c4 //sha256su1 v4.16b,v6.16b,v7.16b ld1 {v16.4s},[x3],#16 add v17.4s,v17.4s,v5.4s .long 0x5e2828c5 //sha256su0 v5.16b,v6.16b orr v2.16b,v0.16b,v0.16b .long 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s .long 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s .long 0x5e0460e5 //sha256su1 v5.16b,v7.16b,v4.16b ld1 {v17.4s},[x3],#16 add v16.4s,v16.4s,v6.4s .long 0x5e2828e6 //sha256su0 v6.16b,v7.16b orr v2.16b,v0.16b,v0.16b .long 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s .long 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s .long 0x5e056086 //sha256su1 v6.16b,v4.16b,v5.16b ld1 {v16.4s},[x3],#16 add v17.4s,v17.4s,v7.4s .long 0x5e282887 //sha256su0 v7.16b,v4.16b orr v2.16b,v0.16b,v0.16b .long 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s .long 0x5e115041 //sha256h2 
v1.16b,v2.16b,v17.4s .long 0x5e0660a7 //sha256su1 v7.16b,v5.16b,v6.16b ld1 {v17.4s},[x3],#16 add v16.4s,v16.4s,v4.4s .long 0x5e2828a4 //sha256su0 v4.16b,v5.16b orr v2.16b,v0.16b,v0.16b .long 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s .long 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s .long 0x5e0760c4 //sha256su1 v4.16b,v6.16b,v7.16b ld1 {v16.4s},[x3],#16 add v17.4s,v17.4s,v5.4s .long 0x5e2828c5 //sha256su0 v5.16b,v6.16b orr v2.16b,v0.16b,v0.16b .long 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s .long 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s .long 0x5e0460e5 //sha256su1 v5.16b,v7.16b,v4.16b ld1 {v17.4s},[x3],#16 add v16.4s,v16.4s,v6.4s .long 0x5e2828e6 //sha256su0 v6.16b,v7.16b orr v2.16b,v0.16b,v0.16b .long 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s .long 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s .long 0x5e056086 //sha256su1 v6.16b,v4.16b,v5.16b ld1 {v16.4s},[x3],#16 add v17.4s,v17.4s,v7.4s .long 0x5e282887 //sha256su0 v7.16b,v4.16b orr v2.16b,v0.16b,v0.16b .long 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s .long 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s .long 0x5e0660a7 //sha256su1 v7.16b,v5.16b,v6.16b ld1 {v17.4s},[x3],#16 add v16.4s,v16.4s,v4.4s orr v2.16b,v0.16b,v0.16b .long 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s .long 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s ld1 {v16.4s},[x3],#16 add v17.4s,v17.4s,v5.4s orr v2.16b,v0.16b,v0.16b .long 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s .long 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s ld1 {v17.4s},[x3] add v16.4s,v16.4s,v6.4s sub x3,x3,#64*4-16 // rewind orr v2.16b,v0.16b,v0.16b .long 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s .long 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s add v17.4s,v17.4s,v7.4s orr v2.16b,v0.16b,v0.16b .long 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s .long 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s add v0.4s,v0.4s,v18.4s add v1.4s,v1.4s,v19.4s cbnz x2,Loop_hw st1 {v0.4s,v1.4s},[x0] ldr x29,[sp],#2*__SIZEOF_POINTER__ ret .globl _blst_sha256_block_data_order .private_extern _blst_sha256_block_data_order .align 4 _blst_sha256_block_data_order: hint #34 adrp x16,___blst_platform_cap@PAGE ldr w16,[x16,___blst_platform_cap@PAGEOFF] tst w16,#1 b.ne Lv8_entry stp x29, x30, [sp, #-2*__SIZEOF_POINTER__]! 
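// NEON-only software path: reached when the tst/b.ne dispatch above
// finds bit 0 of ___blst_platform_cap clear (by all appearances the
// run-time flag for the SHA-256 crypto extensions); otherwise control
// has already diverted to the hardware path at Lv8_entry.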
mov x29, sp sub sp,sp,#16*4 adr x16,LK256 add x2,x1,x2,lsl#6 // len to point at the end of inp ld1 {v0.16b},[x1], #16 ld1 {v1.16b},[x1], #16 ld1 {v2.16b},[x1], #16 ld1 {v3.16b},[x1], #16 ld1 {v4.4s},[x16], #16 ld1 {v5.4s},[x16], #16 ld1 {v6.4s},[x16], #16 ld1 {v7.4s},[x16], #16 rev32 v0.16b,v0.16b // yes, even on rev32 v1.16b,v1.16b // big-endian rev32 v2.16b,v2.16b rev32 v3.16b,v3.16b mov x17,sp add v4.4s,v4.4s,v0.4s add v5.4s,v5.4s,v1.4s add v6.4s,v6.4s,v2.4s st1 {v4.4s,v5.4s},[x17], #32 add v7.4s,v7.4s,v3.4s st1 {v6.4s,v7.4s},[x17] sub x17,x17,#32 ldp w3,w4,[x0] ldp w5,w6,[x0,#8] ldp w7,w8,[x0,#16] ldp w9,w10,[x0,#24] ldr w12,[sp,#0] mov w13,wzr eor w14,w4,w5 mov w15,wzr b L_00_48 .align 4 L_00_48: ext v4.16b,v0.16b,v1.16b,#4 add w10,w10,w12 add w3,w3,w15 and w12,w8,w7 bic w15,w9,w7 ext v7.16b,v2.16b,v3.16b,#4 eor w11,w7,w7,ror#5 add w3,w3,w13 mov d19,v3.d[1] orr w12,w12,w15 eor w11,w11,w7,ror#19 ushr v6.4s,v4.4s,#7 eor w15,w3,w3,ror#11 ushr v5.4s,v4.4s,#3 add w10,w10,w12 add v0.4s,v0.4s,v7.4s ror w11,w11,#6 sli v6.4s,v4.4s,#25 eor w13,w3,w4 eor w15,w15,w3,ror#20 ushr v7.4s,v4.4s,#18 add w10,w10,w11 ldr w12,[sp,#4] and w14,w14,w13 eor v5.16b,v5.16b,v6.16b ror w15,w15,#2 add w6,w6,w10 sli v7.4s,v4.4s,#14 eor w14,w14,w4 ushr v16.4s,v19.4s,#17 add w9,w9,w12 add w10,w10,w15 and w12,w7,w6 eor v5.16b,v5.16b,v7.16b bic w15,w8,w6 eor w11,w6,w6,ror#5 sli v16.4s,v19.4s,#15 add w10,w10,w14 orr w12,w12,w15 ushr v17.4s,v19.4s,#10 eor w11,w11,w6,ror#19 eor w15,w10,w10,ror#11 ushr v7.4s,v19.4s,#19 add w9,w9,w12 ror w11,w11,#6 add v0.4s,v0.4s,v5.4s eor w14,w10,w3 eor w15,w15,w10,ror#20 sli v7.4s,v19.4s,#13 add w9,w9,w11 ldr w12,[sp,#8] and w13,w13,w14 eor v17.16b,v17.16b,v16.16b ror w15,w15,#2 add w5,w5,w9 eor w13,w13,w3 eor v17.16b,v17.16b,v7.16b add w8,w8,w12 add w9,w9,w15 and w12,w6,w5 add v0.4s,v0.4s,v17.4s bic w15,w7,w5 eor w11,w5,w5,ror#5 add w9,w9,w13 ushr v18.4s,v0.4s,#17 orr w12,w12,w15 ushr v19.4s,v0.4s,#10 eor w11,w11,w5,ror#19 eor w15,w9,w9,ror#11 sli v18.4s,v0.4s,#15 add w8,w8,w12 ushr v17.4s,v0.4s,#19 ror w11,w11,#6 eor w13,w9,w10 eor v19.16b,v19.16b,v18.16b eor w15,w15,w9,ror#20 add w8,w8,w11 sli v17.4s,v0.4s,#13 ldr w12,[sp,#12] and w14,w14,w13 ror w15,w15,#2 ld1 {v4.4s},[x16], #16 add w4,w4,w8 eor v19.16b,v19.16b,v17.16b eor w14,w14,w10 eor v17.16b,v17.16b,v17.16b add w7,w7,w12 add w8,w8,w15 and w12,w5,w4 mov v17.d[1],v19.d[0] bic w15,w6,w4 eor w11,w4,w4,ror#5 add w8,w8,w14 add v0.4s,v0.4s,v17.4s orr w12,w12,w15 eor w11,w11,w4,ror#19 eor w15,w8,w8,ror#11 add v4.4s,v4.4s,v0.4s add w7,w7,w12 ror w11,w11,#6 eor w14,w8,w9 eor w15,w15,w8,ror#20 add w7,w7,w11 ldr w12,[sp,#16] and w13,w13,w14 ror w15,w15,#2 add w3,w3,w7 eor w13,w13,w9 st1 {v4.4s},[x17], #16 ext v4.16b,v1.16b,v2.16b,#4 add w6,w6,w12 add w7,w7,w15 and w12,w4,w3 bic w15,w5,w3 ext v7.16b,v3.16b,v0.16b,#4 eor w11,w3,w3,ror#5 add w7,w7,w13 mov d19,v0.d[1] orr w12,w12,w15 eor w11,w11,w3,ror#19 ushr v6.4s,v4.4s,#7 eor w15,w7,w7,ror#11 ushr v5.4s,v4.4s,#3 add w6,w6,w12 add v1.4s,v1.4s,v7.4s ror w11,w11,#6 sli v6.4s,v4.4s,#25 eor w13,w7,w8 eor w15,w15,w7,ror#20 ushr v7.4s,v4.4s,#18 add w6,w6,w11 ldr w12,[sp,#20] and w14,w14,w13 eor v5.16b,v5.16b,v6.16b ror w15,w15,#2 add w10,w10,w6 sli v7.4s,v4.4s,#14 eor w14,w14,w8 ushr v16.4s,v19.4s,#17 add w5,w5,w12 add w6,w6,w15 and w12,w3,w10 eor v5.16b,v5.16b,v7.16b bic w15,w4,w10 eor w11,w10,w10,ror#5 sli v16.4s,v19.4s,#15 add w6,w6,w14 orr w12,w12,w15 ushr v17.4s,v19.4s,#10 eor w11,w11,w10,ror#19 eor w15,w6,w6,ror#11 ushr v7.4s,v19.4s,#19 add w5,w5,w12 ror w11,w11,#6 add v1.4s,v1.4s,v5.4s eor 
w14,w6,w7 eor w15,w15,w6,ror#20 sli v7.4s,v19.4s,#13 add w5,w5,w11 ldr w12,[sp,#24] and w13,w13,w14 eor v17.16b,v17.16b,v16.16b ror w15,w15,#2 add w9,w9,w5 eor w13,w13,w7 eor v17.16b,v17.16b,v7.16b add w4,w4,w12 add w5,w5,w15 and w12,w10,w9 add v1.4s,v1.4s,v17.4s bic w15,w3,w9 eor w11,w9,w9,ror#5 add w5,w5,w13 ushr v18.4s,v1.4s,#17 orr w12,w12,w15 ushr v19.4s,v1.4s,#10 eor w11,w11,w9,ror#19 eor w15,w5,w5,ror#11 sli v18.4s,v1.4s,#15 add w4,w4,w12 ushr v17.4s,v1.4s,#19 ror w11,w11,#6 eor w13,w5,w6 eor v19.16b,v19.16b,v18.16b eor w15,w15,w5,ror#20 add w4,w4,w11 sli v17.4s,v1.4s,#13 ldr w12,[sp,#28] and w14,w14,w13 ror w15,w15,#2 ld1 {v4.4s},[x16], #16 add w8,w8,w4 eor v19.16b,v19.16b,v17.16b eor w14,w14,w6 eor v17.16b,v17.16b,v17.16b add w3,w3,w12 add w4,w4,w15 and w12,w9,w8 mov v17.d[1],v19.d[0] bic w15,w10,w8 eor w11,w8,w8,ror#5 add w4,w4,w14 add v1.4s,v1.4s,v17.4s orr w12,w12,w15 eor w11,w11,w8,ror#19 eor w15,w4,w4,ror#11 add v4.4s,v4.4s,v1.4s add w3,w3,w12 ror w11,w11,#6 eor w14,w4,w5 eor w15,w15,w4,ror#20 add w3,w3,w11 ldr w12,[sp,#32] and w13,w13,w14 ror w15,w15,#2 add w7,w7,w3 eor w13,w13,w5 st1 {v4.4s},[x17], #16 ext v4.16b,v2.16b,v3.16b,#4 add w10,w10,w12 add w3,w3,w15 and w12,w8,w7 bic w15,w9,w7 ext v7.16b,v0.16b,v1.16b,#4 eor w11,w7,w7,ror#5 add w3,w3,w13 mov d19,v1.d[1] orr w12,w12,w15 eor w11,w11,w7,ror#19 ushr v6.4s,v4.4s,#7 eor w15,w3,w3,ror#11 ushr v5.4s,v4.4s,#3 add w10,w10,w12 add v2.4s,v2.4s,v7.4s ror w11,w11,#6 sli v6.4s,v4.4s,#25 eor w13,w3,w4 eor w15,w15,w3,ror#20 ushr v7.4s,v4.4s,#18 add w10,w10,w11 ldr w12,[sp,#36] and w14,w14,w13 eor v5.16b,v5.16b,v6.16b ror w15,w15,#2 add w6,w6,w10 sli v7.4s,v4.4s,#14 eor w14,w14,w4 ushr v16.4s,v19.4s,#17 add w9,w9,w12 add w10,w10,w15 and w12,w7,w6 eor v5.16b,v5.16b,v7.16b bic w15,w8,w6 eor w11,w6,w6,ror#5 sli v16.4s,v19.4s,#15 add w10,w10,w14 orr w12,w12,w15 ushr v17.4s,v19.4s,#10 eor w11,w11,w6,ror#19 eor w15,w10,w10,ror#11 ushr v7.4s,v19.4s,#19 add w9,w9,w12 ror w11,w11,#6 add v2.4s,v2.4s,v5.4s eor w14,w10,w3 eor w15,w15,w10,ror#20 sli v7.4s,v19.4s,#13 add w9,w9,w11 ldr w12,[sp,#40] and w13,w13,w14 eor v17.16b,v17.16b,v16.16b ror w15,w15,#2 add w5,w5,w9 eor w13,w13,w3 eor v17.16b,v17.16b,v7.16b add w8,w8,w12 add w9,w9,w15 and w12,w6,w5 add v2.4s,v2.4s,v17.4s bic w15,w7,w5 eor w11,w5,w5,ror#5 add w9,w9,w13 ushr v18.4s,v2.4s,#17 orr w12,w12,w15 ushr v19.4s,v2.4s,#10 eor w11,w11,w5,ror#19 eor w15,w9,w9,ror#11 sli v18.4s,v2.4s,#15 add w8,w8,w12 ushr v17.4s,v2.4s,#19 ror w11,w11,#6 eor w13,w9,w10 eor v19.16b,v19.16b,v18.16b eor w15,w15,w9,ror#20 add w8,w8,w11 sli v17.4s,v2.4s,#13 ldr w12,[sp,#44] and w14,w14,w13 ror w15,w15,#2 ld1 {v4.4s},[x16], #16 add w4,w4,w8 eor v19.16b,v19.16b,v17.16b eor w14,w14,w10 eor v17.16b,v17.16b,v17.16b add w7,w7,w12 add w8,w8,w15 and w12,w5,w4 mov v17.d[1],v19.d[0] bic w15,w6,w4 eor w11,w4,w4,ror#5 add w8,w8,w14 add v2.4s,v2.4s,v17.4s orr w12,w12,w15 eor w11,w11,w4,ror#19 eor w15,w8,w8,ror#11 add v4.4s,v4.4s,v2.4s add w7,w7,w12 ror w11,w11,#6 eor w14,w8,w9 eor w15,w15,w8,ror#20 add w7,w7,w11 ldr w12,[sp,#48] and w13,w13,w14 ror w15,w15,#2 add w3,w3,w7 eor w13,w13,w9 st1 {v4.4s},[x17], #16 ext v4.16b,v3.16b,v0.16b,#4 add w6,w6,w12 add w7,w7,w15 and w12,w4,w3 bic w15,w5,w3 ext v7.16b,v1.16b,v2.16b,#4 eor w11,w3,w3,ror#5 add w7,w7,w13 mov d19,v2.d[1] orr w12,w12,w15 eor w11,w11,w3,ror#19 ushr v6.4s,v4.4s,#7 eor w15,w7,w7,ror#11 ushr v5.4s,v4.4s,#3 add w6,w6,w12 add v3.4s,v3.4s,v7.4s ror w11,w11,#6 sli v6.4s,v4.4s,#25 eor w13,w7,w8 eor w15,w15,w7,ror#20 ushr v7.4s,v4.4s,#18 add w6,w6,w11 ldr w12,[sp,#52] and 
w14,w14,w13 eor v5.16b,v5.16b,v6.16b ror w15,w15,#2 add w10,w10,w6 sli v7.4s,v4.4s,#14 eor w14,w14,w8 ushr v16.4s,v19.4s,#17 add w5,w5,w12 add w6,w6,w15 and w12,w3,w10 eor v5.16b,v5.16b,v7.16b bic w15,w4,w10 eor w11,w10,w10,ror#5 sli v16.4s,v19.4s,#15 add w6,w6,w14 orr w12,w12,w15 ushr v17.4s,v19.4s,#10 eor w11,w11,w10,ror#19 eor w15,w6,w6,ror#11 ushr v7.4s,v19.4s,#19 add w5,w5,w12 ror w11,w11,#6 add v3.4s,v3.4s,v5.4s eor w14,w6,w7 eor w15,w15,w6,ror#20 sli v7.4s,v19.4s,#13 add w5,w5,w11 ldr w12,[sp,#56] and w13,w13,w14 eor v17.16b,v17.16b,v16.16b ror w15,w15,#2 add w9,w9,w5 eor w13,w13,w7 eor v17.16b,v17.16b,v7.16b add w4,w4,w12 add w5,w5,w15 and w12,w10,w9 add v3.4s,v3.4s,v17.4s bic w15,w3,w9 eor w11,w9,w9,ror#5 add w5,w5,w13 ushr v18.4s,v3.4s,#17 orr w12,w12,w15 ushr v19.4s,v3.4s,#10 eor w11,w11,w9,ror#19 eor w15,w5,w5,ror#11 sli v18.4s,v3.4s,#15 add w4,w4,w12 ushr v17.4s,v3.4s,#19 ror w11,w11,#6 eor w13,w5,w6 eor v19.16b,v19.16b,v18.16b eor w15,w15,w5,ror#20 add w4,w4,w11 sli v17.4s,v3.4s,#13 ldr w12,[sp,#60] and w14,w14,w13 ror w15,w15,#2 ld1 {v4.4s},[x16], #16 add w8,w8,w4 eor v19.16b,v19.16b,v17.16b eor w14,w14,w6 eor v17.16b,v17.16b,v17.16b add w3,w3,w12 add w4,w4,w15 and w12,w9,w8 mov v17.d[1],v19.d[0] bic w15,w10,w8 eor w11,w8,w8,ror#5 add w4,w4,w14 add v3.4s,v3.4s,v17.4s orr w12,w12,w15 eor w11,w11,w8,ror#19 eor w15,w4,w4,ror#11 add v4.4s,v4.4s,v3.4s add w3,w3,w12 ror w11,w11,#6 eor w14,w4,w5 eor w15,w15,w4,ror#20 add w3,w3,w11 ldr w12,[x16] and w13,w13,w14 ror w15,w15,#2 add w7,w7,w3 eor w13,w13,w5 st1 {v4.4s},[x17], #16 cmp w12,#0 // check for K256 terminator ldr w12,[sp,#0] sub x17,x17,#64 bne L_00_48 sub x16,x16,#256 cmp x1,x2 mov x17, #-64 csel x17, x17, xzr, eq add x1,x1,x17 mov x17,sp add w10,w10,w12 add w3,w3,w15 and w12,w8,w7 ld1 {v0.16b},[x1],#16 bic w15,w9,w7 eor w11,w7,w7,ror#5 ld1 {v4.4s},[x16],#16 add w3,w3,w13 orr w12,w12,w15 eor w11,w11,w7,ror#19 eor w15,w3,w3,ror#11 rev32 v0.16b,v0.16b add w10,w10,w12 ror w11,w11,#6 eor w13,w3,w4 eor w15,w15,w3,ror#20 add v4.4s,v4.4s,v0.4s add w10,w10,w11 ldr w12,[sp,#4] and w14,w14,w13 ror w15,w15,#2 add w6,w6,w10 eor w14,w14,w4 add w9,w9,w12 add w10,w10,w15 and w12,w7,w6 bic w15,w8,w6 eor w11,w6,w6,ror#5 add w10,w10,w14 orr w12,w12,w15 eor w11,w11,w6,ror#19 eor w15,w10,w10,ror#11 add w9,w9,w12 ror w11,w11,#6 eor w14,w10,w3 eor w15,w15,w10,ror#20 add w9,w9,w11 ldr w12,[sp,#8] and w13,w13,w14 ror w15,w15,#2 add w5,w5,w9 eor w13,w13,w3 add w8,w8,w12 add w9,w9,w15 and w12,w6,w5 bic w15,w7,w5 eor w11,w5,w5,ror#5 add w9,w9,w13 orr w12,w12,w15 eor w11,w11,w5,ror#19 eor w15,w9,w9,ror#11 add w8,w8,w12 ror w11,w11,#6 eor w13,w9,w10 eor w15,w15,w9,ror#20 add w8,w8,w11 ldr w12,[sp,#12] and w14,w14,w13 ror w15,w15,#2 add w4,w4,w8 eor w14,w14,w10 add w7,w7,w12 add w8,w8,w15 and w12,w5,w4 bic w15,w6,w4 eor w11,w4,w4,ror#5 add w8,w8,w14 orr w12,w12,w15 eor w11,w11,w4,ror#19 eor w15,w8,w8,ror#11 add w7,w7,w12 ror w11,w11,#6 eor w14,w8,w9 eor w15,w15,w8,ror#20 add w7,w7,w11 ldr w12,[sp,#16] and w13,w13,w14 ror w15,w15,#2 add w3,w3,w7 eor w13,w13,w9 st1 {v4.4s},[x17], #16 add w6,w6,w12 add w7,w7,w15 and w12,w4,w3 ld1 {v1.16b},[x1],#16 bic w15,w5,w3 eor w11,w3,w3,ror#5 ld1 {v4.4s},[x16],#16 add w7,w7,w13 orr w12,w12,w15 eor w11,w11,w3,ror#19 eor w15,w7,w7,ror#11 rev32 v1.16b,v1.16b add w6,w6,w12 ror w11,w11,#6 eor w13,w7,w8 eor w15,w15,w7,ror#20 add v4.4s,v4.4s,v1.4s add w6,w6,w11 ldr w12,[sp,#20] and w14,w14,w13 ror w15,w15,#2 add w10,w10,w6 eor w14,w14,w8 add w5,w5,w12 add w6,w6,w15 and w12,w3,w10 bic w15,w4,w10 eor w11,w10,w10,ror#5 add 
w6,w6,w14 orr w12,w12,w15 eor w11,w11,w10,ror#19 eor w15,w6,w6,ror#11 add w5,w5,w12 ror w11,w11,#6 eor w14,w6,w7 eor w15,w15,w6,ror#20 add w5,w5,w11 ldr w12,[sp,#24] and w13,w13,w14 ror w15,w15,#2 add w9,w9,w5 eor w13,w13,w7 add w4,w4,w12 add w5,w5,w15 and w12,w10,w9 bic w15,w3,w9 eor w11,w9,w9,ror#5 add w5,w5,w13 orr w12,w12,w15 eor w11,w11,w9,ror#19 eor w15,w5,w5,ror#11 add w4,w4,w12 ror w11,w11,#6 eor w13,w5,w6 eor w15,w15,w5,ror#20 add w4,w4,w11 ldr w12,[sp,#28] and w14,w14,w13 ror w15,w15,#2 add w8,w8,w4 eor w14,w14,w6 add w3,w3,w12 add w4,w4,w15 and w12,w9,w8 bic w15,w10,w8 eor w11,w8,w8,ror#5 add w4,w4,w14 orr w12,w12,w15 eor w11,w11,w8,ror#19 eor w15,w4,w4,ror#11 add w3,w3,w12 ror w11,w11,#6 eor w14,w4,w5 eor w15,w15,w4,ror#20 add w3,w3,w11 ldr w12,[sp,#32] and w13,w13,w14 ror w15,w15,#2 add w7,w7,w3 eor w13,w13,w5 st1 {v4.4s},[x17], #16 add w10,w10,w12 add w3,w3,w15 and w12,w8,w7 ld1 {v2.16b},[x1],#16 bic w15,w9,w7 eor w11,w7,w7,ror#5 ld1 {v4.4s},[x16],#16 add w3,w3,w13 orr w12,w12,w15 eor w11,w11,w7,ror#19 eor w15,w3,w3,ror#11 rev32 v2.16b,v2.16b add w10,w10,w12 ror w11,w11,#6 eor w13,w3,w4 eor w15,w15,w3,ror#20 add v4.4s,v4.4s,v2.4s add w10,w10,w11 ldr w12,[sp,#36] and w14,w14,w13 ror w15,w15,#2 add w6,w6,w10 eor w14,w14,w4 add w9,w9,w12 add w10,w10,w15 and w12,w7,w6 bic w15,w8,w6 eor w11,w6,w6,ror#5 add w10,w10,w14 orr w12,w12,w15 eor w11,w11,w6,ror#19 eor w15,w10,w10,ror#11 add w9,w9,w12 ror w11,w11,#6 eor w14,w10,w3 eor w15,w15,w10,ror#20 add w9,w9,w11 ldr w12,[sp,#40] and w13,w13,w14 ror w15,w15,#2 add w5,w5,w9 eor w13,w13,w3 add w8,w8,w12 add w9,w9,w15 and w12,w6,w5 bic w15,w7,w5 eor w11,w5,w5,ror#5 add w9,w9,w13 orr w12,w12,w15 eor w11,w11,w5,ror#19 eor w15,w9,w9,ror#11 add w8,w8,w12 ror w11,w11,#6 eor w13,w9,w10 eor w15,w15,w9,ror#20 add w8,w8,w11 ldr w12,[sp,#44] and w14,w14,w13 ror w15,w15,#2 add w4,w4,w8 eor w14,w14,w10 add w7,w7,w12 add w8,w8,w15 and w12,w5,w4 bic w15,w6,w4 eor w11,w4,w4,ror#5 add w8,w8,w14 orr w12,w12,w15 eor w11,w11,w4,ror#19 eor w15,w8,w8,ror#11 add w7,w7,w12 ror w11,w11,#6 eor w14,w8,w9 eor w15,w15,w8,ror#20 add w7,w7,w11 ldr w12,[sp,#48] and w13,w13,w14 ror w15,w15,#2 add w3,w3,w7 eor w13,w13,w9 st1 {v4.4s},[x17], #16 add w6,w6,w12 add w7,w7,w15 and w12,w4,w3 ld1 {v3.16b},[x1],#16 bic w15,w5,w3 eor w11,w3,w3,ror#5 ld1 {v4.4s},[x16],#16 add w7,w7,w13 orr w12,w12,w15 eor w11,w11,w3,ror#19 eor w15,w7,w7,ror#11 rev32 v3.16b,v3.16b add w6,w6,w12 ror w11,w11,#6 eor w13,w7,w8 eor w15,w15,w7,ror#20 add v4.4s,v4.4s,v3.4s add w6,w6,w11 ldr w12,[sp,#52] and w14,w14,w13 ror w15,w15,#2 add w10,w10,w6 eor w14,w14,w8 add w5,w5,w12 add w6,w6,w15 and w12,w3,w10 bic w15,w4,w10 eor w11,w10,w10,ror#5 add w6,w6,w14 orr w12,w12,w15 eor w11,w11,w10,ror#19 eor w15,w6,w6,ror#11 add w5,w5,w12 ror w11,w11,#6 eor w14,w6,w7 eor w15,w15,w6,ror#20 add w5,w5,w11 ldr w12,[sp,#56] and w13,w13,w14 ror w15,w15,#2 add w9,w9,w5 eor w13,w13,w7 add w4,w4,w12 add w5,w5,w15 and w12,w10,w9 bic w15,w3,w9 eor w11,w9,w9,ror#5 add w5,w5,w13 orr w12,w12,w15 eor w11,w11,w9,ror#19 eor w15,w5,w5,ror#11 add w4,w4,w12 ror w11,w11,#6 eor w13,w5,w6 eor w15,w15,w5,ror#20 add w4,w4,w11 ldr w12,[sp,#60] and w14,w14,w13 ror w15,w15,#2 add w8,w8,w4 eor w14,w14,w6 add w3,w3,w12 add w4,w4,w15 and w12,w9,w8 bic w15,w10,w8 eor w11,w8,w8,ror#5 add w4,w4,w14 orr w12,w12,w15 eor w11,w11,w8,ror#19 eor w15,w4,w4,ror#11 add w3,w3,w12 ror w11,w11,#6 eor w14,w4,w5 eor w15,w15,w4,ror#20 add w3,w3,w11 and w13,w13,w14 ror w15,w15,#2 add w7,w7,w3 eor w13,w13,w5 st1 {v4.4s},[x17], #16 add w3,w3,w15 // h+=Sigma0(a) from 
the past ldp w11,w12,[x0,#0] add w3,w3,w13 // h+=Maj(a,b,c) from the past ldp w13,w14,[x0,#8] add w3,w3,w11 // accumulate add w4,w4,w12 ldp w11,w12,[x0,#16] add w5,w5,w13 add w6,w6,w14 ldp w13,w14,[x0,#24] add w7,w7,w11 add w8,w8,w12 ldr w12,[sp,#0] stp w3,w4,[x0,#0] add w9,w9,w13 mov w13,wzr stp w5,w6,[x0,#8] add w10,w10,w14 stp w7,w8,[x0,#16] eor w14,w4,w5 stp w9,w10,[x0,#24] mov w15,wzr mov x17,sp b.ne L_00_48 ldr x29,[x29] add sp,sp,#16*4+2*__SIZEOF_POINTER__ ret .globl _blst_sha256_emit .private_extern _blst_sha256_emit .align 4 _blst_sha256_emit: hint #34 ldp x4,x5,[x1] ldp x6,x7,[x1,#16] #ifndef __AARCH64EB__ rev x4,x4 rev x5,x5 rev x6,x6 rev x7,x7 #endif str w4,[x0,#4] lsr x4,x4,#32 str w5,[x0,#12] lsr x5,x5,#32 str w6,[x0,#20] lsr x6,x6,#32 str w7,[x0,#28] lsr x7,x7,#32 str w4,[x0,#0] str w5,[x0,#8] str w6,[x0,#16] str w7,[x0,#24] ret .globl _blst_sha256_bcopy .private_extern _blst_sha256_bcopy .align 4 _blst_sha256_bcopy: hint #34 Loop_bcopy: ldrb w3,[x1],#1 sub x2,x2,#1 strb w3,[x0],#1 cbnz x2,Loop_bcopy ret .globl _blst_sha256_hcopy .private_extern _blst_sha256_hcopy .align 4 _blst_sha256_hcopy: hint #34 ldp x4,x5,[x1] ldp x6,x7,[x1,#16] stp x4,x5,[x0] stp x6,x7,[x0,#16] ret ================================================ FILE: build/mach-o/sha256-portable-x86_64.s ================================================ .comm ___blst_platform_cap,4 .text .globl _blst_sha256_block_data_order .p2align 4 _blst_sha256_block_data_order: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 movq %rsp,%rbp .cfi_def_cfa_register %rbp #ifdef __BLST_PORTABLE__ testl $2,___blst_platform_cap(%rip) jnz L$blst_sha256_block_data_order$2 #endif pushq %rbx .cfi_offset %rbx,-24 pushq %r12 .cfi_offset %r12,-32 pushq %r13 .cfi_offset %r13,-40 pushq %r14 .cfi_offset %r14,-48 pushq %r15 .cfi_offset %r15,-56 shlq $4,%rdx subq $64+24,%rsp .cfi_def_cfa %rsp,144 leaq (%rsi,%rdx,4),%rdx movq %rdi,64+0(%rsp) movq %rsi,64+8(%rsp) movq %rdx,64+16(%rsp) movl 0(%rdi),%eax movl 4(%rdi),%ebx movl 8(%rdi),%ecx movl 12(%rdi),%edx movl 16(%rdi),%r8d movl 20(%rdi),%r9d movl 24(%rdi),%r10d movl 28(%rdi),%r11d jmp L$loop .p2align 4 L$loop: movl %ebx,%edi leaq K256(%rip),%rbp xorl %ecx,%edi movl 0(%rsi),%r12d movl %r8d,%r13d movl %eax,%r14d bswapl %r12d rorl $14,%r13d movl %r9d,%r15d xorl %r8d,%r13d rorl $9,%r14d xorl %r10d,%r15d movl %r12d,0(%rsp) xorl %eax,%r14d andl %r8d,%r15d rorl $5,%r13d addl %r11d,%r12d xorl %r10d,%r15d rorl $11,%r14d xorl %r8d,%r13d addl %r15d,%r12d movl %eax,%r15d addl 0(%rbp),%r12d xorl %eax,%r14d xorl %ebx,%r15d rorl $6,%r13d movl %ebx,%r11d andl %r15d,%edi rorl $2,%r14d addl %r13d,%r12d xorl %edi,%r11d addl %r12d,%edx addl %r12d,%r11d addl %r14d,%r11d movl 4(%rsi),%r12d movl %edx,%r13d movl %r11d,%r14d bswapl %r12d rorl $14,%r13d movl %r8d,%edi xorl %edx,%r13d rorl $9,%r14d xorl %r9d,%edi movl %r12d,4(%rsp) xorl %r11d,%r14d andl %edx,%edi rorl $5,%r13d addl %r10d,%r12d xorl %r9d,%edi rorl $11,%r14d xorl %edx,%r13d addl %edi,%r12d movl %r11d,%edi addl 4(%rbp),%r12d xorl %r11d,%r14d xorl %eax,%edi rorl $6,%r13d movl %eax,%r10d andl %edi,%r15d rorl $2,%r14d addl %r13d,%r12d xorl %r15d,%r10d addl %r12d,%ecx addl %r12d,%r10d addl %r14d,%r10d movl 8(%rsi),%r12d movl %ecx,%r13d movl %r10d,%r14d bswapl %r12d rorl $14,%r13d movl %edx,%r15d xorl %ecx,%r13d rorl $9,%r14d xorl %r8d,%r15d movl %r12d,8(%rsp) xorl %r10d,%r14d andl %ecx,%r15d rorl $5,%r13d addl %r9d,%r12d xorl %r8d,%r15d rorl $11,%r14d xorl %ecx,%r13d addl %r15d,%r12d movl %r10d,%r15d addl 
8(%rbp),%r12d xorl %r10d,%r14d xorl %r11d,%r15d rorl $6,%r13d movl %r11d,%r9d andl %r15d,%edi rorl $2,%r14d addl %r13d,%r12d xorl %edi,%r9d addl %r12d,%ebx addl %r12d,%r9d addl %r14d,%r9d movl 12(%rsi),%r12d movl %ebx,%r13d movl %r9d,%r14d bswapl %r12d rorl $14,%r13d movl %ecx,%edi xorl %ebx,%r13d rorl $9,%r14d xorl %edx,%edi movl %r12d,12(%rsp) xorl %r9d,%r14d andl %ebx,%edi rorl $5,%r13d addl %r8d,%r12d xorl %edx,%edi rorl $11,%r14d xorl %ebx,%r13d addl %edi,%r12d movl %r9d,%edi addl 12(%rbp),%r12d xorl %r9d,%r14d xorl %r10d,%edi rorl $6,%r13d movl %r10d,%r8d andl %edi,%r15d rorl $2,%r14d addl %r13d,%r12d xorl %r15d,%r8d addl %r12d,%eax addl %r12d,%r8d addl %r14d,%r8d movl 16(%rsi),%r12d movl %eax,%r13d movl %r8d,%r14d bswapl %r12d rorl $14,%r13d movl %ebx,%r15d xorl %eax,%r13d rorl $9,%r14d xorl %ecx,%r15d movl %r12d,16(%rsp) xorl %r8d,%r14d andl %eax,%r15d rorl $5,%r13d addl %edx,%r12d xorl %ecx,%r15d rorl $11,%r14d xorl %eax,%r13d addl %r15d,%r12d movl %r8d,%r15d addl 16(%rbp),%r12d xorl %r8d,%r14d xorl %r9d,%r15d rorl $6,%r13d movl %r9d,%edx andl %r15d,%edi rorl $2,%r14d addl %r13d,%r12d xorl %edi,%edx addl %r12d,%r11d addl %r12d,%edx addl %r14d,%edx movl 20(%rsi),%r12d movl %r11d,%r13d movl %edx,%r14d bswapl %r12d rorl $14,%r13d movl %eax,%edi xorl %r11d,%r13d rorl $9,%r14d xorl %ebx,%edi movl %r12d,20(%rsp) xorl %edx,%r14d andl %r11d,%edi rorl $5,%r13d addl %ecx,%r12d xorl %ebx,%edi rorl $11,%r14d xorl %r11d,%r13d addl %edi,%r12d movl %edx,%edi addl 20(%rbp),%r12d xorl %edx,%r14d xorl %r8d,%edi rorl $6,%r13d movl %r8d,%ecx andl %edi,%r15d rorl $2,%r14d addl %r13d,%r12d xorl %r15d,%ecx addl %r12d,%r10d addl %r12d,%ecx addl %r14d,%ecx movl 24(%rsi),%r12d movl %r10d,%r13d movl %ecx,%r14d bswapl %r12d rorl $14,%r13d movl %r11d,%r15d xorl %r10d,%r13d rorl $9,%r14d xorl %eax,%r15d movl %r12d,24(%rsp) xorl %ecx,%r14d andl %r10d,%r15d rorl $5,%r13d addl %ebx,%r12d xorl %eax,%r15d rorl $11,%r14d xorl %r10d,%r13d addl %r15d,%r12d movl %ecx,%r15d addl 24(%rbp),%r12d xorl %ecx,%r14d xorl %edx,%r15d rorl $6,%r13d movl %edx,%ebx andl %r15d,%edi rorl $2,%r14d addl %r13d,%r12d xorl %edi,%ebx addl %r12d,%r9d addl %r12d,%ebx addl %r14d,%ebx movl 28(%rsi),%r12d movl %r9d,%r13d movl %ebx,%r14d bswapl %r12d rorl $14,%r13d movl %r10d,%edi xorl %r9d,%r13d rorl $9,%r14d xorl %r11d,%edi movl %r12d,28(%rsp) xorl %ebx,%r14d andl %r9d,%edi rorl $5,%r13d addl %eax,%r12d xorl %r11d,%edi rorl $11,%r14d xorl %r9d,%r13d addl %edi,%r12d movl %ebx,%edi addl 28(%rbp),%r12d xorl %ebx,%r14d xorl %ecx,%edi rorl $6,%r13d movl %ecx,%eax andl %edi,%r15d rorl $2,%r14d addl %r13d,%r12d xorl %r15d,%eax addl %r12d,%r8d addl %r12d,%eax addl %r14d,%eax movl 32(%rsi),%r12d movl %r8d,%r13d movl %eax,%r14d bswapl %r12d rorl $14,%r13d movl %r9d,%r15d xorl %r8d,%r13d rorl $9,%r14d xorl %r10d,%r15d movl %r12d,32(%rsp) xorl %eax,%r14d andl %r8d,%r15d rorl $5,%r13d addl %r11d,%r12d xorl %r10d,%r15d rorl $11,%r14d xorl %r8d,%r13d addl %r15d,%r12d movl %eax,%r15d addl 32(%rbp),%r12d xorl %eax,%r14d xorl %ebx,%r15d rorl $6,%r13d movl %ebx,%r11d andl %r15d,%edi rorl $2,%r14d addl %r13d,%r12d xorl %edi,%r11d addl %r12d,%edx addl %r12d,%r11d addl %r14d,%r11d movl 36(%rsi),%r12d movl %edx,%r13d movl %r11d,%r14d bswapl %r12d rorl $14,%r13d movl %r8d,%edi xorl %edx,%r13d rorl $9,%r14d xorl %r9d,%edi movl %r12d,36(%rsp) xorl %r11d,%r14d andl %edx,%edi rorl $5,%r13d addl %r10d,%r12d xorl %r9d,%edi rorl $11,%r14d xorl %edx,%r13d addl %edi,%r12d movl %r11d,%edi addl 36(%rbp),%r12d xorl %r11d,%r14d xorl %eax,%edi rorl $6,%r13d movl %eax,%r10d andl 
%edi,%r15d rorl $2,%r14d addl %r13d,%r12d xorl %r15d,%r10d addl %r12d,%ecx addl %r12d,%r10d addl %r14d,%r10d movl 40(%rsi),%r12d movl %ecx,%r13d movl %r10d,%r14d bswapl %r12d rorl $14,%r13d movl %edx,%r15d xorl %ecx,%r13d rorl $9,%r14d xorl %r8d,%r15d movl %r12d,40(%rsp) xorl %r10d,%r14d andl %ecx,%r15d rorl $5,%r13d addl %r9d,%r12d xorl %r8d,%r15d rorl $11,%r14d xorl %ecx,%r13d addl %r15d,%r12d movl %r10d,%r15d addl 40(%rbp),%r12d xorl %r10d,%r14d xorl %r11d,%r15d rorl $6,%r13d movl %r11d,%r9d andl %r15d,%edi rorl $2,%r14d addl %r13d,%r12d xorl %edi,%r9d addl %r12d,%ebx addl %r12d,%r9d addl %r14d,%r9d movl 44(%rsi),%r12d movl %ebx,%r13d movl %r9d,%r14d bswapl %r12d rorl $14,%r13d movl %ecx,%edi xorl %ebx,%r13d rorl $9,%r14d xorl %edx,%edi movl %r12d,44(%rsp) xorl %r9d,%r14d andl %ebx,%edi rorl $5,%r13d addl %r8d,%r12d xorl %edx,%edi rorl $11,%r14d xorl %ebx,%r13d addl %edi,%r12d movl %r9d,%edi addl 44(%rbp),%r12d xorl %r9d,%r14d xorl %r10d,%edi rorl $6,%r13d movl %r10d,%r8d andl %edi,%r15d rorl $2,%r14d addl %r13d,%r12d xorl %r15d,%r8d addl %r12d,%eax addl %r12d,%r8d addl %r14d,%r8d movl 48(%rsi),%r12d movl %eax,%r13d movl %r8d,%r14d bswapl %r12d rorl $14,%r13d movl %ebx,%r15d xorl %eax,%r13d rorl $9,%r14d xorl %ecx,%r15d movl %r12d,48(%rsp) xorl %r8d,%r14d andl %eax,%r15d rorl $5,%r13d addl %edx,%r12d xorl %ecx,%r15d rorl $11,%r14d xorl %eax,%r13d addl %r15d,%r12d movl %r8d,%r15d addl 48(%rbp),%r12d xorl %r8d,%r14d xorl %r9d,%r15d rorl $6,%r13d movl %r9d,%edx andl %r15d,%edi rorl $2,%r14d addl %r13d,%r12d xorl %edi,%edx addl %r12d,%r11d addl %r12d,%edx addl %r14d,%edx movl 52(%rsi),%r12d movl %r11d,%r13d movl %edx,%r14d bswapl %r12d rorl $14,%r13d movl %eax,%edi xorl %r11d,%r13d rorl $9,%r14d xorl %ebx,%edi movl %r12d,52(%rsp) xorl %edx,%r14d andl %r11d,%edi rorl $5,%r13d addl %ecx,%r12d xorl %ebx,%edi rorl $11,%r14d xorl %r11d,%r13d addl %edi,%r12d movl %edx,%edi addl 52(%rbp),%r12d xorl %edx,%r14d xorl %r8d,%edi rorl $6,%r13d movl %r8d,%ecx andl %edi,%r15d rorl $2,%r14d addl %r13d,%r12d xorl %r15d,%ecx addl %r12d,%r10d addl %r12d,%ecx addl %r14d,%ecx movl 56(%rsi),%r12d movl %r10d,%r13d movl %ecx,%r14d bswapl %r12d rorl $14,%r13d movl %r11d,%r15d xorl %r10d,%r13d rorl $9,%r14d xorl %eax,%r15d movl %r12d,56(%rsp) xorl %ecx,%r14d andl %r10d,%r15d rorl $5,%r13d addl %ebx,%r12d xorl %eax,%r15d rorl $11,%r14d xorl %r10d,%r13d addl %r15d,%r12d movl %ecx,%r15d addl 56(%rbp),%r12d xorl %ecx,%r14d xorl %edx,%r15d rorl $6,%r13d movl %edx,%ebx andl %r15d,%edi rorl $2,%r14d addl %r13d,%r12d xorl %edi,%ebx addl %r12d,%r9d addl %r12d,%ebx addl %r14d,%ebx movl 60(%rsi),%r12d movl %r9d,%r13d movl %ebx,%r14d bswapl %r12d rorl $14,%r13d movl %r10d,%edi xorl %r9d,%r13d rorl $9,%r14d xorl %r11d,%edi movl %r12d,60(%rsp) xorl %ebx,%r14d andl %r9d,%edi rorl $5,%r13d addl %eax,%r12d xorl %r11d,%edi rorl $11,%r14d xorl %r9d,%r13d addl %edi,%r12d movl %ebx,%edi addl 60(%rbp),%r12d xorl %ebx,%r14d xorl %ecx,%edi rorl $6,%r13d movl %ecx,%eax andl %edi,%r15d rorl $2,%r14d addl %r13d,%r12d xorl %r15d,%eax addl %r12d,%r8d addl %r12d,%eax jmp L$rounds_16_xx .p2align 4 L$rounds_16_xx: movl 4(%rsp),%r13d movl 56(%rsp),%r15d movl %r13d,%r12d rorl $11,%r13d addl %r14d,%eax movl %r15d,%r14d rorl $2,%r15d xorl %r12d,%r13d shrl $3,%r12d rorl $7,%r13d xorl %r14d,%r15d shrl $10,%r14d rorl $17,%r15d xorl %r13d,%r12d xorl %r14d,%r15d addl 36(%rsp),%r12d addl 0(%rsp),%r12d movl %r8d,%r13d addl %r15d,%r12d movl %eax,%r14d rorl $14,%r13d movl %r9d,%r15d xorl %r8d,%r13d rorl $9,%r14d xorl %r10d,%r15d movl %r12d,0(%rsp) xorl 
%eax,%r14d andl %r8d,%r15d rorl $5,%r13d addl %r11d,%r12d xorl %r10d,%r15d rorl $11,%r14d xorl %r8d,%r13d addl %r15d,%r12d movl %eax,%r15d addl 64(%rbp),%r12d xorl %eax,%r14d xorl %ebx,%r15d rorl $6,%r13d movl %ebx,%r11d andl %r15d,%edi rorl $2,%r14d addl %r13d,%r12d xorl %edi,%r11d addl %r12d,%edx addl %r12d,%r11d movl 8(%rsp),%r13d movl 60(%rsp),%edi movl %r13d,%r12d rorl $11,%r13d addl %r14d,%r11d movl %edi,%r14d rorl $2,%edi xorl %r12d,%r13d shrl $3,%r12d rorl $7,%r13d xorl %r14d,%edi shrl $10,%r14d rorl $17,%edi xorl %r13d,%r12d xorl %r14d,%edi addl 40(%rsp),%r12d addl 4(%rsp),%r12d movl %edx,%r13d addl %edi,%r12d movl %r11d,%r14d rorl $14,%r13d movl %r8d,%edi xorl %edx,%r13d rorl $9,%r14d xorl %r9d,%edi movl %r12d,4(%rsp) xorl %r11d,%r14d andl %edx,%edi rorl $5,%r13d addl %r10d,%r12d xorl %r9d,%edi rorl $11,%r14d xorl %edx,%r13d addl %edi,%r12d movl %r11d,%edi addl 68(%rbp),%r12d xorl %r11d,%r14d xorl %eax,%edi rorl $6,%r13d movl %eax,%r10d andl %edi,%r15d rorl $2,%r14d addl %r13d,%r12d xorl %r15d,%r10d addl %r12d,%ecx addl %r12d,%r10d movl 12(%rsp),%r13d movl 0(%rsp),%r15d movl %r13d,%r12d rorl $11,%r13d addl %r14d,%r10d movl %r15d,%r14d rorl $2,%r15d xorl %r12d,%r13d shrl $3,%r12d rorl $7,%r13d xorl %r14d,%r15d shrl $10,%r14d rorl $17,%r15d xorl %r13d,%r12d xorl %r14d,%r15d addl 44(%rsp),%r12d addl 8(%rsp),%r12d movl %ecx,%r13d addl %r15d,%r12d movl %r10d,%r14d rorl $14,%r13d movl %edx,%r15d xorl %ecx,%r13d rorl $9,%r14d xorl %r8d,%r15d movl %r12d,8(%rsp) xorl %r10d,%r14d andl %ecx,%r15d rorl $5,%r13d addl %r9d,%r12d xorl %r8d,%r15d rorl $11,%r14d xorl %ecx,%r13d addl %r15d,%r12d movl %r10d,%r15d addl 72(%rbp),%r12d xorl %r10d,%r14d xorl %r11d,%r15d rorl $6,%r13d movl %r11d,%r9d andl %r15d,%edi rorl $2,%r14d addl %r13d,%r12d xorl %edi,%r9d addl %r12d,%ebx addl %r12d,%r9d movl 16(%rsp),%r13d movl 4(%rsp),%edi movl %r13d,%r12d rorl $11,%r13d addl %r14d,%r9d movl %edi,%r14d rorl $2,%edi xorl %r12d,%r13d shrl $3,%r12d rorl $7,%r13d xorl %r14d,%edi shrl $10,%r14d rorl $17,%edi xorl %r13d,%r12d xorl %r14d,%edi addl 48(%rsp),%r12d addl 12(%rsp),%r12d movl %ebx,%r13d addl %edi,%r12d movl %r9d,%r14d rorl $14,%r13d movl %ecx,%edi xorl %ebx,%r13d rorl $9,%r14d xorl %edx,%edi movl %r12d,12(%rsp) xorl %r9d,%r14d andl %ebx,%edi rorl $5,%r13d addl %r8d,%r12d xorl %edx,%edi rorl $11,%r14d xorl %ebx,%r13d addl %edi,%r12d movl %r9d,%edi addl 76(%rbp),%r12d xorl %r9d,%r14d xorl %r10d,%edi rorl $6,%r13d movl %r10d,%r8d andl %edi,%r15d rorl $2,%r14d addl %r13d,%r12d xorl %r15d,%r8d addl %r12d,%eax addl %r12d,%r8d movl 20(%rsp),%r13d movl 8(%rsp),%r15d movl %r13d,%r12d rorl $11,%r13d addl %r14d,%r8d movl %r15d,%r14d rorl $2,%r15d xorl %r12d,%r13d shrl $3,%r12d rorl $7,%r13d xorl %r14d,%r15d shrl $10,%r14d rorl $17,%r15d xorl %r13d,%r12d xorl %r14d,%r15d addl 52(%rsp),%r12d addl 16(%rsp),%r12d movl %eax,%r13d addl %r15d,%r12d movl %r8d,%r14d rorl $14,%r13d movl %ebx,%r15d xorl %eax,%r13d rorl $9,%r14d xorl %ecx,%r15d movl %r12d,16(%rsp) xorl %r8d,%r14d andl %eax,%r15d rorl $5,%r13d addl %edx,%r12d xorl %ecx,%r15d rorl $11,%r14d xorl %eax,%r13d addl %r15d,%r12d movl %r8d,%r15d addl 80(%rbp),%r12d xorl %r8d,%r14d xorl %r9d,%r15d rorl $6,%r13d movl %r9d,%edx andl %r15d,%edi rorl $2,%r14d addl %r13d,%r12d xorl %edi,%edx addl %r12d,%r11d addl %r12d,%edx movl 24(%rsp),%r13d movl 12(%rsp),%edi movl %r13d,%r12d rorl $11,%r13d addl %r14d,%edx movl %edi,%r14d rorl $2,%edi xorl %r12d,%r13d shrl $3,%r12d rorl $7,%r13d xorl %r14d,%edi shrl $10,%r14d rorl $17,%edi xorl %r13d,%r12d xorl %r14d,%edi addl 56(%rsp),%r12d 
addl 20(%rsp),%r12d movl %r11d,%r13d addl %edi,%r12d movl %edx,%r14d rorl $14,%r13d movl %eax,%edi xorl %r11d,%r13d rorl $9,%r14d xorl %ebx,%edi movl %r12d,20(%rsp) xorl %edx,%r14d andl %r11d,%edi rorl $5,%r13d addl %ecx,%r12d xorl %ebx,%edi rorl $11,%r14d xorl %r11d,%r13d addl %edi,%r12d movl %edx,%edi addl 84(%rbp),%r12d xorl %edx,%r14d xorl %r8d,%edi rorl $6,%r13d movl %r8d,%ecx andl %edi,%r15d rorl $2,%r14d addl %r13d,%r12d xorl %r15d,%ecx addl %r12d,%r10d addl %r12d,%ecx movl 28(%rsp),%r13d movl 16(%rsp),%r15d movl %r13d,%r12d rorl $11,%r13d addl %r14d,%ecx movl %r15d,%r14d rorl $2,%r15d xorl %r12d,%r13d shrl $3,%r12d rorl $7,%r13d xorl %r14d,%r15d shrl $10,%r14d rorl $17,%r15d xorl %r13d,%r12d xorl %r14d,%r15d addl 60(%rsp),%r12d addl 24(%rsp),%r12d movl %r10d,%r13d addl %r15d,%r12d movl %ecx,%r14d rorl $14,%r13d movl %r11d,%r15d xorl %r10d,%r13d rorl $9,%r14d xorl %eax,%r15d movl %r12d,24(%rsp) xorl %ecx,%r14d andl %r10d,%r15d rorl $5,%r13d addl %ebx,%r12d xorl %eax,%r15d rorl $11,%r14d xorl %r10d,%r13d addl %r15d,%r12d movl %ecx,%r15d addl 88(%rbp),%r12d xorl %ecx,%r14d xorl %edx,%r15d rorl $6,%r13d movl %edx,%ebx andl %r15d,%edi rorl $2,%r14d addl %r13d,%r12d xorl %edi,%ebx addl %r12d,%r9d addl %r12d,%ebx movl 32(%rsp),%r13d movl 20(%rsp),%edi movl %r13d,%r12d rorl $11,%r13d addl %r14d,%ebx movl %edi,%r14d rorl $2,%edi xorl %r12d,%r13d shrl $3,%r12d rorl $7,%r13d xorl %r14d,%edi shrl $10,%r14d rorl $17,%edi xorl %r13d,%r12d xorl %r14d,%edi addl 0(%rsp),%r12d addl 28(%rsp),%r12d movl %r9d,%r13d addl %edi,%r12d movl %ebx,%r14d rorl $14,%r13d movl %r10d,%edi xorl %r9d,%r13d rorl $9,%r14d xorl %r11d,%edi movl %r12d,28(%rsp) xorl %ebx,%r14d andl %r9d,%edi rorl $5,%r13d addl %eax,%r12d xorl %r11d,%edi rorl $11,%r14d xorl %r9d,%r13d addl %edi,%r12d movl %ebx,%edi addl 92(%rbp),%r12d xorl %ebx,%r14d xorl %ecx,%edi rorl $6,%r13d movl %ecx,%eax andl %edi,%r15d rorl $2,%r14d addl %r13d,%r12d xorl %r15d,%eax addl %r12d,%r8d addl %r12d,%eax movl 36(%rsp),%r13d movl 24(%rsp),%r15d movl %r13d,%r12d rorl $11,%r13d addl %r14d,%eax movl %r15d,%r14d rorl $2,%r15d xorl %r12d,%r13d shrl $3,%r12d rorl $7,%r13d xorl %r14d,%r15d shrl $10,%r14d rorl $17,%r15d xorl %r13d,%r12d xorl %r14d,%r15d addl 4(%rsp),%r12d addl 32(%rsp),%r12d movl %r8d,%r13d addl %r15d,%r12d movl %eax,%r14d rorl $14,%r13d movl %r9d,%r15d xorl %r8d,%r13d rorl $9,%r14d xorl %r10d,%r15d movl %r12d,32(%rsp) xorl %eax,%r14d andl %r8d,%r15d rorl $5,%r13d addl %r11d,%r12d xorl %r10d,%r15d rorl $11,%r14d xorl %r8d,%r13d addl %r15d,%r12d movl %eax,%r15d addl 96(%rbp),%r12d xorl %eax,%r14d xorl %ebx,%r15d rorl $6,%r13d movl %ebx,%r11d andl %r15d,%edi rorl $2,%r14d addl %r13d,%r12d xorl %edi,%r11d addl %r12d,%edx addl %r12d,%r11d movl 40(%rsp),%r13d movl 28(%rsp),%edi movl %r13d,%r12d rorl $11,%r13d addl %r14d,%r11d movl %edi,%r14d rorl $2,%edi xorl %r12d,%r13d shrl $3,%r12d rorl $7,%r13d xorl %r14d,%edi shrl $10,%r14d rorl $17,%edi xorl %r13d,%r12d xorl %r14d,%edi addl 8(%rsp),%r12d addl 36(%rsp),%r12d movl %edx,%r13d addl %edi,%r12d movl %r11d,%r14d rorl $14,%r13d movl %r8d,%edi xorl %edx,%r13d rorl $9,%r14d xorl %r9d,%edi movl %r12d,36(%rsp) xorl %r11d,%r14d andl %edx,%edi rorl $5,%r13d addl %r10d,%r12d xorl %r9d,%edi rorl $11,%r14d xorl %edx,%r13d addl %edi,%r12d movl %r11d,%edi addl 100(%rbp),%r12d xorl %r11d,%r14d xorl %eax,%edi rorl $6,%r13d movl %eax,%r10d andl %edi,%r15d rorl $2,%r14d addl %r13d,%r12d xorl %r15d,%r10d addl %r12d,%ecx addl %r12d,%r10d movl 44(%rsp),%r13d movl 32(%rsp),%r15d movl %r13d,%r12d rorl $11,%r13d addl 
%r14d,%r10d movl %r15d,%r14d rorl $2,%r15d xorl %r12d,%r13d shrl $3,%r12d rorl $7,%r13d xorl %r14d,%r15d shrl $10,%r14d rorl $17,%r15d xorl %r13d,%r12d xorl %r14d,%r15d addl 12(%rsp),%r12d addl 40(%rsp),%r12d movl %ecx,%r13d addl %r15d,%r12d movl %r10d,%r14d rorl $14,%r13d movl %edx,%r15d xorl %ecx,%r13d rorl $9,%r14d xorl %r8d,%r15d movl %r12d,40(%rsp) xorl %r10d,%r14d andl %ecx,%r15d rorl $5,%r13d addl %r9d,%r12d xorl %r8d,%r15d rorl $11,%r14d xorl %ecx,%r13d addl %r15d,%r12d movl %r10d,%r15d addl 104(%rbp),%r12d xorl %r10d,%r14d xorl %r11d,%r15d rorl $6,%r13d movl %r11d,%r9d andl %r15d,%edi rorl $2,%r14d addl %r13d,%r12d xorl %edi,%r9d addl %r12d,%ebx addl %r12d,%r9d movl 48(%rsp),%r13d movl 36(%rsp),%edi movl %r13d,%r12d rorl $11,%r13d addl %r14d,%r9d movl %edi,%r14d rorl $2,%edi xorl %r12d,%r13d shrl $3,%r12d rorl $7,%r13d xorl %r14d,%edi shrl $10,%r14d rorl $17,%edi xorl %r13d,%r12d xorl %r14d,%edi addl 16(%rsp),%r12d addl 44(%rsp),%r12d movl %ebx,%r13d addl %edi,%r12d movl %r9d,%r14d rorl $14,%r13d movl %ecx,%edi xorl %ebx,%r13d rorl $9,%r14d xorl %edx,%edi movl %r12d,44(%rsp) xorl %r9d,%r14d andl %ebx,%edi rorl $5,%r13d addl %r8d,%r12d xorl %edx,%edi rorl $11,%r14d xorl %ebx,%r13d addl %edi,%r12d movl %r9d,%edi addl 108(%rbp),%r12d xorl %r9d,%r14d xorl %r10d,%edi rorl $6,%r13d movl %r10d,%r8d andl %edi,%r15d rorl $2,%r14d addl %r13d,%r12d xorl %r15d,%r8d addl %r12d,%eax addl %r12d,%r8d movl 52(%rsp),%r13d movl 40(%rsp),%r15d movl %r13d,%r12d rorl $11,%r13d addl %r14d,%r8d movl %r15d,%r14d rorl $2,%r15d xorl %r12d,%r13d shrl $3,%r12d rorl $7,%r13d xorl %r14d,%r15d shrl $10,%r14d rorl $17,%r15d xorl %r13d,%r12d xorl %r14d,%r15d addl 20(%rsp),%r12d addl 48(%rsp),%r12d movl %eax,%r13d addl %r15d,%r12d movl %r8d,%r14d rorl $14,%r13d movl %ebx,%r15d xorl %eax,%r13d rorl $9,%r14d xorl %ecx,%r15d movl %r12d,48(%rsp) xorl %r8d,%r14d andl %eax,%r15d rorl $5,%r13d addl %edx,%r12d xorl %ecx,%r15d rorl $11,%r14d xorl %eax,%r13d addl %r15d,%r12d movl %r8d,%r15d addl 112(%rbp),%r12d xorl %r8d,%r14d xorl %r9d,%r15d rorl $6,%r13d movl %r9d,%edx andl %r15d,%edi rorl $2,%r14d addl %r13d,%r12d xorl %edi,%edx addl %r12d,%r11d addl %r12d,%edx movl 56(%rsp),%r13d movl 44(%rsp),%edi movl %r13d,%r12d rorl $11,%r13d addl %r14d,%edx movl %edi,%r14d rorl $2,%edi xorl %r12d,%r13d shrl $3,%r12d rorl $7,%r13d xorl %r14d,%edi shrl $10,%r14d rorl $17,%edi xorl %r13d,%r12d xorl %r14d,%edi addl 24(%rsp),%r12d addl 52(%rsp),%r12d movl %r11d,%r13d addl %edi,%r12d movl %edx,%r14d rorl $14,%r13d movl %eax,%edi xorl %r11d,%r13d rorl $9,%r14d xorl %ebx,%edi movl %r12d,52(%rsp) xorl %edx,%r14d andl %r11d,%edi rorl $5,%r13d addl %ecx,%r12d xorl %ebx,%edi rorl $11,%r14d xorl %r11d,%r13d addl %edi,%r12d movl %edx,%edi addl 116(%rbp),%r12d xorl %edx,%r14d xorl %r8d,%edi rorl $6,%r13d movl %r8d,%ecx andl %edi,%r15d rorl $2,%r14d addl %r13d,%r12d xorl %r15d,%ecx addl %r12d,%r10d addl %r12d,%ecx movl 60(%rsp),%r13d movl 48(%rsp),%r15d movl %r13d,%r12d rorl $11,%r13d addl %r14d,%ecx movl %r15d,%r14d rorl $2,%r15d xorl %r12d,%r13d shrl $3,%r12d rorl $7,%r13d xorl %r14d,%r15d shrl $10,%r14d rorl $17,%r15d xorl %r13d,%r12d xorl %r14d,%r15d addl 28(%rsp),%r12d addl 56(%rsp),%r12d movl %r10d,%r13d addl %r15d,%r12d movl %ecx,%r14d rorl $14,%r13d movl %r11d,%r15d xorl %r10d,%r13d rorl $9,%r14d xorl %eax,%r15d movl %r12d,56(%rsp) xorl %ecx,%r14d andl %r10d,%r15d rorl $5,%r13d addl %ebx,%r12d xorl %eax,%r15d rorl $11,%r14d xorl %r10d,%r13d addl %r15d,%r12d movl %ecx,%r15d addl 120(%rbp),%r12d xorl %ecx,%r14d xorl %edx,%r15d rorl $6,%r13d 
movl %edx,%ebx andl %r15d,%edi rorl $2,%r14d addl %r13d,%r12d xorl %edi,%ebx addl %r12d,%r9d addl %r12d,%ebx movl 0(%rsp),%r13d movl 52(%rsp),%edi movl %r13d,%r12d rorl $11,%r13d addl %r14d,%ebx movl %edi,%r14d rorl $2,%edi xorl %r12d,%r13d shrl $3,%r12d rorl $7,%r13d xorl %r14d,%edi shrl $10,%r14d rorl $17,%edi xorl %r13d,%r12d xorl %r14d,%edi addl 32(%rsp),%r12d addl 60(%rsp),%r12d movl %r9d,%r13d addl %edi,%r12d movl %ebx,%r14d rorl $14,%r13d movl %r10d,%edi xorl %r9d,%r13d rorl $9,%r14d xorl %r11d,%edi movl %r12d,60(%rsp) xorl %ebx,%r14d andl %r9d,%edi rorl $5,%r13d addl %eax,%r12d xorl %r11d,%edi rorl $11,%r14d xorl %r9d,%r13d addl %edi,%r12d movl %ebx,%edi addl 124(%rbp),%r12d xorl %ebx,%r14d xorl %ecx,%edi rorl $6,%r13d movl %ecx,%eax andl %edi,%r15d rorl $2,%r14d addl %r13d,%r12d xorl %r15d,%eax addl %r12d,%r8d addl %r12d,%eax leaq 64(%rbp),%rbp cmpb $0x19,3(%rbp) jnz L$rounds_16_xx movq 64+0(%rsp),%rdi addl %r14d,%eax leaq 64(%rsi),%rsi addl 0(%rdi),%eax addl 4(%rdi),%ebx addl 8(%rdi),%ecx addl 12(%rdi),%edx addl 16(%rdi),%r8d addl 20(%rdi),%r9d addl 24(%rdi),%r10d addl 28(%rdi),%r11d cmpq 64+16(%rsp),%rsi movl %eax,0(%rdi) movl %ebx,4(%rdi) movl %ecx,8(%rdi) movl %edx,12(%rdi) movl %r8d,16(%rdi) movl %r9d,20(%rdi) movl %r10d,24(%rdi) movl %r11d,28(%rdi) jb L$loop leaq 64+24+48(%rsp),%r11 .cfi_def_cfa %r11,8 movq 64+24(%rsp),%r15 movq -40(%r11),%r14 movq -32(%r11),%r13 movq -24(%r11),%r12 movq -16(%r11),%rbx movq -8(%r11),%rbp .cfi_restore %r12 .cfi_restore %r13 .cfi_restore %r14 .cfi_restore %r15 .cfi_restore %rbp .cfi_restore %rbx leaq (%r11),%rsp #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc #ifndef __BLST_PORTABLE__ .section __TEXT,__const .p2align 6 K256: .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 .byte 83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,64,100,111,116,45,97,115,109,0 .globl _blst_sha256_emit .private_extern _blst_sha256_emit .p2align 4 _blst_sha256_emit: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa movq 0(%rsi),%r8 movq 8(%rsi),%r9 movq 16(%rsi),%r10 bswapq %r8 movq 24(%rsi),%r11 bswapq %r9 movl %r8d,4(%rdi) bswapq %r10 movl %r9d,12(%rdi) bswapq %r11 movl %r10d,20(%rdi) shrq $32,%r8 movl %r11d,28(%rdi) shrq $32,%r9 movl %r8d,0(%rdi) shrq $32,%r10 movl %r9d,8(%rdi) shrq $32,%r11 movl %r10d,16(%rdi) movl %r11d,24(%rdi) #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .globl _blst_sha256_bcopy .private_extern _blst_sha256_bcopy .p2align 4 _blst_sha256_bcopy: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa subq %rsi,%rdi L$oop_bcopy: movzbl (%rsi),%eax leaq 1(%rsi),%rsi movb %al,-1(%rdi,%rsi,1) decq %rdx jnz 
L$oop_bcopy #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .globl _blst_sha256_hcopy .private_extern _blst_sha256_hcopy .p2align 4 _blst_sha256_hcopy: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa movq 0(%rsi),%r8 movq 8(%rsi),%r9 movq 16(%rsi),%r10 movq 24(%rsi),%r11 movq %r8,0(%rdi) movq %r9,8(%rdi) movq %r10,16(%rdi) movq %r11,24(%rdi) #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc #endif ================================================ FILE: build/mach-o/sha256-x86_64.s ================================================ .comm ___blst_platform_cap,4 .section __TEXT,__const .p2align 6 K256: .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f .long 0x03020100,0x0b0a0908,0xffffffff,0xffffffff .long 0xffffffff,0xffffffff,0x03020100,0x0b0a0908 .byte 83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,64,100,111,116,45,97,115,109,0 .text .globl _blst_sha256_block_data_order_shaext .private_extern _blst_sha256_block_data_order_shaext .p2align 6 _blst_sha256_block_data_order_shaext: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 movq %rsp,%rbp .cfi_def_cfa_register %rbp L$blst_sha256_block_data_order$2: #ifdef __SGX_LVI_HARDENING__ lfence #endif leaq K256+128(%rip),%rcx movdqu (%rdi),%xmm1 movdqu 16(%rdi),%xmm2 movdqa 256-128(%rcx),%xmm7 pshufd $0x1b,%xmm1,%xmm0 pshufd $0xb1,%xmm1,%xmm1 pshufd $0x1b,%xmm2,%xmm2 movdqa %xmm7,%xmm8 .byte 102,15,58,15,202,8 punpcklqdq %xmm0,%xmm2 jmp L$oop_shaext .p2align 4 L$oop_shaext: movdqu (%rsi),%xmm3 movdqu 16(%rsi),%xmm4 movdqu 32(%rsi),%xmm5 .byte 102,15,56,0,223 movdqu 48(%rsi),%xmm6 movdqa 0-128(%rcx),%xmm0 paddd %xmm3,%xmm0 .byte 102,15,56,0,231 movdqa %xmm2,%xmm10 .byte 15,56,203,209 pshufd $0x0e,%xmm0,%xmm0 nop movdqa %xmm1,%xmm9 .byte 15,56,203,202 movdqa 16-128(%rcx),%xmm0 paddd %xmm4,%xmm0 .byte 102,15,56,0,239 .byte 15,56,203,209 pshufd $0x0e,%xmm0,%xmm0 leaq 64(%rsi),%rsi .byte 15,56,204,220 .byte 15,56,203,202 movdqa 32-128(%rcx),%xmm0 paddd %xmm5,%xmm0 .byte 102,15,56,0,247 .byte 15,56,203,209 pshufd $0x0e,%xmm0,%xmm0 movdqa %xmm6,%xmm7 .byte 102,15,58,15,253,4 nop paddd %xmm7,%xmm3 .byte 15,56,204,229 .byte 15,56,203,202 movdqa 48-128(%rcx),%xmm0 paddd %xmm6,%xmm0 .byte 15,56,205,222 .byte 15,56,203,209 pshufd $0x0e,%xmm0,%xmm0 movdqa %xmm3,%xmm7 .byte 102,15,58,15,254,4 nop paddd %xmm7,%xmm4 .byte 15,56,204,238 .byte 15,56,203,202 movdqa 64-128(%rcx),%xmm0 paddd %xmm3,%xmm0 .byte 15,56,205,227 .byte 15,56,203,209 pshufd $0x0e,%xmm0,%xmm0 movdqa %xmm4,%xmm7 .byte 102,15,58,15,251,4 nop paddd 
%xmm7,%xmm5 .byte 15,56,204,243 .byte 15,56,203,202 movdqa 80-128(%rcx),%xmm0 paddd %xmm4,%xmm0 .byte 15,56,205,236 .byte 15,56,203,209 pshufd $0x0e,%xmm0,%xmm0 movdqa %xmm5,%xmm7 .byte 102,15,58,15,252,4 nop paddd %xmm7,%xmm6 .byte 15,56,204,220 .byte 15,56,203,202 movdqa 96-128(%rcx),%xmm0 paddd %xmm5,%xmm0 .byte 15,56,205,245 .byte 15,56,203,209 pshufd $0x0e,%xmm0,%xmm0 movdqa %xmm6,%xmm7 .byte 102,15,58,15,253,4 nop paddd %xmm7,%xmm3 .byte 15,56,204,229 .byte 15,56,203,202 movdqa 112-128(%rcx),%xmm0 paddd %xmm6,%xmm0 .byte 15,56,205,222 .byte 15,56,203,209 pshufd $0x0e,%xmm0,%xmm0 movdqa %xmm3,%xmm7 .byte 102,15,58,15,254,4 nop paddd %xmm7,%xmm4 .byte 15,56,204,238 .byte 15,56,203,202 movdqa 128-128(%rcx),%xmm0 paddd %xmm3,%xmm0 .byte 15,56,205,227 .byte 15,56,203,209 pshufd $0x0e,%xmm0,%xmm0 movdqa %xmm4,%xmm7 .byte 102,15,58,15,251,4 nop paddd %xmm7,%xmm5 .byte 15,56,204,243 .byte 15,56,203,202 movdqa 144-128(%rcx),%xmm0 paddd %xmm4,%xmm0 .byte 15,56,205,236 .byte 15,56,203,209 pshufd $0x0e,%xmm0,%xmm0 movdqa %xmm5,%xmm7 .byte 102,15,58,15,252,4 nop paddd %xmm7,%xmm6 .byte 15,56,204,220 .byte 15,56,203,202 movdqa 160-128(%rcx),%xmm0 paddd %xmm5,%xmm0 .byte 15,56,205,245 .byte 15,56,203,209 pshufd $0x0e,%xmm0,%xmm0 movdqa %xmm6,%xmm7 .byte 102,15,58,15,253,4 nop paddd %xmm7,%xmm3 .byte 15,56,204,229 .byte 15,56,203,202 movdqa 176-128(%rcx),%xmm0 paddd %xmm6,%xmm0 .byte 15,56,205,222 .byte 15,56,203,209 pshufd $0x0e,%xmm0,%xmm0 movdqa %xmm3,%xmm7 .byte 102,15,58,15,254,4 nop paddd %xmm7,%xmm4 .byte 15,56,204,238 .byte 15,56,203,202 movdqa 192-128(%rcx),%xmm0 paddd %xmm3,%xmm0 .byte 15,56,205,227 .byte 15,56,203,209 pshufd $0x0e,%xmm0,%xmm0 movdqa %xmm4,%xmm7 .byte 102,15,58,15,251,4 nop paddd %xmm7,%xmm5 .byte 15,56,204,243 .byte 15,56,203,202 movdqa 208-128(%rcx),%xmm0 paddd %xmm4,%xmm0 .byte 15,56,205,236 .byte 15,56,203,209 pshufd $0x0e,%xmm0,%xmm0 movdqa %xmm5,%xmm7 .byte 102,15,58,15,252,4 .byte 15,56,203,202 paddd %xmm7,%xmm6 movdqa 224-128(%rcx),%xmm0 paddd %xmm5,%xmm0 .byte 15,56,203,209 pshufd $0x0e,%xmm0,%xmm0 .byte 15,56,205,245 movdqa %xmm8,%xmm7 .byte 15,56,203,202 movdqa 240-128(%rcx),%xmm0 paddd %xmm6,%xmm0 nop .byte 15,56,203,209 pshufd $0x0e,%xmm0,%xmm0 decq %rdx nop .byte 15,56,203,202 paddd %xmm10,%xmm2 paddd %xmm9,%xmm1 jnz L$oop_shaext pshufd $0xb1,%xmm2,%xmm2 pshufd $0x1b,%xmm1,%xmm7 pshufd $0xb1,%xmm1,%xmm1 punpckhqdq %xmm2,%xmm1 .byte 102,15,58,15,215,8 movdqu %xmm1,(%rdi) movdqu %xmm2,16(%rdi) .cfi_def_cfa_register %rsp popq %rbp .cfi_adjust_cfa_offset -8 .cfi_restore %rbp #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .globl _blst_sha256_block_data_order .private_extern _blst_sha256_block_data_order .p2align 6 _blst_sha256_block_data_order: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 movq %rsp,%rbp .cfi_def_cfa_register %rbp #ifndef __SGX_LVI_HARDENING__ testl $2,___blst_platform_cap(%rip) jnz L$blst_sha256_block_data_order$2 #endif pushq %rbx .cfi_offset %rbx,-24 pushq %r12 .cfi_offset %r12,-32 pushq %r13 .cfi_offset %r13,-40 pushq %r14 .cfi_offset %r14,-48 pushq %r15 .cfi_offset %r15,-56 shlq $4,%rdx subq $24,%rsp leaq (%rsi,%rdx,4),%rdx movq %rdi,-64(%rbp) movq %rdx,-48(%rbp) leaq -64(%rsp),%rsp #ifdef __SGX_LVI_HARDENING__ lfence #endif movl 0(%rdi),%eax andq $-64,%rsp movl 4(%rdi),%ebx movl 8(%rdi),%ecx movl 12(%rdi),%edx movl 16(%rdi),%r8d movl 20(%rdi),%r9d movl 24(%rdi),%r10d movl 28(%rdi),%r11d jmp L$loop_ssse3 .p2align 4 L$loop_ssse3: movdqa 
K256+256(%rip),%xmm7 movq %rsi,-56(%rbp) movdqu 0(%rsi),%xmm0 movdqu 16(%rsi),%xmm1 movdqu 32(%rsi),%xmm2 .byte 102,15,56,0,199 movdqu 48(%rsi),%xmm3 leaq K256(%rip),%rsi .byte 102,15,56,0,207 movdqa 0(%rsi),%xmm4 movdqa 16(%rsi),%xmm5 .byte 102,15,56,0,215 paddd %xmm0,%xmm4 movdqa 32(%rsi),%xmm6 .byte 102,15,56,0,223 movdqa 48(%rsi),%xmm7 paddd %xmm1,%xmm5 paddd %xmm2,%xmm6 paddd %xmm3,%xmm7 movdqa %xmm4,0(%rsp) movl %eax,%r14d movdqa %xmm5,16(%rsp) movl %ebx,%edi movdqa %xmm6,32(%rsp) xorl %ecx,%edi movdqa %xmm7,48(%rsp) movl %r8d,%r13d jmp L$ssse3_00_47 .p2align 4 L$ssse3_00_47: subq $-64,%rsi rorl $14,%r13d movdqa %xmm1,%xmm4 movl %r14d,%eax movl %r9d,%r12d movdqa %xmm3,%xmm7 rorl $9,%r14d xorl %r8d,%r13d xorl %r10d,%r12d rorl $5,%r13d xorl %eax,%r14d .byte 102,15,58,15,224,4 andl %r8d,%r12d xorl %r8d,%r13d .byte 102,15,58,15,250,4 addl 0(%rsp),%r11d movl %eax,%r15d xorl %r10d,%r12d rorl $11,%r14d movdqa %xmm4,%xmm5 xorl %ebx,%r15d addl %r12d,%r11d movdqa %xmm4,%xmm6 rorl $6,%r13d andl %r15d,%edi psrld $3,%xmm4 xorl %eax,%r14d addl %r13d,%r11d xorl %ebx,%edi paddd %xmm7,%xmm0 rorl $2,%r14d addl %r11d,%edx psrld $7,%xmm6 addl %edi,%r11d movl %edx,%r13d pshufd $250,%xmm3,%xmm7 addl %r11d,%r14d rorl $14,%r13d pslld $14,%xmm5 movl %r14d,%r11d movl %r8d,%r12d pxor %xmm6,%xmm4 rorl $9,%r14d xorl %edx,%r13d xorl %r9d,%r12d rorl $5,%r13d psrld $11,%xmm6 xorl %r11d,%r14d pxor %xmm5,%xmm4 andl %edx,%r12d xorl %edx,%r13d pslld $11,%xmm5 addl 4(%rsp),%r10d movl %r11d,%edi pxor %xmm6,%xmm4 xorl %r9d,%r12d rorl $11,%r14d movdqa %xmm7,%xmm6 xorl %eax,%edi addl %r12d,%r10d pxor %xmm5,%xmm4 rorl $6,%r13d andl %edi,%r15d xorl %r11d,%r14d psrld $10,%xmm7 addl %r13d,%r10d xorl %eax,%r15d paddd %xmm4,%xmm0 rorl $2,%r14d addl %r10d,%ecx psrlq $17,%xmm6 addl %r15d,%r10d movl %ecx,%r13d addl %r10d,%r14d pxor %xmm6,%xmm7 rorl $14,%r13d movl %r14d,%r10d movl %edx,%r12d rorl $9,%r14d psrlq $2,%xmm6 xorl %ecx,%r13d xorl %r8d,%r12d pxor %xmm6,%xmm7 rorl $5,%r13d xorl %r10d,%r14d andl %ecx,%r12d pshufd $128,%xmm7,%xmm7 xorl %ecx,%r13d addl 8(%rsp),%r9d movl %r10d,%r15d psrldq $8,%xmm7 xorl %r8d,%r12d rorl $11,%r14d xorl %r11d,%r15d addl %r12d,%r9d rorl $6,%r13d paddd %xmm7,%xmm0 andl %r15d,%edi xorl %r10d,%r14d addl %r13d,%r9d pshufd $80,%xmm0,%xmm7 xorl %r11d,%edi rorl $2,%r14d addl %r9d,%ebx movdqa %xmm7,%xmm6 addl %edi,%r9d movl %ebx,%r13d psrld $10,%xmm7 addl %r9d,%r14d rorl $14,%r13d psrlq $17,%xmm6 movl %r14d,%r9d movl %ecx,%r12d pxor %xmm6,%xmm7 rorl $9,%r14d xorl %ebx,%r13d xorl %edx,%r12d rorl $5,%r13d xorl %r9d,%r14d psrlq $2,%xmm6 andl %ebx,%r12d xorl %ebx,%r13d addl 12(%rsp),%r8d pxor %xmm6,%xmm7 movl %r9d,%edi xorl %edx,%r12d rorl $11,%r14d pshufd $8,%xmm7,%xmm7 xorl %r10d,%edi addl %r12d,%r8d movdqa 0(%rsi),%xmm6 rorl $6,%r13d andl %edi,%r15d pslldq $8,%xmm7 xorl %r9d,%r14d addl %r13d,%r8d xorl %r10d,%r15d paddd %xmm7,%xmm0 rorl $2,%r14d addl %r8d,%eax addl %r15d,%r8d paddd %xmm0,%xmm6 movl %eax,%r13d addl %r8d,%r14d movdqa %xmm6,0(%rsp) rorl $14,%r13d movdqa %xmm2,%xmm4 movl %r14d,%r8d movl %ebx,%r12d movdqa %xmm0,%xmm7 rorl $9,%r14d xorl %eax,%r13d xorl %ecx,%r12d rorl $5,%r13d xorl %r8d,%r14d .byte 102,15,58,15,225,4 andl %eax,%r12d xorl %eax,%r13d .byte 102,15,58,15,251,4 addl 16(%rsp),%edx movl %r8d,%r15d xorl %ecx,%r12d rorl $11,%r14d movdqa %xmm4,%xmm5 xorl %r9d,%r15d addl %r12d,%edx movdqa %xmm4,%xmm6 rorl $6,%r13d andl %r15d,%edi psrld $3,%xmm4 xorl %r8d,%r14d addl %r13d,%edx xorl %r9d,%edi paddd %xmm7,%xmm1 rorl $2,%r14d addl %edx,%r11d psrld $7,%xmm6 addl %edi,%edx movl %r11d,%r13d pshufd 
$250,%xmm0,%xmm7 addl %edx,%r14d rorl $14,%r13d pslld $14,%xmm5 movl %r14d,%edx movl %eax,%r12d pxor %xmm6,%xmm4 rorl $9,%r14d xorl %r11d,%r13d xorl %ebx,%r12d rorl $5,%r13d psrld $11,%xmm6 xorl %edx,%r14d pxor %xmm5,%xmm4 andl %r11d,%r12d xorl %r11d,%r13d pslld $11,%xmm5 addl 20(%rsp),%ecx movl %edx,%edi pxor %xmm6,%xmm4 xorl %ebx,%r12d rorl $11,%r14d movdqa %xmm7,%xmm6 xorl %r8d,%edi addl %r12d,%ecx pxor %xmm5,%xmm4 rorl $6,%r13d andl %edi,%r15d xorl %edx,%r14d psrld $10,%xmm7 addl %r13d,%ecx xorl %r8d,%r15d paddd %xmm4,%xmm1 rorl $2,%r14d addl %ecx,%r10d psrlq $17,%xmm6 addl %r15d,%ecx movl %r10d,%r13d addl %ecx,%r14d pxor %xmm6,%xmm7 rorl $14,%r13d movl %r14d,%ecx movl %r11d,%r12d rorl $9,%r14d psrlq $2,%xmm6 xorl %r10d,%r13d xorl %eax,%r12d pxor %xmm6,%xmm7 rorl $5,%r13d xorl %ecx,%r14d andl %r10d,%r12d pshufd $128,%xmm7,%xmm7 xorl %r10d,%r13d addl 24(%rsp),%ebx movl %ecx,%r15d psrldq $8,%xmm7 xorl %eax,%r12d rorl $11,%r14d xorl %edx,%r15d addl %r12d,%ebx rorl $6,%r13d paddd %xmm7,%xmm1 andl %r15d,%edi xorl %ecx,%r14d addl %r13d,%ebx pshufd $80,%xmm1,%xmm7 xorl %edx,%edi rorl $2,%r14d addl %ebx,%r9d movdqa %xmm7,%xmm6 addl %edi,%ebx movl %r9d,%r13d psrld $10,%xmm7 addl %ebx,%r14d rorl $14,%r13d psrlq $17,%xmm6 movl %r14d,%ebx movl %r10d,%r12d pxor %xmm6,%xmm7 rorl $9,%r14d xorl %r9d,%r13d xorl %r11d,%r12d rorl $5,%r13d xorl %ebx,%r14d psrlq $2,%xmm6 andl %r9d,%r12d xorl %r9d,%r13d addl 28(%rsp),%eax pxor %xmm6,%xmm7 movl %ebx,%edi xorl %r11d,%r12d rorl $11,%r14d pshufd $8,%xmm7,%xmm7 xorl %ecx,%edi addl %r12d,%eax movdqa 16(%rsi),%xmm6 rorl $6,%r13d andl %edi,%r15d pslldq $8,%xmm7 xorl %ebx,%r14d addl %r13d,%eax xorl %ecx,%r15d paddd %xmm7,%xmm1 rorl $2,%r14d addl %eax,%r8d addl %r15d,%eax paddd %xmm1,%xmm6 movl %r8d,%r13d addl %eax,%r14d movdqa %xmm6,16(%rsp) rorl $14,%r13d movdqa %xmm3,%xmm4 movl %r14d,%eax movl %r9d,%r12d movdqa %xmm1,%xmm7 rorl $9,%r14d xorl %r8d,%r13d xorl %r10d,%r12d rorl $5,%r13d xorl %eax,%r14d .byte 102,15,58,15,226,4 andl %r8d,%r12d xorl %r8d,%r13d .byte 102,15,58,15,248,4 addl 32(%rsp),%r11d movl %eax,%r15d xorl %r10d,%r12d rorl $11,%r14d movdqa %xmm4,%xmm5 xorl %ebx,%r15d addl %r12d,%r11d movdqa %xmm4,%xmm6 rorl $6,%r13d andl %r15d,%edi psrld $3,%xmm4 xorl %eax,%r14d addl %r13d,%r11d xorl %ebx,%edi paddd %xmm7,%xmm2 rorl $2,%r14d addl %r11d,%edx psrld $7,%xmm6 addl %edi,%r11d movl %edx,%r13d pshufd $250,%xmm1,%xmm7 addl %r11d,%r14d rorl $14,%r13d pslld $14,%xmm5 movl %r14d,%r11d movl %r8d,%r12d pxor %xmm6,%xmm4 rorl $9,%r14d xorl %edx,%r13d xorl %r9d,%r12d rorl $5,%r13d psrld $11,%xmm6 xorl %r11d,%r14d pxor %xmm5,%xmm4 andl %edx,%r12d xorl %edx,%r13d pslld $11,%xmm5 addl 36(%rsp),%r10d movl %r11d,%edi pxor %xmm6,%xmm4 xorl %r9d,%r12d rorl $11,%r14d movdqa %xmm7,%xmm6 xorl %eax,%edi addl %r12d,%r10d pxor %xmm5,%xmm4 rorl $6,%r13d andl %edi,%r15d xorl %r11d,%r14d psrld $10,%xmm7 addl %r13d,%r10d xorl %eax,%r15d paddd %xmm4,%xmm2 rorl $2,%r14d addl %r10d,%ecx psrlq $17,%xmm6 addl %r15d,%r10d movl %ecx,%r13d addl %r10d,%r14d pxor %xmm6,%xmm7 rorl $14,%r13d movl %r14d,%r10d movl %edx,%r12d rorl $9,%r14d psrlq $2,%xmm6 xorl %ecx,%r13d xorl %r8d,%r12d pxor %xmm6,%xmm7 rorl $5,%r13d xorl %r10d,%r14d andl %ecx,%r12d pshufd $128,%xmm7,%xmm7 xorl %ecx,%r13d addl 40(%rsp),%r9d movl %r10d,%r15d psrldq $8,%xmm7 xorl %r8d,%r12d rorl $11,%r14d xorl %r11d,%r15d addl %r12d,%r9d rorl $6,%r13d paddd %xmm7,%xmm2 andl %r15d,%edi xorl %r10d,%r14d addl %r13d,%r9d pshufd $80,%xmm2,%xmm7 xorl %r11d,%edi rorl $2,%r14d addl %r9d,%ebx movdqa %xmm7,%xmm6 addl %edi,%r9d movl %ebx,%r13d 
psrld $10,%xmm7 addl %r9d,%r14d rorl $14,%r13d psrlq $17,%xmm6 movl %r14d,%r9d movl %ecx,%r12d pxor %xmm6,%xmm7 rorl $9,%r14d xorl %ebx,%r13d xorl %edx,%r12d rorl $5,%r13d xorl %r9d,%r14d psrlq $2,%xmm6 andl %ebx,%r12d xorl %ebx,%r13d addl 44(%rsp),%r8d pxor %xmm6,%xmm7 movl %r9d,%edi xorl %edx,%r12d rorl $11,%r14d pshufd $8,%xmm7,%xmm7 xorl %r10d,%edi addl %r12d,%r8d movdqa 32(%rsi),%xmm6 rorl $6,%r13d andl %edi,%r15d pslldq $8,%xmm7 xorl %r9d,%r14d addl %r13d,%r8d xorl %r10d,%r15d paddd %xmm7,%xmm2 rorl $2,%r14d addl %r8d,%eax addl %r15d,%r8d paddd %xmm2,%xmm6 movl %eax,%r13d addl %r8d,%r14d movdqa %xmm6,32(%rsp) rorl $14,%r13d movdqa %xmm0,%xmm4 movl %r14d,%r8d movl %ebx,%r12d movdqa %xmm2,%xmm7 rorl $9,%r14d xorl %eax,%r13d xorl %ecx,%r12d rorl $5,%r13d xorl %r8d,%r14d .byte 102,15,58,15,227,4 andl %eax,%r12d xorl %eax,%r13d .byte 102,15,58,15,249,4 addl 48(%rsp),%edx movl %r8d,%r15d xorl %ecx,%r12d rorl $11,%r14d movdqa %xmm4,%xmm5 xorl %r9d,%r15d addl %r12d,%edx movdqa %xmm4,%xmm6 rorl $6,%r13d andl %r15d,%edi psrld $3,%xmm4 xorl %r8d,%r14d addl %r13d,%edx xorl %r9d,%edi paddd %xmm7,%xmm3 rorl $2,%r14d addl %edx,%r11d psrld $7,%xmm6 addl %edi,%edx movl %r11d,%r13d pshufd $250,%xmm2,%xmm7 addl %edx,%r14d rorl $14,%r13d pslld $14,%xmm5 movl %r14d,%edx movl %eax,%r12d pxor %xmm6,%xmm4 rorl $9,%r14d xorl %r11d,%r13d xorl %ebx,%r12d rorl $5,%r13d psrld $11,%xmm6 xorl %edx,%r14d pxor %xmm5,%xmm4 andl %r11d,%r12d xorl %r11d,%r13d pslld $11,%xmm5 addl 52(%rsp),%ecx movl %edx,%edi pxor %xmm6,%xmm4 xorl %ebx,%r12d rorl $11,%r14d movdqa %xmm7,%xmm6 xorl %r8d,%edi addl %r12d,%ecx pxor %xmm5,%xmm4 rorl $6,%r13d andl %edi,%r15d xorl %edx,%r14d psrld $10,%xmm7 addl %r13d,%ecx xorl %r8d,%r15d paddd %xmm4,%xmm3 rorl $2,%r14d addl %ecx,%r10d psrlq $17,%xmm6 addl %r15d,%ecx movl %r10d,%r13d addl %ecx,%r14d pxor %xmm6,%xmm7 rorl $14,%r13d movl %r14d,%ecx movl %r11d,%r12d rorl $9,%r14d psrlq $2,%xmm6 xorl %r10d,%r13d xorl %eax,%r12d pxor %xmm6,%xmm7 rorl $5,%r13d xorl %ecx,%r14d andl %r10d,%r12d pshufd $128,%xmm7,%xmm7 xorl %r10d,%r13d addl 56(%rsp),%ebx movl %ecx,%r15d psrldq $8,%xmm7 xorl %eax,%r12d rorl $11,%r14d xorl %edx,%r15d addl %r12d,%ebx rorl $6,%r13d paddd %xmm7,%xmm3 andl %r15d,%edi xorl %ecx,%r14d addl %r13d,%ebx pshufd $80,%xmm3,%xmm7 xorl %edx,%edi rorl $2,%r14d addl %ebx,%r9d movdqa %xmm7,%xmm6 addl %edi,%ebx movl %r9d,%r13d psrld $10,%xmm7 addl %ebx,%r14d rorl $14,%r13d psrlq $17,%xmm6 movl %r14d,%ebx movl %r10d,%r12d pxor %xmm6,%xmm7 rorl $9,%r14d xorl %r9d,%r13d xorl %r11d,%r12d rorl $5,%r13d xorl %ebx,%r14d psrlq $2,%xmm6 andl %r9d,%r12d xorl %r9d,%r13d addl 60(%rsp),%eax pxor %xmm6,%xmm7 movl %ebx,%edi xorl %r11d,%r12d rorl $11,%r14d pshufd $8,%xmm7,%xmm7 xorl %ecx,%edi addl %r12d,%eax movdqa 48(%rsi),%xmm6 rorl $6,%r13d andl %edi,%r15d pslldq $8,%xmm7 xorl %ebx,%r14d addl %r13d,%eax xorl %ecx,%r15d paddd %xmm7,%xmm3 rorl $2,%r14d addl %eax,%r8d addl %r15d,%eax paddd %xmm3,%xmm6 movl %r8d,%r13d addl %eax,%r14d movdqa %xmm6,48(%rsp) cmpb $0,67(%rsi) jne L$ssse3_00_47 rorl $14,%r13d movl %r14d,%eax movl %r9d,%r12d rorl $9,%r14d xorl %r8d,%r13d xorl %r10d,%r12d rorl $5,%r13d xorl %eax,%r14d andl %r8d,%r12d xorl %r8d,%r13d addl 0(%rsp),%r11d movl %eax,%r15d xorl %r10d,%r12d rorl $11,%r14d xorl %ebx,%r15d addl %r12d,%r11d rorl $6,%r13d andl %r15d,%edi xorl %eax,%r14d addl %r13d,%r11d xorl %ebx,%edi rorl $2,%r14d addl %r11d,%edx addl %edi,%r11d movl %edx,%r13d addl %r11d,%r14d rorl $14,%r13d movl %r14d,%r11d movl %r8d,%r12d rorl $9,%r14d xorl %edx,%r13d xorl %r9d,%r12d rorl $5,%r13d xorl 
%r11d,%r14d andl %edx,%r12d xorl %edx,%r13d addl 4(%rsp),%r10d movl %r11d,%edi xorl %r9d,%r12d rorl $11,%r14d xorl %eax,%edi addl %r12d,%r10d rorl $6,%r13d andl %edi,%r15d xorl %r11d,%r14d addl %r13d,%r10d xorl %eax,%r15d rorl $2,%r14d addl %r10d,%ecx addl %r15d,%r10d movl %ecx,%r13d addl %r10d,%r14d rorl $14,%r13d movl %r14d,%r10d movl %edx,%r12d rorl $9,%r14d xorl %ecx,%r13d xorl %r8d,%r12d rorl $5,%r13d xorl %r10d,%r14d andl %ecx,%r12d xorl %ecx,%r13d addl 8(%rsp),%r9d movl %r10d,%r15d xorl %r8d,%r12d rorl $11,%r14d xorl %r11d,%r15d addl %r12d,%r9d rorl $6,%r13d andl %r15d,%edi xorl %r10d,%r14d addl %r13d,%r9d xorl %r11d,%edi rorl $2,%r14d addl %r9d,%ebx addl %edi,%r9d movl %ebx,%r13d addl %r9d,%r14d rorl $14,%r13d movl %r14d,%r9d movl %ecx,%r12d rorl $9,%r14d xorl %ebx,%r13d xorl %edx,%r12d rorl $5,%r13d xorl %r9d,%r14d andl %ebx,%r12d xorl %ebx,%r13d addl 12(%rsp),%r8d movl %r9d,%edi xorl %edx,%r12d rorl $11,%r14d xorl %r10d,%edi addl %r12d,%r8d rorl $6,%r13d andl %edi,%r15d xorl %r9d,%r14d addl %r13d,%r8d xorl %r10d,%r15d rorl $2,%r14d addl %r8d,%eax addl %r15d,%r8d movl %eax,%r13d addl %r8d,%r14d rorl $14,%r13d movl %r14d,%r8d movl %ebx,%r12d rorl $9,%r14d xorl %eax,%r13d xorl %ecx,%r12d rorl $5,%r13d xorl %r8d,%r14d andl %eax,%r12d xorl %eax,%r13d addl 16(%rsp),%edx movl %r8d,%r15d xorl %ecx,%r12d rorl $11,%r14d xorl %r9d,%r15d addl %r12d,%edx rorl $6,%r13d andl %r15d,%edi xorl %r8d,%r14d addl %r13d,%edx xorl %r9d,%edi rorl $2,%r14d addl %edx,%r11d addl %edi,%edx movl %r11d,%r13d addl %edx,%r14d rorl $14,%r13d movl %r14d,%edx movl %eax,%r12d rorl $9,%r14d xorl %r11d,%r13d xorl %ebx,%r12d rorl $5,%r13d xorl %edx,%r14d andl %r11d,%r12d xorl %r11d,%r13d addl 20(%rsp),%ecx movl %edx,%edi xorl %ebx,%r12d rorl $11,%r14d xorl %r8d,%edi addl %r12d,%ecx rorl $6,%r13d andl %edi,%r15d xorl %edx,%r14d addl %r13d,%ecx xorl %r8d,%r15d rorl $2,%r14d addl %ecx,%r10d addl %r15d,%ecx movl %r10d,%r13d addl %ecx,%r14d rorl $14,%r13d movl %r14d,%ecx movl %r11d,%r12d rorl $9,%r14d xorl %r10d,%r13d xorl %eax,%r12d rorl $5,%r13d xorl %ecx,%r14d andl %r10d,%r12d xorl %r10d,%r13d addl 24(%rsp),%ebx movl %ecx,%r15d xorl %eax,%r12d rorl $11,%r14d xorl %edx,%r15d addl %r12d,%ebx rorl $6,%r13d andl %r15d,%edi xorl %ecx,%r14d addl %r13d,%ebx xorl %edx,%edi rorl $2,%r14d addl %ebx,%r9d addl %edi,%ebx movl %r9d,%r13d addl %ebx,%r14d rorl $14,%r13d movl %r14d,%ebx movl %r10d,%r12d rorl $9,%r14d xorl %r9d,%r13d xorl %r11d,%r12d rorl $5,%r13d xorl %ebx,%r14d andl %r9d,%r12d xorl %r9d,%r13d addl 28(%rsp),%eax movl %ebx,%edi xorl %r11d,%r12d rorl $11,%r14d xorl %ecx,%edi addl %r12d,%eax rorl $6,%r13d andl %edi,%r15d xorl %ebx,%r14d addl %r13d,%eax xorl %ecx,%r15d rorl $2,%r14d addl %eax,%r8d addl %r15d,%eax movl %r8d,%r13d addl %eax,%r14d rorl $14,%r13d movl %r14d,%eax movl %r9d,%r12d rorl $9,%r14d xorl %r8d,%r13d xorl %r10d,%r12d rorl $5,%r13d xorl %eax,%r14d andl %r8d,%r12d xorl %r8d,%r13d addl 32(%rsp),%r11d movl %eax,%r15d xorl %r10d,%r12d rorl $11,%r14d xorl %ebx,%r15d addl %r12d,%r11d rorl $6,%r13d andl %r15d,%edi xorl %eax,%r14d addl %r13d,%r11d xorl %ebx,%edi rorl $2,%r14d addl %r11d,%edx addl %edi,%r11d movl %edx,%r13d addl %r11d,%r14d rorl $14,%r13d movl %r14d,%r11d movl %r8d,%r12d rorl $9,%r14d xorl %edx,%r13d xorl %r9d,%r12d rorl $5,%r13d xorl %r11d,%r14d andl %edx,%r12d xorl %edx,%r13d addl 36(%rsp),%r10d movl %r11d,%edi xorl %r9d,%r12d rorl $11,%r14d xorl %eax,%edi addl %r12d,%r10d rorl $6,%r13d andl %edi,%r15d xorl %r11d,%r14d addl %r13d,%r10d xorl %eax,%r15d rorl $2,%r14d addl %r10d,%ecx addl 
%r15d,%r10d movl %ecx,%r13d addl %r10d,%r14d rorl $14,%r13d movl %r14d,%r10d movl %edx,%r12d rorl $9,%r14d xorl %ecx,%r13d xorl %r8d,%r12d rorl $5,%r13d xorl %r10d,%r14d andl %ecx,%r12d xorl %ecx,%r13d addl 40(%rsp),%r9d movl %r10d,%r15d xorl %r8d,%r12d rorl $11,%r14d xorl %r11d,%r15d addl %r12d,%r9d rorl $6,%r13d andl %r15d,%edi xorl %r10d,%r14d addl %r13d,%r9d xorl %r11d,%edi rorl $2,%r14d addl %r9d,%ebx addl %edi,%r9d movl %ebx,%r13d addl %r9d,%r14d rorl $14,%r13d movl %r14d,%r9d movl %ecx,%r12d rorl $9,%r14d xorl %ebx,%r13d xorl %edx,%r12d rorl $5,%r13d xorl %r9d,%r14d andl %ebx,%r12d xorl %ebx,%r13d addl 44(%rsp),%r8d movl %r9d,%edi xorl %edx,%r12d rorl $11,%r14d xorl %r10d,%edi addl %r12d,%r8d rorl $6,%r13d andl %edi,%r15d xorl %r9d,%r14d addl %r13d,%r8d xorl %r10d,%r15d rorl $2,%r14d addl %r8d,%eax addl %r15d,%r8d movl %eax,%r13d addl %r8d,%r14d rorl $14,%r13d movl %r14d,%r8d movl %ebx,%r12d rorl $9,%r14d xorl %eax,%r13d xorl %ecx,%r12d rorl $5,%r13d xorl %r8d,%r14d andl %eax,%r12d xorl %eax,%r13d addl 48(%rsp),%edx movl %r8d,%r15d xorl %ecx,%r12d rorl $11,%r14d xorl %r9d,%r15d addl %r12d,%edx rorl $6,%r13d andl %r15d,%edi xorl %r8d,%r14d addl %r13d,%edx xorl %r9d,%edi rorl $2,%r14d addl %edx,%r11d addl %edi,%edx movl %r11d,%r13d addl %edx,%r14d rorl $14,%r13d movl %r14d,%edx movl %eax,%r12d rorl $9,%r14d xorl %r11d,%r13d xorl %ebx,%r12d rorl $5,%r13d xorl %edx,%r14d andl %r11d,%r12d xorl %r11d,%r13d addl 52(%rsp),%ecx movl %edx,%edi xorl %ebx,%r12d rorl $11,%r14d xorl %r8d,%edi addl %r12d,%ecx rorl $6,%r13d andl %edi,%r15d xorl %edx,%r14d addl %r13d,%ecx xorl %r8d,%r15d rorl $2,%r14d addl %ecx,%r10d addl %r15d,%ecx movl %r10d,%r13d addl %ecx,%r14d rorl $14,%r13d movl %r14d,%ecx movl %r11d,%r12d rorl $9,%r14d xorl %r10d,%r13d xorl %eax,%r12d rorl $5,%r13d xorl %ecx,%r14d andl %r10d,%r12d xorl %r10d,%r13d addl 56(%rsp),%ebx movl %ecx,%r15d xorl %eax,%r12d rorl $11,%r14d xorl %edx,%r15d addl %r12d,%ebx rorl $6,%r13d andl %r15d,%edi xorl %ecx,%r14d addl %r13d,%ebx xorl %edx,%edi rorl $2,%r14d addl %ebx,%r9d addl %edi,%ebx movl %r9d,%r13d addl %ebx,%r14d rorl $14,%r13d movl %r14d,%ebx movl %r10d,%r12d rorl $9,%r14d xorl %r9d,%r13d xorl %r11d,%r12d rorl $5,%r13d xorl %ebx,%r14d andl %r9d,%r12d xorl %r9d,%r13d addl 60(%rsp),%eax movl %ebx,%edi xorl %r11d,%r12d rorl $11,%r14d xorl %ecx,%edi addl %r12d,%eax rorl $6,%r13d andl %edi,%r15d xorl %ebx,%r14d addl %r13d,%eax xorl %ecx,%r15d rorl $2,%r14d addl %eax,%r8d addl %r15d,%eax movl %r8d,%r13d addl %eax,%r14d movq -64(%rbp),%rdi movl %r14d,%eax movq -56(%rbp),%rsi #ifdef __SGX_LVI_HARDENING__ lfence #endif addl 0(%rdi),%eax addl 4(%rdi),%ebx addl 8(%rdi),%ecx addl 12(%rdi),%edx addl 16(%rdi),%r8d addl 20(%rdi),%r9d addl 24(%rdi),%r10d addl 28(%rdi),%r11d leaq 64(%rsi),%rsi cmpq -48(%rbp),%rsi movl %eax,0(%rdi) movl %ebx,4(%rdi) movl %ecx,8(%rdi) movl %edx,12(%rdi) movl %r8d,16(%rdi) movl %r9d,20(%rdi) movl %r10d,24(%rdi) movl %r11d,28(%rdi) jb L$loop_ssse3 xorps %xmm0,%xmm0 movaps %xmm0,0(%rsp) movaps %xmm0,16(%rsp) movaps %xmm0,32(%rsp) movaps %xmm0,48(%rsp) movq -40(%rbp),%r15 movq -32(%rbp),%r14 movq -24(%rbp),%r13 movq -16(%rbp),%r12 movq -8(%rbp),%rbx movq %rbp,%rsp .cfi_def_cfa_register %rsp popq %rbp .cfi_adjust_cfa_offset -8 .cfi_restore %rbp .cfi_restore %r12 .cfi_restore %r13 .cfi_restore %r14 .cfi_restore %r15 .cfi_restore %rbx #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .globl _blst_sha256_emit .private_extern _blst_sha256_emit .p2align 4 _blst_sha256_emit: 
.cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa #ifdef __SGX_LVI_HARDENING__ lfence #endif movq 0(%rsi),%r8 movq 8(%rsi),%r9 movq 16(%rsi),%r10 bswapq %r8 movq 24(%rsi),%r11 bswapq %r9 movl %r8d,4(%rdi) bswapq %r10 movl %r9d,12(%rdi) bswapq %r11 movl %r10d,20(%rdi) shrq $32,%r8 movl %r11d,28(%rdi) shrq $32,%r9 movl %r8d,0(%rdi) shrq $32,%r10 movl %r9d,8(%rdi) shrq $32,%r11 movl %r10d,16(%rdi) movl %r11d,24(%rdi) #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .globl _blst_sha256_bcopy .private_extern _blst_sha256_bcopy .p2align 4 _blst_sha256_bcopy: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa #ifdef __SGX_LVI_HARDENING__ lfence #endif subq %rsi,%rdi L$oop_bcopy: movzbl (%rsi),%eax leaq 1(%rsi),%rsi movb %al,-1(%rdi,%rsi,1) decq %rdx jnz L$oop_bcopy #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc .globl _blst_sha256_hcopy .private_extern _blst_sha256_hcopy .p2align 4 _blst_sha256_hcopy: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa #ifdef __SGX_LVI_HARDENING__ lfence #endif movq 0(%rsi),%r8 movq 8(%rsi),%r9 movq 16(%rsi),%r10 movq 24(%rsi),%r11 movq %r8,0(%rdi) movq %r9,8(%rdi) movq %r10,16(%rdi) movq %r11,24(%rdi) #ifdef __SGX_LVI_HARDENING__ popq %rdx lfence jmpq *%rdx ud2 #else .byte 0xf3,0xc3 #endif .cfi_endproc

================================================
FILE: build/refresh.sh
================================================
#!/bin/sh
# Regenerate the checked-in assembly and the Windows/Rust binding files
# from the perl sources in ../src/asm.
HERE=`dirname $0`
cd "${HERE}"
PERL=${PERL:-perl}
# x86_64: emit MASM (win64), ELF, MinGW/COFF and Mach-O flavours;
# the "portable" variants have no MASM counterpart.
for pl in ../src/asm/*-x86_64.pl; do
    s=`basename $pl .pl`.asm
    expr $s : '.*portable' > /dev/null || (set -x; ${PERL} $pl masm > win64/$s)
    s=`basename $pl .pl`.s
    (set -x; ${PERL} $pl elf > elf/$s)
    (set -x; ${PERL} $pl mingw64 > coff/$s)
    (set -x; ${PERL} $pl macosx > mach-o/$s)
done
# ARMv8: same idea, plus the CHERI flavour.
for pl in ../src/asm/*-armv8.pl; do
    s=`basename $pl .pl`.asm
    (set -x; ${PERL} $pl win64 > win64/$s)
    s=`basename $pl .pl`.S
    (set -x; ${PERL} $pl linux64 > elf/$s)
    (set -x; ${PERL} $pl coff64 > coff/$s)
    (set -x; ${PERL} $pl ios64 > mach-o/$s)
    (set -x; ${PERL} $pl cheri64 > cheri/$s)
done
# Build the Windows module-definition file by scraping exported names
# out of the preprocessed public header.
( cd ../bindings;
  echo "LIBRARY blst"
  echo
  echo "EXPORTS"
  cc -E blst.h | \
  ${PERL} -ne '{ (/(blst_[\w]+)\s*\(/ || /(BLS12_[\w]+);/) && print "\t$1\n" }'
  echo
) > win64/blst.def
# Refresh the Rust FFI bindings.
if which bindgen > /dev/null 2>&1; then
    ( cd ../bindings; set -x;
      bindgen --opaque-type blst_pairing \
              --opaque-type blst_uniq \
              --with-derive-default \
              --with-derive-eq \
              --rustified-enum BLST.\* \
              blst.h -- -D__BLST_RUST_BINDGEN__ \
      | ${PERL} ../build/bindings_trim.pl > rust/src/bindings.rs
    )
else
    echo "Install Rust bindgen with 'cargo install bindgen-cli'" 1>&2
    exit 1
fi

================================================
FILE: build/srcroot.go
================================================
package blst

import (
	"path/filepath"
	"runtime"
)

// SrcRoot is the absolute path of the repository root, derived from
// this file's recorded location (build/srcroot.go).
var SrcRoot string

func init() {
	if _, self, _, ok := runtime.Caller(0); ok {
		SrcRoot = filepath.Dir(filepath.Dir(self))
	}
}

================================================
FILE: build/win64/add_mod_256-armv8.asm
================================================
GBLA __SIZEOF_POINTER__ __SIZEOF_POINTER__ SETA 64/8 AREA |.text|,CODE,ALIGN=8,ARM64 EXPORT |add_mod_256|[FUNC] ALIGN 32 |add_mod_256| PROC hint #34 ldp x8,x9,[x1] ldp x12,x13,[x2] ldp x10,x11,[x1,#16] adds x8,x8,x12 ldp x14,x15,[x2,#16] adcs x9,x9,x13 ldp x4,x5,[x3] adcs x10,x10,x14 ldp x6,x7,[x3,#16] adcs x11,x11,x15 adc x3,xzr,xzr subs x16,x8,x4 sbcs x17,x9,x5 sbcs x1,x10,x6 sbcs x2,x11,x7 sbcs xzr,x3,xzr csello x8,x8,x16 csello x9,x9,x17 csello x10,x10,x1 stp x8,x9,[x0] csello
x11,x11,x2 stp x10,x11,[x0,#16] ret ENDP EXPORT |mul_by_3_mod_256|[FUNC] ALIGN 32 |mul_by_3_mod_256| PROC hint #34 ldp x12,x13,[x1] ldp x14,x15,[x1,#16] adds x8,x12,x12 ldp x4,x5,[x2] adcs x9,x13,x13 ldp x6,x7,[x2,#16] adcs x10,x14,x14 adcs x11,x15,x15 adc x3,xzr,xzr subs x16,x8,x4 sbcs x17,x9,x5 sbcs x1,x10,x6 sbcs x2,x11,x7 sbcs xzr,x3,xzr csello x8,x8,x16 csello x9,x9,x17 csello x10,x10,x1 csello x11,x11,x2 adds x8,x8,x12 adcs x9,x9,x13 adcs x10,x10,x14 adcs x11,x11,x15 adc x3,xzr,xzr subs x16,x8,x4 sbcs x17,x9,x5 sbcs x1,x10,x6 sbcs x2,x11,x7 sbcs xzr,x3,xzr csello x8,x8,x16 csello x9,x9,x17 csello x10,x10,x1 stp x8,x9,[x0] csello x11,x11,x2 stp x10,x11,[x0,#16] ret ENDP EXPORT |lshift_mod_256|[FUNC] ALIGN 32 |lshift_mod_256| PROC hint #34 ldp x8,x9,[x1] ldp x10,x11,[x1,#16] ldp x4,x5,[x3] ldp x6,x7,[x3,#16] |$Loop_lshift_mod_256| adds x8,x8,x8 sub x2,x2,#1 adcs x9,x9,x9 adcs x10,x10,x10 adcs x11,x11,x11 adc x3,xzr,xzr subs x12,x8,x4 sbcs x13,x9,x5 sbcs x14,x10,x6 sbcs x15,x11,x7 sbcs xzr,x3,xzr csello x8,x8,x12 csello x9,x9,x13 csello x10,x10,x14 csello x11,x11,x15 cbnz x2,|$Loop_lshift_mod_256| stp x8,x9,[x0] stp x10,x11,[x0,#16] ret ENDP EXPORT |rshift_mod_256|[FUNC] ALIGN 32 |rshift_mod_256| PROC hint #34 ldp x8,x9,[x1] ldp x10,x11,[x1,#16] ldp x4,x5,[x3] ldp x6,x7,[x3,#16] |$Loop_rshift| adds x12,x8,x4 sub x2,x2,#1 adcs x13,x9,x5 adcs x14,x10,x6 adcs x15,x11,x7 adc x3,xzr,xzr tst x8,#1 cselne x12,x12,x8 cselne x13,x13,x9 cselne x14,x14,x10 cselne x15,x15,x11 cselne x3,x3,xzr extr x8,x13,x12,#1 extr x9,x14,x13,#1 extr x10,x15,x14,#1 extr x11,x3,x15,#1 cbnz x2,|$Loop_rshift| stp x8,x9,[x0] stp x10,x11,[x0,#16] ret ENDP EXPORT |cneg_mod_256|[FUNC] ALIGN 32 |cneg_mod_256| PROC ldp x8,x9,[x1] ldp x4,x5,[x3] ldp x10,x11,[x1,#16] subs x12,x4,x8 ldp x6,x7,[x3,#16] orr x4,x8,x9 sbcs x13,x5,x9 orr x5,x10,x11 sbcs x14,x6,x10 orr x3,x4,x5 sbc x15,x7,x11 cmp x3,#0 csetmne x3 ands x2,x2,x3 cseleq x8,x8,x12 cseleq x9,x9,x13 cseleq x10,x10,x14 stp x8,x9,[x0] cseleq x11,x11,x15 stp x10,x11,[x0,#16] ret ENDP EXPORT |sub_mod_256|[FUNC] ALIGN 32 |sub_mod_256| PROC ldp x8,x9,[x1] ldp x12,x13,[x2] ldp x10,x11,[x1,#16] subs x8,x8,x12 ldp x14,x15,[x2,#16] sbcs x9,x9,x13 ldp x4,x5,[x3] sbcs x10,x10,x14 ldp x6,x7,[x3,#16] sbcs x11,x11,x15 sbc x3,xzr,xzr and x4,x4,x3 and x5,x5,x3 adds x8,x8,x4 and x6,x6,x3 adcs x9,x9,x5 and x7,x7,x3 adcs x10,x10,x6 stp x8,x9,[x0] adc x11,x11,x7 stp x10,x11,[x0,#16] ret ENDP EXPORT |check_mod_256|[FUNC] ALIGN 32 |check_mod_256| PROC ldp x8,x9,[x0] ldp x10,x11,[x0,#16] ldp x4,x5,[x1] ldp x6,x7,[x1,#16] if :def: __AARCH64EB__ rev x8,x8 rev x9,x9 rev x10,x10 rev x11,x11 endif subs xzr,x8,x4 sbcs xzr,x9,x5 orr x8,x8,x9 sbcs xzr,x10,x6 orr x8,x8,x10 sbcs xzr,x11,x7 orr x8,x8,x11 sbc x1,xzr,xzr cmp x8,#0 mov x0,#1 cselne x0,x0,xzr and x0,x0,x1 ret ENDP EXPORT |add_n_check_mod_256|[FUNC] ALIGN 32 |add_n_check_mod_256| PROC ldp x8,x9,[x1] ldp x12,x13,[x2] ldp x10,x11,[x1,#16] ldp x14,x15,[x2,#16] if :def: __AARCH64EB__ rev x8,x8 rev x12,x12 rev x9,x9 rev x13,x13 rev x10,x10 rev x14,x14 rev x11,x11 rev x15,x15 endif adds x8,x8,x12 ldp x4,x5,[x3] adcs x9,x9,x13 ldp x6,x7,[x3,#16] adcs x10,x10,x14 adcs x11,x11,x15 adc x3,xzr,xzr subs x16,x8,x4 sbcs x17,x9,x5 sbcs x1,x10,x6 sbcs x2,x11,x7 sbcs xzr,x3,xzr csello x8,x8,x16 csello x9,x9,x17 csello x10,x10,x1 csello x11,x11,x2 orr x16, x8, x9 orr x17, x10, x11 orr x16, x16, x17 if :def: __AARCH64EB__ rev x8,x8 rev x9,x9 rev x10,x10 rev x11,x11 endif stp x8,x9,[x0] stp x10,x11,[x0,#16] mov x17, #1 cmp x16, #0 cselne x0,x17,xzr ret ENDP 
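
The mod-256 routines above all share one constant-time shape: an adds/adcs carry chain computes the raw 256-bit result, a subs/sbcs chain trial-subtracts the modulus (with the final sbcs xzr,x3,xzr folding the addition's carry into the borrow), and csello/cselne pick the reduced or unreduced value via condition flags instead of branching on secret data. The C sketch below models the add_mod_256 case for 4x64-bit limbs, assuming both inputs are already reduced below p; the function name and limb layout are illustrative stand-ins, not blst's API (the asm takes ret/a/b/p in x0..x3).

#include <stdint.h>

/* Illustrative model of the pattern above -- not blst's API.
 * Computes ret = a + b mod p on 4x64-bit limbs, inputs assumed < p. */
static void add_mod_256_model(uint64_t ret[4], const uint64_t a[4],
                              const uint64_t b[4], const uint64_t p[4])
{
    uint64_t sum[4], red[4], carry = 0, borrow = 0;

    for (int i = 0; i < 4; i++) {      /* adds/adcs: 256-bit a + b */
        uint64_t t = a[i] + carry;
        carry = t < carry;
        sum[i] = t + b[i];
        carry += sum[i] < t;
    }

    for (int i = 0; i < 4; i++) {      /* subs/sbcs: trial sum - p */
        uint64_t t = sum[i] - borrow;
        borrow = sum[i] < borrow;
        red[i] = t - p[i];
        borrow += t < p[i];
    }

    /* sbcs xzr,x3,xzr: a final borrow out of (carry - borrow) occurs
     * exactly when a + b < p, i.e. when no reduction is needed. */
    uint64_t keep = (uint64_t)0 - (carry < borrow);

    for (int i = 0; i < 4; i++)        /* csello: branch-free select */
        ret[i] = (sum[i] & keep) | (red[i] & ~keep);
}

The point of the final masked select is that the identical instruction sequence executes whether or not a reduction was needed, so timing and branch history reveal nothing about the operands.
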
EXPORT |sub_n_check_mod_256|[FUNC] ALIGN 32 |sub_n_check_mod_256| PROC ldp x8,x9,[x1] ldp x12,x13,[x2] ldp x10,x11,[x1,#16] ldp x14,x15,[x2,#16] if :def: __AARCH64EB__ rev x8,x8 rev x12,x12 rev x9,x9 rev x13,x13 rev x10,x10 rev x14,x14 rev x11,x11 rev x15,x15 endif subs x8,x8,x12 sbcs x9,x9,x13 ldp x4,x5,[x3] sbcs x10,x10,x14 ldp x6,x7,[x3,#16] sbcs x11,x11,x15 sbc x3,xzr,xzr and x4,x4,x3 and x5,x5,x3 adds x8,x8,x4 and x6,x6,x3 adcs x9,x9,x5 and x7,x7,x3 adcs x10,x10,x6 adc x11,x11,x7 orr x16, x8, x9 orr x17, x10, x11 orr x16, x16, x17 if :def: __AARCH64EB__ rev x8,x8 rev x9,x9 rev x10,x10 rev x11,x11 endif stp x8,x9,[x0] stp x10,x11,[x0,#16] mov x17, #1 cmp x16, #0 cselne x0,x17,xzr ret ENDP END ================================================ FILE: build/win64/add_mod_256-x86_64.asm ================================================ OPTION DOTNAME .text$ SEGMENT ALIGN(256) 'CODE' PUBLIC add_mod_256 ALIGN 32 add_mod_256 PROC PUBLIC DB 243,15,30,250 mov QWORD PTR[8+rsp],rdi ;WIN64 prologue mov QWORD PTR[16+rsp],rsi mov r11,rsp $L$SEH_begin_add_mod_256:: push rbp mov rdi,rcx mov rsi,rdx mov rdx,r8 mov rcx,r9 push rbx sub rsp,8 $L$SEH_body_add_mod_256:: ifdef __SGX_LVI_HARDENING__ lfence endif mov r8,QWORD PTR[rsi] mov r9,QWORD PTR[8+rsi] mov r10,QWORD PTR[16+rsi] mov r11,QWORD PTR[24+rsi] $L$oaded_a_add_mod_256:: add r8,QWORD PTR[rdx] adc r9,QWORD PTR[8+rdx] mov rax,r8 adc r10,QWORD PTR[16+rdx] mov rsi,r9 adc r11,QWORD PTR[24+rdx] sbb rdx,rdx mov rbx,r10 sub r8,QWORD PTR[rcx] sbb r9,QWORD PTR[8+rcx] sbb r10,QWORD PTR[16+rcx] mov rbp,r11 sbb r11,QWORD PTR[24+rcx] sbb rdx,0 cmovc r8,rax cmovc r9,rsi mov QWORD PTR[rdi],r8 cmovc r10,rbx mov QWORD PTR[8+rdi],r9 cmovc r11,rbp mov QWORD PTR[16+rdi],r10 mov QWORD PTR[24+rdi],r11 mov rbx,QWORD PTR[8+rsp] mov rbp,QWORD PTR[16+rsp] lea rsp,QWORD PTR[24+rsp] $L$SEH_epilogue_add_mod_256:: mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue mov rsi,QWORD PTR[16+rsp] ifdef __SGX_LVI_HARDENING__ pop rdx lfence jmp rdx ud2 else DB 0F3h,0C3h endif $L$SEH_end_add_mod_256:: add_mod_256 ENDP PUBLIC mul_by_3_mod_256 ALIGN 32 mul_by_3_mod_256 PROC PUBLIC DB 243,15,30,250 mov QWORD PTR[8+rsp],rdi ;WIN64 prologue mov QWORD PTR[16+rsp],rsi mov r11,rsp $L$SEH_begin_mul_by_3_mod_256:: push rbp mov rdi,rcx mov rsi,rdx mov rdx,r8 push rbx push r12 $L$SEH_body_mul_by_3_mod_256:: mov rcx,rdx ifdef __SGX_LVI_HARDENING__ lfence endif mov r8,QWORD PTR[rsi] mov r9,QWORD PTR[8+rsi] mov r10,QWORD PTR[16+rsi] mov rdx,rsi mov r11,QWORD PTR[24+rsi] call __lshift_mod_256 mov r12,QWORD PTR[rsp] jmp $L$oaded_a_add_mod_256 mov rbx,QWORD PTR[8+rsp] mov rbp,QWORD PTR[16+rsp] lea rsp,QWORD PTR[24+rsp] $L$SEH_epilogue_mul_by_3_mod_256:: mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue mov rsi,QWORD PTR[16+rsp] ifdef __SGX_LVI_HARDENING__ pop rdx lfence jmp rdx ud2 else DB 0F3h,0C3h endif $L$SEH_end_mul_by_3_mod_256:: mul_by_3_mod_256 ENDP ALIGN 32 __lshift_mod_256 PROC PRIVATE DB 243,15,30,250 add r8,r8 adc r9,r9 mov rax,r8 adc r10,r10 mov rsi,r9 adc r11,r11 sbb r12,r12 mov rbx,r10 sub r8,QWORD PTR[rcx] sbb r9,QWORD PTR[8+rcx] sbb r10,QWORD PTR[16+rcx] mov rbp,r11 sbb r11,QWORD PTR[24+rcx] sbb r12,0 cmovc r8,rax cmovc r9,rsi cmovc r10,rbx cmovc r11,rbp ifdef __SGX_LVI_HARDENING__ pop rax lfence jmp rax ud2 else DB 0F3h,0C3h endif __lshift_mod_256 ENDP PUBLIC lshift_mod_256 ALIGN 32 lshift_mod_256 PROC PUBLIC DB 243,15,30,250 mov QWORD PTR[8+rsp],rdi ;WIN64 prologue mov QWORD PTR[16+rsp],rsi mov r11,rsp $L$SEH_begin_lshift_mod_256:: push rbp mov rdi,rcx mov rsi,rdx mov rdx,r8 mov rcx,r9 push rbx push r12 
$L$SEH_body_lshift_mod_256:: ifdef __SGX_LVI_HARDENING__ lfence endif mov r8,QWORD PTR[rsi] mov r9,QWORD PTR[8+rsi] mov r10,QWORD PTR[16+rsi] mov r11,QWORD PTR[24+rsi] $L$oop_lshift_mod_256:: call __lshift_mod_256 dec edx jnz $L$oop_lshift_mod_256 mov QWORD PTR[rdi],r8 mov QWORD PTR[8+rdi],r9 mov QWORD PTR[16+rdi],r10 mov QWORD PTR[24+rdi],r11 mov r12,QWORD PTR[rsp] mov rbx,QWORD PTR[8+rsp] mov rbp,QWORD PTR[16+rsp] lea rsp,QWORD PTR[24+rsp] $L$SEH_epilogue_lshift_mod_256:: mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue mov rsi,QWORD PTR[16+rsp] ifdef __SGX_LVI_HARDENING__ pop rdx lfence jmp rdx ud2 else DB 0F3h,0C3h endif $L$SEH_end_lshift_mod_256:: lshift_mod_256 ENDP PUBLIC rshift_mod_256 ALIGN 32 rshift_mod_256 PROC PUBLIC DB 243,15,30,250 mov QWORD PTR[8+rsp],rdi ;WIN64 prologue mov QWORD PTR[16+rsp],rsi mov r11,rsp $L$SEH_begin_rshift_mod_256:: push rbp mov rdi,rcx mov rsi,rdx mov rdx,r8 mov rcx,r9 push rbx sub rsp,8 $L$SEH_body_rshift_mod_256:: ifdef __SGX_LVI_HARDENING__ lfence endif mov rbp,QWORD PTR[rsi] mov r9,QWORD PTR[8+rsi] mov r10,QWORD PTR[16+rsi] mov r11,QWORD PTR[24+rsi] $L$oop_rshift_mod_256:: mov r8,rbp and rbp,1 mov rax,QWORD PTR[rcx] neg rbp mov rsi,QWORD PTR[8+rcx] mov rbx,QWORD PTR[16+rcx] and rax,rbp and rsi,rbp and rbx,rbp and rbp,QWORD PTR[24+rcx] add r8,rax adc r9,rsi adc r10,rbx adc r11,rbp sbb rax,rax shr r8,1 mov rbp,r9 shr r9,1 mov rbx,r10 shr r10,1 mov rsi,r11 shr r11,1 shl rbp,63 shl rbx,63 or rbp,r8 shl rsi,63 or r9,rbx shl rax,63 or r10,rsi or r11,rax dec edx jnz $L$oop_rshift_mod_256 mov QWORD PTR[rdi],rbp mov QWORD PTR[8+rdi],r9 mov QWORD PTR[16+rdi],r10 mov QWORD PTR[24+rdi],r11 mov rbx,QWORD PTR[8+rsp] mov rbp,QWORD PTR[16+rsp] lea rsp,QWORD PTR[24+rsp] $L$SEH_epilogue_rshift_mod_256:: mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue mov rsi,QWORD PTR[16+rsp] ifdef __SGX_LVI_HARDENING__ pop rdx lfence jmp rdx ud2 else DB 0F3h,0C3h endif $L$SEH_end_rshift_mod_256:: rshift_mod_256 ENDP PUBLIC cneg_mod_256 ALIGN 32 cneg_mod_256 PROC PUBLIC DB 243,15,30,250 mov QWORD PTR[8+rsp],rdi ;WIN64 prologue mov QWORD PTR[16+rsp],rsi mov r11,rsp $L$SEH_begin_cneg_mod_256:: push rbp mov rdi,rcx mov rsi,rdx mov rdx,r8 mov rcx,r9 push rbx push r12 $L$SEH_body_cneg_mod_256:: ifdef __SGX_LVI_HARDENING__ lfence endif mov r12,QWORD PTR[rsi] mov r9,QWORD PTR[8+rsi] mov r10,QWORD PTR[16+rsi] mov r8,r12 mov r11,QWORD PTR[24+rsi] or r12,r9 or r12,r10 or r12,r11 mov rbp,-1 mov rax,QWORD PTR[rcx] cmovnz r12,rbp mov rsi,QWORD PTR[8+rcx] mov rbx,QWORD PTR[16+rcx] and rax,r12 mov rbp,QWORD PTR[24+rcx] and rsi,r12 and rbx,r12 and rbp,r12 sub rax,r8 sbb rsi,r9 sbb rbx,r10 sbb rbp,r11 or rdx,rdx cmovz rax,r8 cmovz rsi,r9 mov QWORD PTR[rdi],rax cmovz rbx,r10 mov QWORD PTR[8+rdi],rsi cmovz rbp,r11 mov QWORD PTR[16+rdi],rbx mov QWORD PTR[24+rdi],rbp mov r12,QWORD PTR[rsp] mov rbx,QWORD PTR[8+rsp] mov rbp,QWORD PTR[16+rsp] lea rsp,QWORD PTR[24+rsp] $L$SEH_epilogue_cneg_mod_256:: mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue mov rsi,QWORD PTR[16+rsp] ifdef __SGX_LVI_HARDENING__ pop rdx lfence jmp rdx ud2 else DB 0F3h,0C3h endif $L$SEH_end_cneg_mod_256:: cneg_mod_256 ENDP PUBLIC sub_mod_256 ALIGN 32 sub_mod_256 PROC PUBLIC DB 243,15,30,250 mov QWORD PTR[8+rsp],rdi ;WIN64 prologue mov QWORD PTR[16+rsp],rsi mov r11,rsp $L$SEH_begin_sub_mod_256:: push rbp mov rdi,rcx mov rsi,rdx mov rdx,r8 mov rcx,r9 push rbx sub rsp,8 $L$SEH_body_sub_mod_256:: ifdef __SGX_LVI_HARDENING__ lfence endif mov r8,QWORD PTR[rsi] mov r9,QWORD PTR[8+rsi] mov r10,QWORD PTR[16+rsi] mov r11,QWORD PTR[24+rsi] sub r8,QWORD PTR[rdx] mov 
rax,QWORD PTR[rcx] sbb r9,QWORD PTR[8+rdx] mov rsi,QWORD PTR[8+rcx] sbb r10,QWORD PTR[16+rdx] mov rbx,QWORD PTR[16+rcx] sbb r11,QWORD PTR[24+rdx] mov rbp,QWORD PTR[24+rcx] sbb rdx,rdx and rax,rdx and rsi,rdx and rbx,rdx and rbp,rdx add r8,rax adc r9,rsi mov QWORD PTR[rdi],r8 adc r10,rbx mov QWORD PTR[8+rdi],r9 adc r11,rbp mov QWORD PTR[16+rdi],r10 mov QWORD PTR[24+rdi],r11 mov rbx,QWORD PTR[8+rsp] mov rbp,QWORD PTR[16+rsp] lea rsp,QWORD PTR[24+rsp] $L$SEH_epilogue_sub_mod_256:: mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue mov rsi,QWORD PTR[16+rsp] ifdef __SGX_LVI_HARDENING__ pop rdx lfence jmp rdx ud2 else DB 0F3h,0C3h endif $L$SEH_end_sub_mod_256:: sub_mod_256 ENDP PUBLIC check_mod_256 ALIGN 32 check_mod_256 PROC PUBLIC DB 243,15,30,250 mov QWORD PTR[8+rsp],rdi ;WIN64 prologue mov QWORD PTR[16+rsp],rsi mov r11,rsp $L$SEH_begin_check_mod_256:: mov rdi,rcx mov rsi,rdx ifdef __SGX_LVI_HARDENING__ lfence endif mov rax,QWORD PTR[rdi] mov r9,QWORD PTR[8+rdi] mov r10,QWORD PTR[16+rdi] mov r11,QWORD PTR[24+rdi] mov r8,rax or rax,r9 or rax,r10 or rax,r11 sub r8,QWORD PTR[rsi] sbb r9,QWORD PTR[8+rsi] sbb r10,QWORD PTR[16+rsi] sbb r11,QWORD PTR[24+rsi] sbb rsi,rsi mov rdx,1 cmp rax,0 cmovne rax,rdx and rax,rsi $L$SEH_epilogue_check_mod_256:: mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue mov rsi,QWORD PTR[16+rsp] ifdef __SGX_LVI_HARDENING__ pop rdx lfence jmp rdx ud2 else DB 0F3h,0C3h endif $L$SEH_end_check_mod_256:: check_mod_256 ENDP PUBLIC add_n_check_mod_256 ALIGN 32 add_n_check_mod_256 PROC PUBLIC DB 243,15,30,250 mov QWORD PTR[8+rsp],rdi ;WIN64 prologue mov QWORD PTR[16+rsp],rsi mov r11,rsp $L$SEH_begin_add_n_check_mod_256:: push rbp mov rdi,rcx mov rsi,rdx mov rdx,r8 mov rcx,r9 push rbx sub rsp,8 $L$SEH_body_add_n_check_mod_256:: ifdef __SGX_LVI_HARDENING__ lfence endif mov r8,QWORD PTR[rsi] mov r9,QWORD PTR[8+rsi] mov r10,QWORD PTR[16+rsi] mov r11,QWORD PTR[24+rsi] add r8,QWORD PTR[rdx] adc r9,QWORD PTR[8+rdx] mov rax,r8 adc r10,QWORD PTR[16+rdx] mov rsi,r9 adc r11,QWORD PTR[24+rdx] sbb rdx,rdx mov rbx,r10 sub r8,QWORD PTR[rcx] sbb r9,QWORD PTR[8+rcx] sbb r10,QWORD PTR[16+rcx] mov rbp,r11 sbb r11,QWORD PTR[24+rcx] sbb rdx,0 cmovc r8,rax cmovc r9,rsi mov QWORD PTR[rdi],r8 cmovc r10,rbx mov QWORD PTR[8+rdi],r9 cmovc r11,rbp mov QWORD PTR[16+rdi],r10 mov QWORD PTR[24+rdi],r11 or r8,r9 or r10,r11 or r8,r10 mov rax,1 cmovz rax,r8 mov rbx,QWORD PTR[8+rsp] mov rbp,QWORD PTR[16+rsp] lea rsp,QWORD PTR[24+rsp] $L$SEH_epilogue_add_n_check_mod_256:: mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue mov rsi,QWORD PTR[16+rsp] ifdef __SGX_LVI_HARDENING__ pop rdx lfence jmp rdx ud2 else DB 0F3h,0C3h endif $L$SEH_end_add_n_check_mod_256:: add_n_check_mod_256 ENDP PUBLIC sub_n_check_mod_256 ALIGN 32 sub_n_check_mod_256 PROC PUBLIC DB 243,15,30,250 mov QWORD PTR[8+rsp],rdi ;WIN64 prologue mov QWORD PTR[16+rsp],rsi mov r11,rsp $L$SEH_begin_sub_n_check_mod_256:: push rbp mov rdi,rcx mov rsi,rdx mov rdx,r8 mov rcx,r9 push rbx sub rsp,8 $L$SEH_body_sub_n_check_mod_256:: ifdef __SGX_LVI_HARDENING__ lfence endif mov r8,QWORD PTR[rsi] mov r9,QWORD PTR[8+rsi] mov r10,QWORD PTR[16+rsi] mov r11,QWORD PTR[24+rsi] sub r8,QWORD PTR[rdx] mov rax,QWORD PTR[rcx] sbb r9,QWORD PTR[8+rdx] mov rsi,QWORD PTR[8+rcx] sbb r10,QWORD PTR[16+rdx] mov rbx,QWORD PTR[16+rcx] sbb r11,QWORD PTR[24+rdx] mov rbp,QWORD PTR[24+rcx] sbb rdx,rdx and rax,rdx and rsi,rdx and rbx,rdx and rbp,rdx add r8,rax adc r9,rsi mov QWORD PTR[rdi],r8 adc r10,rbx mov QWORD PTR[8+rdi],r9 adc r11,rbp mov QWORD PTR[16+rdi],r10 mov QWORD PTR[24+rdi],r11 or r8,r9 or r10,r11 or r8,r10 
mov rax,1 cmovz rax,r8 mov rbx,QWORD PTR[8+rsp] mov rbp,QWORD PTR[16+rsp] lea rsp,QWORD PTR[24+rsp] $L$SEH_epilogue_sub_n_check_mod_256:: mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue mov rsi,QWORD PTR[16+rsp] ifdef __SGX_LVI_HARDENING__ pop rdx lfence jmp rdx ud2 else DB 0F3h,0C3h endif $L$SEH_end_sub_n_check_mod_256:: sub_n_check_mod_256 ENDP .text$ ENDS .pdata SEGMENT READONLY ALIGN(4) ALIGN 4 DD imagerel $L$SEH_begin_add_mod_256 DD imagerel $L$SEH_body_add_mod_256 DD imagerel $L$SEH_info_add_mod_256_prologue DD imagerel $L$SEH_body_add_mod_256 DD imagerel $L$SEH_epilogue_add_mod_256 DD imagerel $L$SEH_info_add_mod_256_body DD imagerel $L$SEH_epilogue_add_mod_256 DD imagerel $L$SEH_end_add_mod_256 DD imagerel $L$SEH_info_add_mod_256_epilogue DD imagerel $L$SEH_begin_mul_by_3_mod_256 DD imagerel $L$SEH_body_mul_by_3_mod_256 DD imagerel $L$SEH_info_mul_by_3_mod_256_prologue DD imagerel $L$SEH_body_mul_by_3_mod_256 DD imagerel $L$SEH_epilogue_mul_by_3_mod_256 DD imagerel $L$SEH_info_mul_by_3_mod_256_body DD imagerel $L$SEH_epilogue_mul_by_3_mod_256 DD imagerel $L$SEH_end_mul_by_3_mod_256 DD imagerel $L$SEH_info_mul_by_3_mod_256_epilogue DD imagerel $L$SEH_begin_lshift_mod_256 DD imagerel $L$SEH_body_lshift_mod_256 DD imagerel $L$SEH_info_lshift_mod_256_prologue DD imagerel $L$SEH_body_lshift_mod_256 DD imagerel $L$SEH_epilogue_lshift_mod_256 DD imagerel $L$SEH_info_lshift_mod_256_body DD imagerel $L$SEH_epilogue_lshift_mod_256 DD imagerel $L$SEH_end_lshift_mod_256 DD imagerel $L$SEH_info_lshift_mod_256_epilogue DD imagerel $L$SEH_begin_rshift_mod_256 DD imagerel $L$SEH_body_rshift_mod_256 DD imagerel $L$SEH_info_rshift_mod_256_prologue DD imagerel $L$SEH_body_rshift_mod_256 DD imagerel $L$SEH_epilogue_rshift_mod_256 DD imagerel $L$SEH_info_rshift_mod_256_body DD imagerel $L$SEH_epilogue_rshift_mod_256 DD imagerel $L$SEH_end_rshift_mod_256 DD imagerel $L$SEH_info_rshift_mod_256_epilogue DD imagerel $L$SEH_begin_cneg_mod_256 DD imagerel $L$SEH_body_cneg_mod_256 DD imagerel $L$SEH_info_cneg_mod_256_prologue DD imagerel $L$SEH_body_cneg_mod_256 DD imagerel $L$SEH_epilogue_cneg_mod_256 DD imagerel $L$SEH_info_cneg_mod_256_body DD imagerel $L$SEH_epilogue_cneg_mod_256 DD imagerel $L$SEH_end_cneg_mod_256 DD imagerel $L$SEH_info_cneg_mod_256_epilogue DD imagerel $L$SEH_begin_sub_mod_256 DD imagerel $L$SEH_body_sub_mod_256 DD imagerel $L$SEH_info_sub_mod_256_prologue DD imagerel $L$SEH_body_sub_mod_256 DD imagerel $L$SEH_epilogue_sub_mod_256 DD imagerel $L$SEH_info_sub_mod_256_body DD imagerel $L$SEH_epilogue_sub_mod_256 DD imagerel $L$SEH_end_sub_mod_256 DD imagerel $L$SEH_info_sub_mod_256_epilogue DD imagerel $L$SEH_epilogue_check_mod_256 DD imagerel $L$SEH_end_check_mod_256 DD imagerel $L$SEH_info_check_mod_256_epilogue DD imagerel $L$SEH_begin_add_n_check_mod_256 DD imagerel $L$SEH_body_add_n_check_mod_256 DD imagerel $L$SEH_info_add_n_check_mod_256_prologue DD imagerel $L$SEH_body_add_n_check_mod_256 DD imagerel $L$SEH_epilogue_add_n_check_mod_256 DD imagerel $L$SEH_info_add_n_check_mod_256_body DD imagerel $L$SEH_epilogue_add_n_check_mod_256 DD imagerel $L$SEH_end_add_n_check_mod_256 DD imagerel $L$SEH_info_add_n_check_mod_256_epilogue DD imagerel $L$SEH_begin_sub_n_check_mod_256 DD imagerel $L$SEH_body_sub_n_check_mod_256 DD imagerel $L$SEH_info_sub_n_check_mod_256_prologue DD imagerel $L$SEH_body_sub_n_check_mod_256 DD imagerel $L$SEH_epilogue_sub_n_check_mod_256 DD imagerel $L$SEH_info_sub_n_check_mod_256_body DD imagerel $L$SEH_epilogue_sub_n_check_mod_256 DD imagerel 
$L$SEH_end_sub_n_check_mod_256 DD imagerel $L$SEH_info_sub_n_check_mod_256_epilogue .pdata ENDS .xdata SEGMENT READONLY ALIGN(8) ALIGN 8 $L$SEH_info_add_mod_256_prologue:: DB 1,0,5,00bh DB 0,074h,1,0 DB 0,064h,2,0 DB 0,0b3h DB 0,0 DD 0,0 $L$SEH_info_add_mod_256_body:: DB 1,0,9,0 DB 000h,034h,001h,000h DB 000h,054h,002h,000h DB 000h,074h,004h,000h DB 000h,064h,005h,000h DB 000h,022h DB 000h,000h,000h,000h,000h,000h DB 000h,000h,000h,000h $L$SEH_info_add_mod_256_epilogue:: DB 1,0,4,0 DB 000h,074h,001h,000h DB 000h,064h,002h,000h DB 000h,000h,000h,000h $L$SEH_info_mul_by_3_mod_256_prologue:: DB 1,0,5,00bh DB 0,074h,1,0 DB 0,064h,2,0 DB 0,0b3h DB 0,0 DD 0,0 $L$SEH_info_mul_by_3_mod_256_body:: DB 1,0,11,0 DB 000h,0c4h,000h,000h DB 000h,034h,001h,000h DB 000h,054h,002h,000h DB 000h,074h,004h,000h DB 000h,064h,005h,000h DB 000h,022h DB 000h,000h,000h,000h,000h,000h $L$SEH_info_mul_by_3_mod_256_epilogue:: DB 1,0,4,0 DB 000h,074h,001h,000h DB 000h,064h,002h,000h DB 000h,000h,000h,000h $L$SEH_info_lshift_mod_256_prologue:: DB 1,0,5,00bh DB 0,074h,1,0 DB 0,064h,2,0 DB 0,0b3h DB 0,0 DD 0,0 $L$SEH_info_lshift_mod_256_body:: DB 1,0,11,0 DB 000h,0c4h,000h,000h DB 000h,034h,001h,000h DB 000h,054h,002h,000h DB 000h,074h,004h,000h DB 000h,064h,005h,000h DB 000h,022h DB 000h,000h,000h,000h,000h,000h $L$SEH_info_lshift_mod_256_epilogue:: DB 1,0,4,0 DB 000h,074h,001h,000h DB 000h,064h,002h,000h DB 000h,000h,000h,000h $L$SEH_info_rshift_mod_256_prologue:: DB 1,0,5,00bh DB 0,074h,1,0 DB 0,064h,2,0 DB 0,0b3h DB 0,0 DD 0,0 $L$SEH_info_rshift_mod_256_body:: DB 1,0,9,0 DB 000h,034h,001h,000h DB 000h,054h,002h,000h DB 000h,074h,004h,000h DB 000h,064h,005h,000h DB 000h,022h DB 000h,000h,000h,000h,000h,000h DB 000h,000h,000h,000h $L$SEH_info_rshift_mod_256_epilogue:: DB 1,0,4,0 DB 000h,074h,001h,000h DB 000h,064h,002h,000h DB 000h,000h,000h,000h $L$SEH_info_cneg_mod_256_prologue:: DB 1,0,5,00bh DB 0,074h,1,0 DB 0,064h,2,0 DB 0,0b3h DB 0,0 DD 0,0 $L$SEH_info_cneg_mod_256_body:: DB 1,0,11,0 DB 000h,0c4h,000h,000h DB 000h,034h,001h,000h DB 000h,054h,002h,000h DB 000h,074h,004h,000h DB 000h,064h,005h,000h DB 000h,022h DB 000h,000h,000h,000h,000h,000h $L$SEH_info_cneg_mod_256_epilogue:: DB 1,0,4,0 DB 000h,074h,001h,000h DB 000h,064h,002h,000h DB 000h,000h,000h,000h $L$SEH_info_sub_mod_256_prologue:: DB 1,0,5,00bh DB 0,074h,1,0 DB 0,064h,2,0 DB 0,0b3h DB 0,0 DD 0,0 $L$SEH_info_sub_mod_256_body:: DB 1,0,9,0 DB 000h,034h,001h,000h DB 000h,054h,002h,000h DB 000h,074h,004h,000h DB 000h,064h,005h,000h DB 000h,022h DB 000h,000h,000h,000h,000h,000h DB 000h,000h,000h,000h $L$SEH_info_sub_mod_256_epilogue:: DB 1,0,4,0 DB 000h,074h,001h,000h DB 000h,064h,002h,000h DB 000h,000h,000h,000h $L$SEH_info_check_mod_256_epilogue:: DB 1,0,4,0 DB 000h,074h,001h,000h DB 000h,064h,002h,000h DB 000h,000h,000h,000h $L$SEH_info_add_n_check_mod_256_prologue:: DB 1,0,5,00bh DB 0,074h,1,0 DB 0,064h,2,0 DB 0,0b3h DB 0,0 DD 0,0 $L$SEH_info_add_n_check_mod_256_body:: DB 1,0,9,0 DB 000h,034h,001h,000h DB 000h,054h,002h,000h DB 000h,074h,004h,000h DB 000h,064h,005h,000h DB 000h,022h DB 000h,000h,000h,000h,000h,000h DB 000h,000h,000h,000h $L$SEH_info_add_n_check_mod_256_epilogue:: DB 1,0,4,0 DB 000h,074h,001h,000h DB 000h,064h,002h,000h DB 000h,000h,000h,000h $L$SEH_info_sub_n_check_mod_256_prologue:: DB 1,0,5,00bh DB 0,074h,1,0 DB 0,064h,2,0 DB 0,0b3h DB 0,0 DD 0,0 $L$SEH_info_sub_n_check_mod_256_body:: DB 1,0,9,0 DB 000h,034h,001h,000h DB 000h,054h,002h,000h DB 000h,074h,004h,000h DB 000h,064h,005h,000h DB 000h,022h DB 000h,000h,000h,000h,000h,000h DB 
000h,000h,000h,000h $L$SEH_info_sub_n_check_mod_256_epilogue:: DB 1,0,4,0 DB 000h,074h,001h,000h DB 000h,064h,002h,000h DB 000h,000h,000h,000h .xdata ENDS END ================================================ FILE: build/win64/add_mod_384-armv8.asm ================================================ GBLA __SIZEOF_POINTER__ __SIZEOF_POINTER__ SETA 64/8 AREA |.text|,CODE,ALIGN=8,ARM64 EXPORT |add_mod_384|[FUNC] ALIGN 32 |add_mod_384| PROC hint #25 stp x29,x30,[sp,#-6*__SIZEOF_POINTER__]! add x29,sp,#0 stp x19,x20,[sp,#2*__SIZEOF_POINTER__] stp x21,x22,[sp,#4*__SIZEOF_POINTER__] ldp x4,x5,[x3] ldp x6,x7,[x3,#16] ldp x8,x9,[x3,#32] bl __add_mod_384 ldr x30,[sp,#__SIZEOF_POINTER__] stp x10,x11,[x0] stp x12,x13,[x0,#16] stp x14,x15,[x0,#32] ldp x19,x20,[x29,#2*__SIZEOF_POINTER__] ldp x21,x22,[x29,#4*__SIZEOF_POINTER__] ldr x29,[sp],#6*__SIZEOF_POINTER__ hint #29 ret ENDP ALIGN 32 |__add_mod_384| PROC ldp x10,x11,[x1] ldp x16,x17,[x2] ldp x12,x13,[x1,#16] ldp x19,x20,[x2,#16] ldp x14,x15,[x1,#32] ldp x21,x22,[x2,#32] |__add_mod_384_ab_are_loaded| adds x10,x10,x16 adcs x11,x11,x17 adcs x12,x12,x19 adcs x13,x13,x20 adcs x14,x14,x21 adcs x15,x15,x22 adc x3,xzr,xzr subs x16,x10,x4 sbcs x17,x11,x5 sbcs x19,x12,x6 sbcs x20,x13,x7 sbcs x21,x14,x8 sbcs x22,x15,x9 sbcs xzr,x3,xzr csello x10,x10,x16 csello x11,x11,x17 csello x12,x12,x19 csello x13,x13,x20 csello x14,x14,x21 csello x15,x15,x22 ret ENDP EXPORT |add_mod_384x|[FUNC] ALIGN 32 |add_mod_384x| PROC hint #25 stp x29,x30,[sp,#-6*__SIZEOF_POINTER__]! add x29,sp,#0 stp x19,x20,[sp,#2*__SIZEOF_POINTER__] stp x21,x22,[sp,#4*__SIZEOF_POINTER__] ldp x4,x5,[x3] ldp x6,x7,[x3,#16] ldp x8,x9,[x3,#32] bl __add_mod_384 stp x10,x11,[x0] add x1,x1,#48 stp x12,x13,[x0,#16] add x2,x2,#48 stp x14,x15,[x0,#32] bl __add_mod_384 ldr x30,[sp,#__SIZEOF_POINTER__] stp x10,x11,[x0,#48] stp x12,x13,[x0,#64] stp x14,x15,[x0,#80] ldp x19,x20,[x29,#2*__SIZEOF_POINTER__] ldp x21,x22,[x29,#4*__SIZEOF_POINTER__] ldr x29,[sp],#6*__SIZEOF_POINTER__ hint #29 ret ENDP EXPORT |rshift_mod_384|[FUNC] ALIGN 32 |rshift_mod_384| PROC hint #25 stp x29,x30,[sp,#-6*__SIZEOF_POINTER__]! add x29,sp,#0 stp x19,x20,[sp,#2*__SIZEOF_POINTER__] stp x21,x22,[sp,#4*__SIZEOF_POINTER__] ldp x10,x11,[x1] ldp x12,x13,[x1,#16] ldp x14,x15,[x1,#32] ldp x4,x5,[x3] ldp x6,x7,[x3,#16] ldp x8,x9,[x3,#32] |$Loop_rshift_mod_384| sub x2,x2,#1 bl __rshift_mod_384 cbnz x2,|$Loop_rshift_mod_384| ldr x30,[sp,#__SIZEOF_POINTER__] stp x10,x11,[x0] stp x12,x13,[x0,#16] stp x14,x15,[x0,#32] ldp x19,x20,[x29,#2*__SIZEOF_POINTER__] ldp x21,x22,[x29,#4*__SIZEOF_POINTER__] ldr x29,[sp],#6*__SIZEOF_POINTER__ hint #29 ret ENDP ALIGN 32 |__rshift_mod_384| PROC sbfx x22,x10,#0,#1 and x16,x22,x4 and x17,x22,x5 adds x10,x10,x16 and x19,x22,x6 adcs x11,x11,x17 and x20,x22,x7 adcs x12,x12,x19 and x21,x22,x8 adcs x13,x13,x20 and x22,x22,x9 adcs x14,x14,x21 extr x10,x11,x10,#1 adcs x15,x15,x22 extr x11,x12,x11,#1 adc x22,xzr,xzr extr x12,x13,x12,#1 extr x13,x14,x13,#1 extr x14,x15,x14,#1 extr x15,x22,x15,#1 ret ENDP EXPORT |div_by_2_mod_384|[FUNC] ALIGN 32 |div_by_2_mod_384| PROC hint #25 stp x29,x30,[sp,#-6*__SIZEOF_POINTER__]! 
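; ---------------------------------------------------------------------------
; Illustrative C reference (not part of the generated file; hypothetical
; names, assumed 64-bit limbs): the __add_mod_384 and __lshift_mod_384
; helpers above share one branchless pattern - compute the full-width
; result with a carry chain, do a trial subtraction of the modulus, and
; use the final borrow to select the reduced or unreduced value (the
; csello chain). A minimal sketch of that technique:
;
;   typedef unsigned long long limb_t;
;
;   /* ret = (a + b) mod p for a, b < p; constant time */
;   static void add_mod_384_ref(limb_t ret[6], const limb_t a[6],
;                               const limb_t b[6], const limb_t p[6])
;   {
;       limb_t tmp[6], carry = 0, borrow = 0, mask;
;       for (int i = 0; i < 6; i++) {       /* 385-bit sum a + b */
;           limb_t t = a[i] + carry;
;           carry = t < a[i];
;           ret[i] = t + b[i];
;           carry += ret[i] < t;
;       }
;       for (int i = 0; i < 6; i++) {       /* trial subtraction of p */
;           limb_t t = ret[i] - borrow;
;           borrow = t > ret[i];
;           tmp[i] = t - p[i];
;           borrow += tmp[i] > t;
;       }
;       mask = (limb_t)0 - (limb_t)(carry < borrow); /* ~0 iff sum < p */
;       for (int i = 0; i < 6; i++)         /* branchless select */
;           ret[i] = (ret[i] & mask) | (tmp[i] & ~mask);
;   }
; ---------------------------------------------------------------------------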
add x29,sp,#0 stp x19,x20,[sp,#2*__SIZEOF_POINTER__] stp x21,x22,[sp,#4*__SIZEOF_POINTER__] ldp x10,x11,[x1] ldp x12,x13,[x1,#16] ldp x14,x15,[x1,#32] ldp x4,x5,[x2] ldp x6,x7,[x2,#16] ldp x8,x9,[x2,#32] bl __rshift_mod_384 ldr x30,[sp,#__SIZEOF_POINTER__] stp x10,x11,[x0] stp x12,x13,[x0,#16] stp x14,x15,[x0,#32] ldp x19,x20,[x29,#2*__SIZEOF_POINTER__] ldp x21,x22,[x29,#4*__SIZEOF_POINTER__] ldr x29,[sp],#6*__SIZEOF_POINTER__ hint #29 ret ENDP EXPORT |lshift_mod_384|[FUNC] ALIGN 32 |lshift_mod_384| PROC hint #25 stp x29,x30,[sp,#-6*__SIZEOF_POINTER__]! add x29,sp,#0 stp x19,x20,[sp,#2*__SIZEOF_POINTER__] stp x21,x22,[sp,#4*__SIZEOF_POINTER__] ldp x10,x11,[x1] ldp x12,x13,[x1,#16] ldp x14,x15,[x1,#32] ldp x4,x5,[x3] ldp x6,x7,[x3,#16] ldp x8,x9,[x3,#32] |$Loop_lshift_mod_384| sub x2,x2,#1 bl __lshift_mod_384 cbnz x2,|$Loop_lshift_mod_384| ldr x30,[sp,#__SIZEOF_POINTER__] stp x10,x11,[x0] stp x12,x13,[x0,#16] stp x14,x15,[x0,#32] ldp x19,x20,[x29,#2*__SIZEOF_POINTER__] ldp x21,x22,[x29,#4*__SIZEOF_POINTER__] ldr x29,[sp],#6*__SIZEOF_POINTER__ hint #29 ret ENDP ALIGN 32 |__lshift_mod_384| PROC adds x10,x10,x10 adcs x11,x11,x11 adcs x12,x12,x12 adcs x13,x13,x13 adcs x14,x14,x14 adcs x15,x15,x15 adc x3,xzr,xzr subs x16,x10,x4 sbcs x17,x11,x5 sbcs x19,x12,x6 sbcs x20,x13,x7 sbcs x21,x14,x8 sbcs x22,x15,x9 sbcs xzr,x3,xzr csello x10,x10,x16 csello x11,x11,x17 csello x12,x12,x19 csello x13,x13,x20 csello x14,x14,x21 csello x15,x15,x22 ret ENDP EXPORT |mul_by_3_mod_384|[FUNC] ALIGN 32 |mul_by_3_mod_384| PROC hint #25 stp x29,x30,[sp,#-6*__SIZEOF_POINTER__]! add x29,sp,#0 stp x19,x20,[sp,#2*__SIZEOF_POINTER__] stp x21,x22,[sp,#4*__SIZEOF_POINTER__] ldp x10,x11,[x1] ldp x12,x13,[x1,#16] ldp x14,x15,[x1,#32] ldp x4,x5,[x2] ldp x6,x7,[x2,#16] ldp x8,x9,[x2,#32] bl __lshift_mod_384 ldp x16,x17,[x1] ldp x19,x20,[x1,#16] ldp x21,x22,[x1,#32] bl __add_mod_384_ab_are_loaded ldr x30,[sp,#__SIZEOF_POINTER__] stp x10,x11,[x0] stp x12,x13,[x0,#16] stp x14,x15,[x0,#32] ldp x19,x20,[x29,#2*__SIZEOF_POINTER__] ldp x21,x22,[x29,#4*__SIZEOF_POINTER__] ldr x29,[sp],#6*__SIZEOF_POINTER__ hint #29 ret ENDP EXPORT |mul_by_8_mod_384|[FUNC] ALIGN 32 |mul_by_8_mod_384| PROC hint #25 stp x29,x30,[sp,#-6*__SIZEOF_POINTER__]! add x29,sp,#0 stp x19,x20,[sp,#2*__SIZEOF_POINTER__] stp x21,x22,[sp,#4*__SIZEOF_POINTER__] ldp x10,x11,[x1] ldp x12,x13,[x1,#16] ldp x14,x15,[x1,#32] ldp x4,x5,[x2] ldp x6,x7,[x2,#16] ldp x8,x9,[x2,#32] bl __lshift_mod_384 bl __lshift_mod_384 bl __lshift_mod_384 ldr x30,[sp,#__SIZEOF_POINTER__] stp x10,x11,[x0] stp x12,x13,[x0,#16] stp x14,x15,[x0,#32] ldp x19,x20,[x29,#2*__SIZEOF_POINTER__] ldp x21,x22,[x29,#4*__SIZEOF_POINTER__] ldr x29,[sp],#6*__SIZEOF_POINTER__ hint #29 ret ENDP EXPORT |mul_by_3_mod_384x|[FUNC] ALIGN 32 |mul_by_3_mod_384x| PROC hint #25 stp x29,x30,[sp,#-6*__SIZEOF_POINTER__]! 
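; ---------------------------------------------------------------------------
; Illustrative C reference (not part of the generated file): the small
; multiples above are built from the doubling primitive - mul_by_3 is one
; call to __lshift_mod_384 followed by __add_mod_384_ab_are_loaded, and
; mul_by_8 is three doublings in a row. With the hypothetical helpers
; sketched earlier:
;
;   void mul_by_3_mod_384_ref(limb_t r[6], const limb_t a[6],
;                             const limb_t p[6])
;   {
;       lshift_mod_384_ref(r, a, p);    /* r = 2a mod p */
;       add_mod_384_ref(r, r, a, p);    /* r = 3a mod p */
;   }
;
;   void mul_by_8_mod_384_ref(limb_t r[6], const limb_t a[6],
;                             const limb_t p[6])
;   {
;       lshift_mod_384_ref(r, a, p);    /* 2a */
;       lshift_mod_384_ref(r, r, p);    /* 4a */
;       lshift_mod_384_ref(r, r, p);    /* 8a */
;   }
;
; where lshift_mod_384_ref doubles with the same trial-subtract-and-select
; reduction as add_mod_384_ref. The _384x variants repeat this for the
; real and imaginary halves at byte offsets 0 and 48.
; ---------------------------------------------------------------------------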
add x29,sp,#0 stp x19,x20,[sp,#2*__SIZEOF_POINTER__] stp x21,x22,[sp,#4*__SIZEOF_POINTER__] ldp x10,x11,[x1] ldp x12,x13,[x1,#16] ldp x14,x15,[x1,#32] ldp x4,x5,[x2] ldp x6,x7,[x2,#16] ldp x8,x9,[x2,#32] bl __lshift_mod_384 ldp x16,x17,[x1] ldp x19,x20,[x1,#16] ldp x21,x22,[x1,#32] bl __add_mod_384_ab_are_loaded stp x10,x11,[x0] ldp x10,x11,[x1,#48] stp x12,x13,[x0,#16] ldp x12,x13,[x1,#64] stp x14,x15,[x0,#32] ldp x14,x15,[x1,#80] bl __lshift_mod_384 ldp x16,x17,[x1,#48] ldp x19,x20,[x1,#64] ldp x21,x22,[x1,#80] bl __add_mod_384_ab_are_loaded ldr x30,[sp,#__SIZEOF_POINTER__] stp x10,x11,[x0,#48] stp x12,x13,[x0,#64] stp x14,x15,[x0,#80] ldp x19,x20,[x29,#2*__SIZEOF_POINTER__] ldp x21,x22,[x29,#4*__SIZEOF_POINTER__] ldr x29,[sp],#6*__SIZEOF_POINTER__ hint #29 ret ENDP EXPORT |mul_by_8_mod_384x|[FUNC] ALIGN 32 |mul_by_8_mod_384x| PROC hint #25 stp x29,x30,[sp,#-6*__SIZEOF_POINTER__]! add x29,sp,#0 stp x19,x20,[sp,#2*__SIZEOF_POINTER__] stp x21,x22,[sp,#4*__SIZEOF_POINTER__] ldp x10,x11,[x1] ldp x12,x13,[x1,#16] ldp x14,x15,[x1,#32] ldp x4,x5,[x2] ldp x6,x7,[x2,#16] ldp x8,x9,[x2,#32] bl __lshift_mod_384 bl __lshift_mod_384 bl __lshift_mod_384 stp x10,x11,[x0] ldp x10,x11,[x1,#48] stp x12,x13,[x0,#16] ldp x12,x13,[x1,#64] stp x14,x15,[x0,#32] ldp x14,x15,[x1,#80] bl __lshift_mod_384 bl __lshift_mod_384 bl __lshift_mod_384 ldr x30,[sp,#__SIZEOF_POINTER__] stp x10,x11,[x0,#48] stp x12,x13,[x0,#64] stp x14,x15,[x0,#80] ldp x19,x20,[x29,#2*__SIZEOF_POINTER__] ldp x21,x22,[x29,#4*__SIZEOF_POINTER__] ldr x29,[sp],#6*__SIZEOF_POINTER__ hint #29 ret ENDP EXPORT |cneg_mod_384|[FUNC] ALIGN 32 |cneg_mod_384| PROC hint #25 stp x29,x30,[sp,#-6*__SIZEOF_POINTER__]! add x29,sp,#0 stp x19,x20,[sp,#2*__SIZEOF_POINTER__] stp x21,x22,[sp,#4*__SIZEOF_POINTER__] ldp x10,x11,[x1] ldp x4,x5,[x3] ldp x12,x13,[x1,#16] ldp x6,x7,[x3,#16] subs x16,x4,x10 ldp x14,x15,[x1,#32] ldp x8,x9,[x3,#32] orr x3,x10,x11 sbcs x17,x5,x11 orr x3,x3,x12 sbcs x19,x6,x12 orr x3,x3,x13 sbcs x20,x7,x13 orr x3,x3,x14 sbcs x21,x8,x14 orr x3,x3,x15 sbc x22,x9,x15 cmp x3,#0 csetmne x3 ands x2,x2,x3 cseleq x10,x10,x16 cseleq x11,x11,x17 cseleq x12,x12,x19 cseleq x13,x13,x20 stp x10,x11,[x0] cseleq x14,x14,x21 stp x12,x13,[x0,#16] cseleq x15,x15,x22 stp x14,x15,[x0,#32] ldp x19,x20,[x29,#2*__SIZEOF_POINTER__] ldp x21,x22,[x29,#4*__SIZEOF_POINTER__] ldr x29,[sp],#6*__SIZEOF_POINTER__ hint #29 ret ENDP EXPORT |sub_mod_384|[FUNC] ALIGN 32 |sub_mod_384| PROC hint #25 stp x29,x30,[sp,#-6*__SIZEOF_POINTER__]! add x29,sp,#0 stp x19,x20,[sp,#2*__SIZEOF_POINTER__] stp x21,x22,[sp,#4*__SIZEOF_POINTER__] ldp x4,x5,[x3] ldp x6,x7,[x3,#16] ldp x8,x9,[x3,#32] bl __sub_mod_384 ldr x30,[sp,#__SIZEOF_POINTER__] stp x10,x11,[x0] stp x12,x13,[x0,#16] stp x14,x15,[x0,#32] ldp x19,x20,[x29,#2*__SIZEOF_POINTER__] ldp x21,x22,[x29,#4*__SIZEOF_POINTER__] ldr x29,[sp],#6*__SIZEOF_POINTER__ hint #29 ret ENDP ALIGN 32 |__sub_mod_384| PROC ldp x10,x11,[x1] ldp x16,x17,[x2] ldp x12,x13,[x1,#16] ldp x19,x20,[x2,#16] ldp x14,x15,[x1,#32] ldp x21,x22,[x2,#32] subs x10,x10,x16 sbcs x11,x11,x17 sbcs x12,x12,x19 sbcs x13,x13,x20 sbcs x14,x14,x21 sbcs x15,x15,x22 sbc x3,xzr,xzr and x16,x4,x3 and x17,x5,x3 adds x10,x10,x16 and x19,x6,x3 adcs x11,x11,x17 and x20,x7,x3 adcs x12,x12,x19 and x21,x8,x3 adcs x13,x13,x20 and x22,x9,x3 adcs x14,x14,x21 adc x15,x15,x22 ret ENDP EXPORT |sub_mod_384x|[FUNC] ALIGN 32 |sub_mod_384x| PROC hint #25 stp x29,x30,[sp,#-6*__SIZEOF_POINTER__]! 
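; ---------------------------------------------------------------------------
; Illustrative C reference (not part of the generated file; hypothetical
; names): __sub_mod_384 above mirrors the addition helper - subtract
; limb-wise, turn the final borrow into an all-ones mask, and add the
; modulus back under that mask. A minimal sketch:
;
;   /* ret = (a - b) mod p for a, b < p; constant time */
;   static void sub_mod_384_ref(limb_t ret[6], const limb_t a[6],
;                               const limb_t b[6], const limb_t p[6])
;   {
;       limb_t borrow = 0, carry = 0, mask;
;       for (int i = 0; i < 6; i++) {
;           limb_t t = a[i] - borrow;
;           borrow = t > a[i];
;           ret[i] = t - b[i];
;           borrow += ret[i] > t;
;       }
;       mask = (limb_t)0 - borrow;        /* ~0 iff a < b */
;       for (int i = 0; i < 6; i++) {     /* add p back only on borrow */
;           limb_t t = ret[i] + carry;
;           carry = t < ret[i];
;           ret[i] = t + (p[i] & mask);
;           carry += ret[i] < t;
;       }                                 /* final carry cancels the borrow */
;   }
; ---------------------------------------------------------------------------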
add x29,sp,#0 stp x19,x20,[sp,#2*__SIZEOF_POINTER__] stp x21,x22,[sp,#4*__SIZEOF_POINTER__] ldp x4,x5,[x3] ldp x6,x7,[x3,#16] ldp x8,x9,[x3,#32] bl __sub_mod_384 stp x10,x11,[x0] add x1,x1,#48 stp x12,x13,[x0,#16] add x2,x2,#48 stp x14,x15,[x0,#32] bl __sub_mod_384 ldr x30,[sp,#__SIZEOF_POINTER__] stp x10,x11,[x0,#48] stp x12,x13,[x0,#64] stp x14,x15,[x0,#80] ldp x19,x20,[x29,#2*__SIZEOF_POINTER__] ldp x21,x22,[x29,#4*__SIZEOF_POINTER__] ldr x29,[sp],#6*__SIZEOF_POINTER__ hint #29 ret ENDP EXPORT |mul_by_1_plus_i_mod_384x|[FUNC] ALIGN 32 |mul_by_1_plus_i_mod_384x| PROC hint #25 stp x29,x30,[sp,#-6*__SIZEOF_POINTER__]! add x29,sp,#0 stp x19,x20,[sp,#2*__SIZEOF_POINTER__] stp x21,x22,[sp,#4*__SIZEOF_POINTER__] ldp x4,x5,[x2] ldp x6,x7,[x2,#16] ldp x8,x9,[x2,#32] add x2,x1,#48 bl __sub_mod_384 ldp x16,x17,[x1] ldp x19,x20,[x1,#16] ldp x21,x22,[x1,#32] stp x10,x11,[x0] ldp x10,x11,[x1,#48] stp x12,x13,[x0,#16] ldp x12,x13,[x1,#64] stp x14,x15,[x0,#32] ldp x14,x15,[x1,#80] bl __add_mod_384_ab_are_loaded ldr x30,[sp,#__SIZEOF_POINTER__] stp x10,x11,[x0,#48] stp x12,x13,[x0,#64] stp x14,x15,[x0,#80] ldp x19,x20,[x29,#2*__SIZEOF_POINTER__] ldp x21,x22,[x29,#4*__SIZEOF_POINTER__] ldr x29,[sp],#6*__SIZEOF_POINTER__ hint #29 ret ENDP EXPORT |sgn0_pty_mod_384|[FUNC] ALIGN 32 |sgn0_pty_mod_384| PROC hint #34 ldp x10,x11,[x0] ldp x12,x13,[x0,#16] ldp x14,x15,[x0,#32] ldp x4,x5,[x1] ldp x6,x7,[x1,#16] ldp x8,x9,[x1,#32] and x0,x10,#1 adds x10,x10,x10 adcs x11,x11,x11 adcs x12,x12,x12 adcs x13,x13,x13 adcs x14,x14,x14 adcs x15,x15,x15 adc x3,xzr,xzr subs x10,x10,x4 sbcs x11,x11,x5 sbcs x12,x12,x6 sbcs x13,x13,x7 sbcs x14,x14,x8 sbcs x15,x15,x9 sbc x3,x3,xzr mvn x3,x3 and x3,x3,#2 orr x0,x0,x3 ret ENDP EXPORT |sgn0_pty_mod_384x|[FUNC] ALIGN 32 |sgn0_pty_mod_384x| PROC hint #34 ldp x10,x11,[x0] ldp x12,x13,[x0,#16] ldp x14,x15,[x0,#32] ldp x4,x5,[x1] ldp x6,x7,[x1,#16] ldp x8,x9,[x1,#32] and x2,x10,#1 orr x3,x10,x11 adds x10,x10,x10 orr x3,x3,x12 adcs x11,x11,x11 orr x3,x3,x13 adcs x12,x12,x12 orr x3,x3,x14 adcs x13,x13,x13 orr x3,x3,x15 adcs x14,x14,x14 adcs x15,x15,x15 adc x16,xzr,xzr subs x10,x10,x4 sbcs x11,x11,x5 sbcs x12,x12,x6 sbcs x13,x13,x7 sbcs x14,x14,x8 sbcs x15,x15,x9 sbc x16,x16,xzr ldp x10,x11,[x0,#48] ldp x12,x13,[x0,#64] ldp x14,x15,[x0,#80] mvn x16,x16 and x16,x16,#2 orr x2,x2,x16 and x0,x10,#1 orr x1,x10,x11 adds x10,x10,x10 orr x1,x1,x12 adcs x11,x11,x11 orr x1,x1,x13 adcs x12,x12,x12 orr x1,x1,x14 adcs x13,x13,x13 orr x1,x1,x15 adcs x14,x14,x14 adcs x15,x15,x15 adc x16,xzr,xzr subs x10,x10,x4 sbcs x11,x11,x5 sbcs x12,x12,x6 sbcs x13,x13,x7 sbcs x14,x14,x8 sbcs x15,x15,x9 sbc x16,x16,xzr mvn x16,x16 and x16,x16,#2 orr x0,x0,x16 cmp x3,#0 cseleq x3,x0,x2 cmp x1,#0 cselne x1,x0,x2 and x3,x3,#1 and x1,x1,#2 orr x0,x1,x3 ret ENDP EXPORT |vec_select_32|[FUNC] ALIGN 32 |vec_select_32| PROC hint #34 dup v6.2d, x3 ld1 {v0.2d, v1.2d}, [x1] cmeq v6.2d, v6.2d, #0 ld1 {v3.2d, v4.2d}, [x2] bit v0.16b, v3.16b, v6.16b bit v1.16b, v4.16b, v6.16b st1 {v0.2d, v1.2d}, [x0] ret ENDP EXPORT |vec_select_48|[FUNC] ALIGN 32 |vec_select_48| PROC hint #34 dup v6.2d, x3 ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 cmeq v6.2d, v6.2d, #0 ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 bit v0.16b, v3.16b, v6.16b bit v1.16b, v4.16b, v6.16b bit v2.16b, v5.16b, v6.16b st1 {v0.2d, v1.2d, v2.2d}, [x0] ret ENDP EXPORT |vec_select_96|[FUNC] ALIGN 32 |vec_select_96| PROC hint #34 dup v6.2d, x3 ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 cmeq v6.2d, v6.2d, #0 ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 bit v0.16b, v3.16b, v6.16b ld1 {v16.2d, v17.2d, v18.2d}, 
[x1],#48 bit v1.16b, v4.16b, v6.16b ld1 {v19.2d, v20.2d, v21.2d}, [x2],#48 bit v2.16b, v5.16b, v6.16b st1 {v0.2d, v1.2d, v2.2d}, [x0],#48 bit v16.16b, v19.16b, v6.16b bit v17.16b, v20.16b, v6.16b bit v18.16b, v21.16b, v6.16b st1 {v16.2d, v17.2d, v18.2d}, [x0] ret ENDP EXPORT |vec_select_192|[FUNC] ALIGN 32 |vec_select_192| PROC hint #34 dup v6.2d, x3 ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 cmeq v6.2d, v6.2d, #0 ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 bit v0.16b, v3.16b, v6.16b ld1 {v16.2d, v17.2d, v18.2d}, [x1],#48 bit v1.16b, v4.16b, v6.16b ld1 {v19.2d, v20.2d, v21.2d}, [x2],#48 bit v2.16b, v5.16b, v6.16b st1 {v0.2d, v1.2d, v2.2d}, [x0],#48 bit v16.16b, v19.16b, v6.16b ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 bit v17.16b, v20.16b, v6.16b ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 bit v18.16b, v21.16b, v6.16b st1 {v16.2d, v17.2d, v18.2d}, [x0],#48 bit v0.16b, v3.16b, v6.16b ld1 {v16.2d, v17.2d, v18.2d}, [x1],#48 bit v1.16b, v4.16b, v6.16b ld1 {v19.2d, v20.2d, v21.2d}, [x2],#48 bit v2.16b, v5.16b, v6.16b st1 {v0.2d, v1.2d, v2.2d}, [x0],#48 bit v16.16b, v19.16b, v6.16b bit v17.16b, v20.16b, v6.16b bit v18.16b, v21.16b, v6.16b st1 {v16.2d, v17.2d, v18.2d}, [x0] ret ENDP EXPORT |vec_select_144|[FUNC] ALIGN 32 |vec_select_144| PROC hint #34 dup v6.2d, x3 ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 cmeq v6.2d, v6.2d, #0 ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 bit v0.16b, v3.16b, v6.16b ld1 {v16.2d, v17.2d, v18.2d}, [x1],#48 bit v1.16b, v4.16b, v6.16b ld1 {v19.2d, v20.2d, v21.2d}, [x2],#48 bit v2.16b, v5.16b, v6.16b st1 {v0.2d, v1.2d, v2.2d}, [x0],#48 bit v16.16b, v19.16b, v6.16b ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 bit v17.16b, v20.16b, v6.16b ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 bit v18.16b, v21.16b, v6.16b st1 {v16.2d, v17.2d, v18.2d}, [x0],#48 bit v0.16b, v3.16b, v6.16b bit v1.16b, v4.16b, v6.16b bit v2.16b, v5.16b, v6.16b st1 {v0.2d, v1.2d, v2.2d}, [x0] ret ENDP EXPORT |vec_select_288|[FUNC] ALIGN 32 |vec_select_288| PROC hint #34 dup v6.2d, x3 ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 cmeq v6.2d, v6.2d, #0 ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 bit v0.16b, v3.16b, v6.16b ld1 {v16.2d, v17.2d, v18.2d}, [x1],#48 bit v1.16b, v4.16b, v6.16b ld1 {v19.2d, v20.2d, v21.2d}, [x2],#48 bit v2.16b, v5.16b, v6.16b st1 {v0.2d, v1.2d, v2.2d}, [x0],#48 bit v16.16b, v19.16b, v6.16b ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 bit v17.16b, v20.16b, v6.16b ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 bit v18.16b, v21.16b, v6.16b st1 {v16.2d, v17.2d, v18.2d}, [x0],#48 bit v0.16b, v3.16b, v6.16b ld1 {v16.2d, v17.2d, v18.2d}, [x1],#48 bit v1.16b, v4.16b, v6.16b ld1 {v19.2d, v20.2d, v21.2d}, [x2],#48 bit v2.16b, v5.16b, v6.16b st1 {v0.2d, v1.2d, v2.2d}, [x0],#48 bit v16.16b, v19.16b, v6.16b ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 bit v17.16b, v20.16b, v6.16b ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 bit v18.16b, v21.16b, v6.16b st1 {v16.2d, v17.2d, v18.2d}, [x0],#48 bit v0.16b, v3.16b, v6.16b ld1 {v16.2d, v17.2d, v18.2d}, [x1],#48 bit v1.16b, v4.16b, v6.16b ld1 {v19.2d, v20.2d, v21.2d}, [x2],#48 bit v2.16b, v5.16b, v6.16b st1 {v0.2d, v1.2d, v2.2d}, [x0],#48 bit v16.16b, v19.16b, v6.16b bit v17.16b, v20.16b, v6.16b bit v18.16b, v21.16b, v6.16b st1 {v16.2d, v17.2d, v18.2d}, [x0] ret ENDP EXPORT |vec_prefetch|[FUNC] ALIGN 32 |vec_prefetch| PROC hint #34 add x1, x1, x0 sub x1, x1, #1 mov x2, #64 prfm pldl1keep, [x0] add x0, x0, x2 cmp x0, x1 cselhi x0,x1,x0 cselhi x2,xzr,x2 prfm pldl1keep, [x0] add x0, x0, x2 cmp x0, x1 cselhi x0,x1,x0 cselhi x2,xzr,x2 prfm pldl1keep, [x0] add x0, x0, x2 cmp x0, x1 cselhi x0,x1,x0 cselhi x2,xzr,x2 prfm pldl1keep, [x0] add x0, x0, x2 cmp x0, x1 cselhi 
x0,x1,x0 cselhi x2,xzr,x2 prfm pldl1keep, [x0] add x0, x0, x2 cmp x0, x1 cselhi x0,x1,x0 cselhi x2,xzr,x2 prfm pldl1keep, [x0] add x0, x0, x2 cmp x0, x1 cselhi x0,x1,x0 prfm pldl1keep, [x0] ret ENDP EXPORT |vec_is_zero_16x|[FUNC] ALIGN 32 |vec_is_zero_16x| PROC hint #34 ld1 {v0.2d}, [x0], #16 lsr x1, x1, #4 sub x1, x1, #1 cbz x1, |$Loop_is_zero_done| |$Loop_is_zero| ld1 {v1.2d}, [x0], #16 orr v0.16b, v0.16b, v1.16b sub x1, x1, #1 cbnz x1, |$Loop_is_zero| |$Loop_is_zero_done| dup v1.2d, v0.d[1] orr v0.16b, v0.16b, v1.16b umov x1, v0.d[0] mov x0, #1 cmp x1, #0 cseleq x0,x0,xzr ret ENDP EXPORT |vec_is_equal_16x|[FUNC] ALIGN 32 |vec_is_equal_16x| PROC hint #34 ld1 {v0.2d}, [x0], #16 ld1 {v1.2d}, [x1], #16 lsr x2, x2, #4 eor v0.16b, v0.16b, v1.16b |$Loop_is_equal| sub x2, x2, #1 cbz x2, |$Loop_is_equal_done| ld1 {v1.2d}, [x0], #16 ld1 {v2.2d}, [x1], #16 eor v1.16b, v1.16b, v2.16b orr v0.16b, v0.16b, v1.16b b |$Loop_is_equal| nop |$Loop_is_equal_done| dup v1.2d, v0.d[1] orr v0.16b, v0.16b, v1.16b umov x1, v0.d[0] mov x0, #1 cmp x1, #0 cseleq x0,x0,xzr ret ENDP END ================================================ FILE: build/win64/add_mod_384-x86_64.asm ================================================ OPTION DOTNAME .text$ SEGMENT ALIGN(256) 'CODE' PUBLIC add_mod_384 ALIGN 32 add_mod_384 PROC PUBLIC DB 243,15,30,250 mov QWORD PTR[8+rsp],rdi ;WIN64 prologue mov QWORD PTR[16+rsp],rsi mov r11,rsp $L$SEH_begin_add_mod_384:: push rbp mov rdi,rcx mov rsi,rdx mov rdx,r8 mov rcx,r9 push rbx push r12 push r13 push r14 push r15 sub rsp,8 $L$SEH_body_add_mod_384:: call __add_mod_384 mov r15,QWORD PTR[8+rsp] mov r14,QWORD PTR[16+rsp] mov r13,QWORD PTR[24+rsp] mov r12,QWORD PTR[32+rsp] mov rbx,QWORD PTR[40+rsp] mov rbp,QWORD PTR[48+rsp] lea rsp,QWORD PTR[56+rsp] $L$SEH_epilogue_add_mod_384:: mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue mov rsi,QWORD PTR[16+rsp] ifdef __SGX_LVI_HARDENING__ pop rdx lfence jmp rdx ud2 else DB 0F3h,0C3h endif $L$SEH_end_add_mod_384:: add_mod_384 ENDP ALIGN 32 __add_mod_384 PROC PRIVATE DB 243,15,30,250 ifdef __SGX_LVI_HARDENING__ lfence endif mov r8,QWORD PTR[rsi] mov r9,QWORD PTR[8+rsi] mov r10,QWORD PTR[16+rsi] mov r11,QWORD PTR[24+rsi] mov r12,QWORD PTR[32+rsi] mov r13,QWORD PTR[40+rsi] __add_mod_384_a_is_loaded:: add r8,QWORD PTR[rdx] adc r9,QWORD PTR[8+rdx] adc r10,QWORD PTR[16+rdx] mov r14,r8 adc r11,QWORD PTR[24+rdx] mov r15,r9 adc r12,QWORD PTR[32+rdx] mov rax,r10 adc r13,QWORD PTR[40+rdx] mov rbx,r11 sbb rdx,rdx sub r8,QWORD PTR[rcx] sbb r9,QWORD PTR[8+rcx] mov rbp,r12 sbb r10,QWORD PTR[16+rcx] sbb r11,QWORD PTR[24+rcx] sbb r12,QWORD PTR[32+rcx] mov rsi,r13 sbb r13,QWORD PTR[40+rcx] sbb rdx,0 cmovc r8,r14 cmovc r9,r15 cmovc r10,rax mov QWORD PTR[rdi],r8 cmovc r11,rbx mov QWORD PTR[8+rdi],r9 cmovc r12,rbp mov QWORD PTR[16+rdi],r10 cmovc r13,rsi mov QWORD PTR[24+rdi],r11 mov QWORD PTR[32+rdi],r12 mov QWORD PTR[40+rdi],r13 ifdef __SGX_LVI_HARDENING__ pop rdx lfence jmp rdx ud2 else DB 0F3h,0C3h endif __add_mod_384 ENDP PUBLIC add_mod_384x ALIGN 32 add_mod_384x PROC PUBLIC DB 243,15,30,250 mov QWORD PTR[8+rsp],rdi ;WIN64 prologue mov QWORD PTR[16+rsp],rsi mov r11,rsp $L$SEH_begin_add_mod_384x:: push rbp mov rdi,rcx mov rsi,rdx mov rdx,r8 mov rcx,r9 push rbx push r12 push r13 push r14 push r15 sub rsp,24 $L$SEH_body_add_mod_384x:: mov QWORD PTR[rsp],rsi mov QWORD PTR[8+rsp],rdx lea rsi,QWORD PTR[48+rsi] lea rdx,QWORD PTR[48+rdx] lea rdi,QWORD PTR[48+rdi] call __add_mod_384 mov rsi,QWORD PTR[rsp] mov rdx,QWORD PTR[8+rsp] lea rdi,QWORD PTR[((-48))+rdi] call __add_mod_384 mov 
r15,QWORD PTR[((24+0))+rsp] mov r14,QWORD PTR[((24+8))+rsp] mov r13,QWORD PTR[((24+16))+rsp] mov r12,QWORD PTR[((24+24))+rsp] mov rbx,QWORD PTR[((24+32))+rsp] mov rbp,QWORD PTR[((24+40))+rsp] lea rsp,QWORD PTR[((24+48))+rsp] $L$SEH_epilogue_add_mod_384x:: mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue mov rsi,QWORD PTR[16+rsp] ifdef __SGX_LVI_HARDENING__ pop rdx lfence jmp rdx ud2 else DB 0F3h,0C3h endif $L$SEH_end_add_mod_384x:: add_mod_384x ENDP PUBLIC rshift_mod_384 ALIGN 32 rshift_mod_384 PROC PUBLIC DB 243,15,30,250 mov QWORD PTR[8+rsp],rdi ;WIN64 prologue mov QWORD PTR[16+rsp],rsi mov r11,rsp $L$SEH_begin_rshift_mod_384:: push rbp mov rdi,rcx mov rsi,rdx mov rdx,r8 mov rcx,r9 push rbx push r12 push r13 push r14 push r15 push rdi $L$SEH_body_rshift_mod_384:: ifdef __SGX_LVI_HARDENING__ lfence endif mov r8,QWORD PTR[rsi] mov r9,QWORD PTR[8+rsi] mov r10,QWORD PTR[16+rsi] mov r11,QWORD PTR[24+rsi] mov r12,QWORD PTR[32+rsi] mov r13,QWORD PTR[40+rsi] $L$oop_rshift_mod_384:: call __rshift_mod_384 dec edx jnz $L$oop_rshift_mod_384 mov QWORD PTR[rdi],r8 mov QWORD PTR[8+rdi],r9 mov QWORD PTR[16+rdi],r10 mov QWORD PTR[24+rdi],r11 mov QWORD PTR[32+rdi],r12 mov QWORD PTR[40+rdi],r13 mov r15,QWORD PTR[8+rsp] mov r14,QWORD PTR[16+rsp] mov r13,QWORD PTR[24+rsp] mov r12,QWORD PTR[32+rsp] mov rbx,QWORD PTR[40+rsp] mov rbp,QWORD PTR[48+rsp] lea rsp,QWORD PTR[56+rsp] $L$SEH_epilogue_rshift_mod_384:: mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue mov rsi,QWORD PTR[16+rsp] ifdef __SGX_LVI_HARDENING__ pop rdx lfence jmp rdx ud2 else DB 0F3h,0C3h endif $L$SEH_end_rshift_mod_384:: rshift_mod_384 ENDP ALIGN 32 __rshift_mod_384 PROC PRIVATE DB 243,15,30,250 mov rsi,1 mov r14,QWORD PTR[rcx] and rsi,r8 mov r15,QWORD PTR[8+rcx] neg rsi mov rax,QWORD PTR[16+rcx] and r14,rsi mov rbx,QWORD PTR[24+rcx] and r15,rsi mov rbp,QWORD PTR[32+rcx] and rax,rsi and rbx,rsi and rbp,rsi and rsi,QWORD PTR[40+rcx] add r14,r8 adc r15,r9 adc rax,r10 adc rbx,r11 adc rbp,r12 adc rsi,r13 sbb r13,r13 shr r14,1 mov r8,r15 shr r15,1 mov r9,rax shr rax,1 mov r10,rbx shr rbx,1 mov r11,rbp shr rbp,1 mov r12,rsi shr rsi,1 shl r8,63 shl r9,63 or r8,r14 shl r10,63 or r9,r15 shl r11,63 or r10,rax shl r12,63 or r11,rbx shl r13,63 or r12,rbp or r13,rsi ifdef __SGX_LVI_HARDENING__ pop r14 lfence jmp r14 ud2 else DB 0F3h,0C3h endif __rshift_mod_384 ENDP PUBLIC div_by_2_mod_384 ALIGN 32 div_by_2_mod_384 PROC PUBLIC DB 243,15,30,250 mov QWORD PTR[8+rsp],rdi ;WIN64 prologue mov QWORD PTR[16+rsp],rsi mov r11,rsp $L$SEH_begin_div_by_2_mod_384:: push rbp mov rdi,rcx mov rsi,rdx mov rdx,r8 push rbx push r12 push r13 push r14 push r15 push rdi $L$SEH_body_div_by_2_mod_384:: ifdef __SGX_LVI_HARDENING__ lfence endif mov r8,QWORD PTR[rsi] mov rcx,rdx mov r9,QWORD PTR[8+rsi] mov r10,QWORD PTR[16+rsi] mov r11,QWORD PTR[24+rsi] mov r12,QWORD PTR[32+rsi] mov r13,QWORD PTR[40+rsi] call __rshift_mod_384 mov QWORD PTR[rdi],r8 mov QWORD PTR[8+rdi],r9 mov QWORD PTR[16+rdi],r10 mov QWORD PTR[24+rdi],r11 mov QWORD PTR[32+rdi],r12 mov QWORD PTR[40+rdi],r13 mov r15,QWORD PTR[8+rsp] mov r14,QWORD PTR[16+rsp] mov r13,QWORD PTR[24+rsp] mov r12,QWORD PTR[32+rsp] mov rbx,QWORD PTR[40+rsp] mov rbp,QWORD PTR[48+rsp] lea rsp,QWORD PTR[56+rsp] $L$SEH_epilogue_div_by_2_mod_384:: mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue mov rsi,QWORD PTR[16+rsp] ifdef __SGX_LVI_HARDENING__ pop rdx lfence jmp rdx ud2 else DB 0F3h,0C3h endif $L$SEH_end_div_by_2_mod_384:: div_by_2_mod_384 ENDP PUBLIC lshift_mod_384 ALIGN 32 lshift_mod_384 PROC PUBLIC DB 243,15,30,250 mov QWORD PTR[8+rsp],rdi ;WIN64 prologue mov 
QWORD PTR[16+rsp],rsi mov r11,rsp $L$SEH_begin_lshift_mod_384:: push rbp mov rdi,rcx mov rsi,rdx mov rdx,r8 mov rcx,r9 push rbx push r12 push r13 push r14 push r15 push rdi $L$SEH_body_lshift_mod_384:: ifdef __SGX_LVI_HARDENING__ lfence endif mov r8,QWORD PTR[rsi] mov r9,QWORD PTR[8+rsi] mov r10,QWORD PTR[16+rsi] mov r11,QWORD PTR[24+rsi] mov r12,QWORD PTR[32+rsi] mov r13,QWORD PTR[40+rsi] $L$oop_lshift_mod_384:: add r8,r8 adc r9,r9 adc r10,r10 mov r14,r8 adc r11,r11 mov r15,r9 adc r12,r12 mov rax,r10 adc r13,r13 mov rbx,r11 sbb rdi,rdi sub r8,QWORD PTR[rcx] sbb r9,QWORD PTR[8+rcx] mov rbp,r12 sbb r10,QWORD PTR[16+rcx] sbb r11,QWORD PTR[24+rcx] sbb r12,QWORD PTR[32+rcx] mov rsi,r13 sbb r13,QWORD PTR[40+rcx] sbb rdi,0 mov rdi,QWORD PTR[rsp] cmovc r8,r14 cmovc r9,r15 cmovc r10,rax cmovc r11,rbx cmovc r12,rbp cmovc r13,rsi dec edx jnz $L$oop_lshift_mod_384 mov QWORD PTR[rdi],r8 mov QWORD PTR[8+rdi],r9 mov QWORD PTR[16+rdi],r10 mov QWORD PTR[24+rdi],r11 mov QWORD PTR[32+rdi],r12 mov QWORD PTR[40+rdi],r13 mov r15,QWORD PTR[8+rsp] mov r14,QWORD PTR[16+rsp] mov r13,QWORD PTR[24+rsp] mov r12,QWORD PTR[32+rsp] mov rbx,QWORD PTR[40+rsp] mov rbp,QWORD PTR[48+rsp] lea rsp,QWORD PTR[56+rsp] $L$SEH_epilogue_lshift_mod_384:: mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue mov rsi,QWORD PTR[16+rsp] ifdef __SGX_LVI_HARDENING__ pop rdx lfence jmp rdx ud2 else DB 0F3h,0C3h endif $L$SEH_end_lshift_mod_384:: lshift_mod_384 ENDP ALIGN 32 __lshift_mod_384 PROC PRIVATE DB 243,15,30,250 add r8,r8 adc r9,r9 adc r10,r10 mov r14,r8 adc r11,r11 mov r15,r9 adc r12,r12 mov rax,r10 adc r13,r13 mov rbx,r11 sbb rdx,rdx sub r8,QWORD PTR[rcx] sbb r9,QWORD PTR[8+rcx] mov rbp,r12 sbb r10,QWORD PTR[16+rcx] sbb r11,QWORD PTR[24+rcx] sbb r12,QWORD PTR[32+rcx] mov rsi,r13 sbb r13,QWORD PTR[40+rcx] sbb rdx,0 cmovc r8,r14 cmovc r9,r15 cmovc r10,rax cmovc r11,rbx cmovc r12,rbp cmovc r13,rsi ifdef __SGX_LVI_HARDENING__ pop rdx lfence jmp rdx ud2 else DB 0F3h,0C3h endif __lshift_mod_384 ENDP PUBLIC mul_by_3_mod_384 ALIGN 32 mul_by_3_mod_384 PROC PUBLIC DB 243,15,30,250 mov QWORD PTR[8+rsp],rdi ;WIN64 prologue mov QWORD PTR[16+rsp],rsi mov r11,rsp $L$SEH_begin_mul_by_3_mod_384:: push rbp mov rdi,rcx mov rsi,rdx mov rdx,r8 push rbx push r12 push r13 push r14 push r15 push rsi $L$SEH_body_mul_by_3_mod_384:: ifdef __SGX_LVI_HARDENING__ lfence endif mov r8,QWORD PTR[rsi] mov r9,QWORD PTR[8+rsi] mov r10,QWORD PTR[16+rsi] mov r11,QWORD PTR[24+rsi] mov r12,QWORD PTR[32+rsi] mov r13,QWORD PTR[40+rsi] mov rcx,rdx call __lshift_mod_384 mov rdx,QWORD PTR[rsp] ifdef __SGX_LVI_HARDENING__ lfence endif call __add_mod_384_a_is_loaded mov r15,QWORD PTR[8+rsp] mov r14,QWORD PTR[16+rsp] mov r13,QWORD PTR[24+rsp] mov r12,QWORD PTR[32+rsp] mov rbx,QWORD PTR[40+rsp] mov rbp,QWORD PTR[48+rsp] lea rsp,QWORD PTR[56+rsp] $L$SEH_epilogue_mul_by_3_mod_384:: mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue mov rsi,QWORD PTR[16+rsp] ifdef __SGX_LVI_HARDENING__ pop rdx lfence jmp rdx ud2 else DB 0F3h,0C3h endif $L$SEH_end_mul_by_3_mod_384:: mul_by_3_mod_384 ENDP PUBLIC mul_by_8_mod_384 ALIGN 32 mul_by_8_mod_384 PROC PUBLIC DB 243,15,30,250 mov QWORD PTR[8+rsp],rdi ;WIN64 prologue mov QWORD PTR[16+rsp],rsi mov r11,rsp $L$SEH_begin_mul_by_8_mod_384:: push rbp mov rdi,rcx mov rsi,rdx mov rdx,r8 push rbx push r12 push r13 push r14 push r15 sub rsp,8 $L$SEH_body_mul_by_8_mod_384:: ifdef __SGX_LVI_HARDENING__ lfence endif mov r8,QWORD PTR[rsi] mov r9,QWORD PTR[8+rsi] mov r10,QWORD PTR[16+rsi] mov r11,QWORD PTR[24+rsi] mov r12,QWORD PTR[32+rsi] mov r13,QWORD PTR[40+rsi] mov rcx,rdx call 
__lshift_mod_384 call __lshift_mod_384 call __lshift_mod_384 mov QWORD PTR[rdi],r8 mov QWORD PTR[8+rdi],r9 mov QWORD PTR[16+rdi],r10 mov QWORD PTR[24+rdi],r11 mov QWORD PTR[32+rdi],r12 mov QWORD PTR[40+rdi],r13 mov r15,QWORD PTR[8+rsp] mov r14,QWORD PTR[16+rsp] mov r13,QWORD PTR[24+rsp] mov r12,QWORD PTR[32+rsp] mov rbx,QWORD PTR[40+rsp] mov rbp,QWORD PTR[48+rsp] lea rsp,QWORD PTR[56+rsp] $L$SEH_epilogue_mul_by_8_mod_384:: mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue mov rsi,QWORD PTR[16+rsp] ifdef __SGX_LVI_HARDENING__ pop rdx lfence jmp rdx ud2 else DB 0F3h,0C3h endif $L$SEH_end_mul_by_8_mod_384:: mul_by_8_mod_384 ENDP PUBLIC mul_by_3_mod_384x ALIGN 32 mul_by_3_mod_384x PROC PUBLIC DB 243,15,30,250 mov QWORD PTR[8+rsp],rdi ;WIN64 prologue mov QWORD PTR[16+rsp],rsi mov r11,rsp $L$SEH_begin_mul_by_3_mod_384x:: push rbp mov rdi,rcx mov rsi,rdx mov rdx,r8 push rbx push r12 push r13 push r14 push r15 push rsi $L$SEH_body_mul_by_3_mod_384x:: ifdef __SGX_LVI_HARDENING__ lfence endif mov r8,QWORD PTR[rsi] mov r9,QWORD PTR[8+rsi] mov r10,QWORD PTR[16+rsi] mov r11,QWORD PTR[24+rsi] mov r12,QWORD PTR[32+rsi] mov r13,QWORD PTR[40+rsi] mov rcx,rdx call __lshift_mod_384 mov rdx,QWORD PTR[rsp] ifdef __SGX_LVI_HARDENING__ lfence endif call __add_mod_384_a_is_loaded mov rsi,QWORD PTR[rsp] lea rdi,QWORD PTR[48+rdi] ifdef __SGX_LVI_HARDENING__ lfence endif mov r8,QWORD PTR[48+rsi] mov r9,QWORD PTR[56+rsi] mov r10,QWORD PTR[64+rsi] mov r11,QWORD PTR[72+rsi] mov r12,QWORD PTR[80+rsi] mov r13,QWORD PTR[88+rsi] call __lshift_mod_384 mov rdx,8*6 add rdx,QWORD PTR[rsp] ifdef __SGX_LVI_HARDENING__ lfence endif call __add_mod_384_a_is_loaded mov r15,QWORD PTR[8+rsp] mov r14,QWORD PTR[16+rsp] mov r13,QWORD PTR[24+rsp] mov r12,QWORD PTR[32+rsp] mov rbx,QWORD PTR[40+rsp] mov rbp,QWORD PTR[48+rsp] lea rsp,QWORD PTR[56+rsp] $L$SEH_epilogue_mul_by_3_mod_384x:: mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue mov rsi,QWORD PTR[16+rsp] ifdef __SGX_LVI_HARDENING__ pop rdx lfence jmp rdx ud2 else DB 0F3h,0C3h endif $L$SEH_end_mul_by_3_mod_384x:: mul_by_3_mod_384x ENDP PUBLIC mul_by_8_mod_384x ALIGN 32 mul_by_8_mod_384x PROC PUBLIC DB 243,15,30,250 mov QWORD PTR[8+rsp],rdi ;WIN64 prologue mov QWORD PTR[16+rsp],rsi mov r11,rsp $L$SEH_begin_mul_by_8_mod_384x:: push rbp mov rdi,rcx mov rsi,rdx mov rdx,r8 push rbx push r12 push r13 push r14 push r15 push rsi $L$SEH_body_mul_by_8_mod_384x:: ifdef __SGX_LVI_HARDENING__ lfence endif mov r8,QWORD PTR[rsi] mov r9,QWORD PTR[8+rsi] mov r10,QWORD PTR[16+rsi] mov r11,QWORD PTR[24+rsi] mov r12,QWORD PTR[32+rsi] mov r13,QWORD PTR[40+rsi] mov rcx,rdx call __lshift_mod_384 call __lshift_mod_384 call __lshift_mod_384 mov rsi,QWORD PTR[rsp] mov QWORD PTR[rdi],r8 mov QWORD PTR[8+rdi],r9 mov QWORD PTR[16+rdi],r10 mov QWORD PTR[24+rdi],r11 mov QWORD PTR[32+rdi],r12 mov QWORD PTR[40+rdi],r13 ifdef __SGX_LVI_HARDENING__ lfence endif mov r8,QWORD PTR[((48+0))+rsi] mov r9,QWORD PTR[((48+8))+rsi] mov r10,QWORD PTR[((48+16))+rsi] mov r11,QWORD PTR[((48+24))+rsi] mov r12,QWORD PTR[((48+32))+rsi] mov r13,QWORD PTR[((48+40))+rsi] call __lshift_mod_384 call __lshift_mod_384 call __lshift_mod_384 mov QWORD PTR[((48+0))+rdi],r8 mov QWORD PTR[((48+8))+rdi],r9 mov QWORD PTR[((48+16))+rdi],r10 mov QWORD PTR[((48+24))+rdi],r11 mov QWORD PTR[((48+32))+rdi],r12 mov QWORD PTR[((48+40))+rdi],r13 mov r15,QWORD PTR[8+rsp] mov r14,QWORD PTR[16+rsp] mov r13,QWORD PTR[24+rsp] mov r12,QWORD PTR[32+rsp] mov rbx,QWORD PTR[40+rsp] mov rbp,QWORD PTR[48+rsp] lea rsp,QWORD PTR[56+rsp] $L$SEH_epilogue_mul_by_8_mod_384x:: mov rdi,QWORD 
PTR[8+rsp] ;WIN64 epilogue mov rsi,QWORD PTR[16+rsp] ifdef __SGX_LVI_HARDENING__ pop rdx lfence jmp rdx ud2 else DB 0F3h,0C3h endif $L$SEH_end_mul_by_8_mod_384x:: mul_by_8_mod_384x ENDP PUBLIC cneg_mod_384 ALIGN 32 cneg_mod_384 PROC PUBLIC DB 243,15,30,250 mov QWORD PTR[8+rsp],rdi ;WIN64 prologue mov QWORD PTR[16+rsp],rsi mov r11,rsp $L$SEH_begin_cneg_mod_384:: push rbp mov rdi,rcx mov rsi,rdx mov rdx,r8 mov rcx,r9 push rbx push r12 push r13 push r14 push r15 push rdx $L$SEH_body_cneg_mod_384:: ifdef __SGX_LVI_HARDENING__ lfence endif mov rdx,QWORD PTR[rsi] mov r9,QWORD PTR[8+rsi] mov r10,QWORD PTR[16+rsi] mov r8,rdx mov r11,QWORD PTR[24+rsi] or rdx,r9 mov r12,QWORD PTR[32+rsi] or rdx,r10 mov r13,QWORD PTR[40+rsi] or rdx,r11 mov rsi,-1 or rdx,r12 or rdx,r13 mov r14,QWORD PTR[rcx] cmovnz rdx,rsi mov r15,QWORD PTR[8+rcx] mov rax,QWORD PTR[16+rcx] and r14,rdx mov rbx,QWORD PTR[24+rcx] and r15,rdx mov rbp,QWORD PTR[32+rcx] and rax,rdx mov rsi,QWORD PTR[40+rcx] and rbx,rdx mov rcx,QWORD PTR[rsp] and rbp,rdx and rsi,rdx sub r14,r8 sbb r15,r9 sbb rax,r10 sbb rbx,r11 sbb rbp,r12 sbb rsi,r13 or rcx,rcx cmovz r14,r8 cmovz r15,r9 cmovz rax,r10 mov QWORD PTR[rdi],r14 cmovz rbx,r11 mov QWORD PTR[8+rdi],r15 cmovz rbp,r12 mov QWORD PTR[16+rdi],rax cmovz rsi,r13 mov QWORD PTR[24+rdi],rbx mov QWORD PTR[32+rdi],rbp mov QWORD PTR[40+rdi],rsi mov r15,QWORD PTR[8+rsp] mov r14,QWORD PTR[16+rsp] mov r13,QWORD PTR[24+rsp] mov r12,QWORD PTR[32+rsp] mov rbx,QWORD PTR[40+rsp] mov rbp,QWORD PTR[48+rsp] lea rsp,QWORD PTR[56+rsp] $L$SEH_epilogue_cneg_mod_384:: mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue mov rsi,QWORD PTR[16+rsp] ifdef __SGX_LVI_HARDENING__ pop rdx lfence jmp rdx ud2 else DB 0F3h,0C3h endif $L$SEH_end_cneg_mod_384:: cneg_mod_384 ENDP PUBLIC sub_mod_384 ALIGN 32 sub_mod_384 PROC PUBLIC DB 243,15,30,250 mov QWORD PTR[8+rsp],rdi ;WIN64 prologue mov QWORD PTR[16+rsp],rsi mov r11,rsp $L$SEH_begin_sub_mod_384:: push rbp mov rdi,rcx mov rsi,rdx mov rdx,r8 mov rcx,r9 push rbx push r12 push r13 push r14 push r15 sub rsp,8 $L$SEH_body_sub_mod_384:: call __sub_mod_384 mov r15,QWORD PTR[8+rsp] mov r14,QWORD PTR[16+rsp] mov r13,QWORD PTR[24+rsp] mov r12,QWORD PTR[32+rsp] mov rbx,QWORD PTR[40+rsp] mov rbp,QWORD PTR[48+rsp] lea rsp,QWORD PTR[56+rsp] $L$SEH_epilogue_sub_mod_384:: mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue mov rsi,QWORD PTR[16+rsp] ifdef __SGX_LVI_HARDENING__ pop rdx lfence jmp rdx ud2 else DB 0F3h,0C3h endif $L$SEH_end_sub_mod_384:: sub_mod_384 ENDP ALIGN 32 __sub_mod_384 PROC PRIVATE DB 243,15,30,250 ifdef __SGX_LVI_HARDENING__ lfence endif mov r8,QWORD PTR[rsi] mov r9,QWORD PTR[8+rsi] mov r10,QWORD PTR[16+rsi] mov r11,QWORD PTR[24+rsi] mov r12,QWORD PTR[32+rsi] mov r13,QWORD PTR[40+rsi] sub r8,QWORD PTR[rdx] mov r14,QWORD PTR[rcx] sbb r9,QWORD PTR[8+rdx] mov r15,QWORD PTR[8+rcx] sbb r10,QWORD PTR[16+rdx] mov rax,QWORD PTR[16+rcx] sbb r11,QWORD PTR[24+rdx] mov rbx,QWORD PTR[24+rcx] sbb r12,QWORD PTR[32+rdx] mov rbp,QWORD PTR[32+rcx] sbb r13,QWORD PTR[40+rdx] mov rsi,QWORD PTR[40+rcx] sbb rdx,rdx and r14,rdx and r15,rdx and rax,rdx and rbx,rdx and rbp,rdx and rsi,rdx add r8,r14 adc r9,r15 mov QWORD PTR[rdi],r8 adc r10,rax mov QWORD PTR[8+rdi],r9 adc r11,rbx mov QWORD PTR[16+rdi],r10 adc r12,rbp mov QWORD PTR[24+rdi],r11 adc r13,rsi mov QWORD PTR[32+rdi],r12 mov QWORD PTR[40+rdi],r13 ifdef __SGX_LVI_HARDENING__ pop rdx lfence jmp rdx ud2 else DB 0F3h,0C3h endif __sub_mod_384 ENDP PUBLIC sub_mod_384x ALIGN 32 sub_mod_384x PROC PUBLIC DB 243,15,30,250 mov QWORD PTR[8+rsp],rdi ;WIN64 prologue mov QWORD 
PTR[16+rsp],rsi mov r11,rsp $L$SEH_begin_sub_mod_384x:: push rbp mov rdi,rcx mov rsi,rdx mov rdx,r8 mov rcx,r9 push rbx push r12 push r13 push r14 push r15 sub rsp,24 $L$SEH_body_sub_mod_384x:: mov QWORD PTR[rsp],rsi mov QWORD PTR[8+rsp],rdx lea rsi,QWORD PTR[48+rsi] lea rdx,QWORD PTR[48+rdx] lea rdi,QWORD PTR[48+rdi] call __sub_mod_384 mov rsi,QWORD PTR[rsp] mov rdx,QWORD PTR[8+rsp] lea rdi,QWORD PTR[((-48))+rdi] call __sub_mod_384 mov r15,QWORD PTR[((24+0))+rsp] mov r14,QWORD PTR[((24+8))+rsp] mov r13,QWORD PTR[((24+16))+rsp] mov r12,QWORD PTR[((24+24))+rsp] mov rbx,QWORD PTR[((24+32))+rsp] mov rbp,QWORD PTR[((24+40))+rsp] lea rsp,QWORD PTR[((24+48))+rsp] $L$SEH_epilogue_sub_mod_384x:: mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue mov rsi,QWORD PTR[16+rsp] ifdef __SGX_LVI_HARDENING__ pop rdx lfence jmp rdx ud2 else DB 0F3h,0C3h endif $L$SEH_end_sub_mod_384x:: sub_mod_384x ENDP PUBLIC mul_by_1_plus_i_mod_384x ALIGN 32 mul_by_1_plus_i_mod_384x PROC PUBLIC DB 243,15,30,250 mov QWORD PTR[8+rsp],rdi ;WIN64 prologue mov QWORD PTR[16+rsp],rsi mov r11,rsp $L$SEH_begin_mul_by_1_plus_i_mod_384x:: push rbp mov rdi,rcx mov rsi,rdx mov rdx,r8 push rbx push r12 push r13 push r14 push r15 sub rsp,56 $L$SEH_body_mul_by_1_plus_i_mod_384x:: ifdef __SGX_LVI_HARDENING__ lfence endif mov r8,QWORD PTR[rsi] mov r9,QWORD PTR[8+rsi] mov r10,QWORD PTR[16+rsi] mov r11,QWORD PTR[24+rsi] mov r12,QWORD PTR[32+rsi] mov r13,QWORD PTR[40+rsi] mov r14,r8 add r8,QWORD PTR[48+rsi] mov r15,r9 adc r9,QWORD PTR[56+rsi] mov rax,r10 adc r10,QWORD PTR[64+rsi] mov rbx,r11 adc r11,QWORD PTR[72+rsi] mov rcx,r12 adc r12,QWORD PTR[80+rsi] mov rbp,r13 adc r13,QWORD PTR[88+rsi] mov QWORD PTR[48+rsp],rdi sbb rdi,rdi sub r14,QWORD PTR[48+rsi] sbb r15,QWORD PTR[56+rsi] sbb rax,QWORD PTR[64+rsi] sbb rbx,QWORD PTR[72+rsi] sbb rcx,QWORD PTR[80+rsi] sbb rbp,QWORD PTR[88+rsi] sbb rsi,rsi mov QWORD PTR[rsp],r8 mov r8,QWORD PTR[rdx] mov QWORD PTR[8+rsp],r9 mov r9,QWORD PTR[8+rdx] mov QWORD PTR[16+rsp],r10 mov r10,QWORD PTR[16+rdx] mov QWORD PTR[24+rsp],r11 mov r11,QWORD PTR[24+rdx] mov QWORD PTR[32+rsp],r12 and r8,rsi mov r12,QWORD PTR[32+rdx] mov QWORD PTR[40+rsp],r13 and r9,rsi mov r13,QWORD PTR[40+rdx] and r10,rsi and r11,rsi and r12,rsi and r13,rsi mov rsi,QWORD PTR[48+rsp] add r14,r8 mov r8,QWORD PTR[rsp] adc r15,r9 mov r9,QWORD PTR[8+rsp] adc rax,r10 mov r10,QWORD PTR[16+rsp] adc rbx,r11 mov r11,QWORD PTR[24+rsp] adc rcx,r12 mov r12,QWORD PTR[32+rsp] adc rbp,r13 mov r13,QWORD PTR[40+rsp] mov QWORD PTR[rsi],r14 mov r14,r8 mov QWORD PTR[8+rsi],r15 mov QWORD PTR[16+rsi],rax mov r15,r9 mov QWORD PTR[24+rsi],rbx mov QWORD PTR[32+rsi],rcx mov rax,r10 mov QWORD PTR[40+rsi],rbp sub r8,QWORD PTR[rdx] mov rbx,r11 sbb r9,QWORD PTR[8+rdx] sbb r10,QWORD PTR[16+rdx] mov rcx,r12 sbb r11,QWORD PTR[24+rdx] sbb r12,QWORD PTR[32+rdx] mov rbp,r13 sbb r13,QWORD PTR[40+rdx] sbb rdi,0 cmovc r8,r14 cmovc r9,r15 cmovc r10,rax mov QWORD PTR[48+rsi],r8 cmovc r11,rbx mov QWORD PTR[56+rsi],r9 cmovc r12,rcx mov QWORD PTR[64+rsi],r10 cmovc r13,rbp mov QWORD PTR[72+rsi],r11 mov QWORD PTR[80+rsi],r12 mov QWORD PTR[88+rsi],r13 mov r15,QWORD PTR[((56+0))+rsp] mov r14,QWORD PTR[((56+8))+rsp] mov r13,QWORD PTR[((56+16))+rsp] mov r12,QWORD PTR[((56+24))+rsp] mov rbx,QWORD PTR[((56+32))+rsp] mov rbp,QWORD PTR[((56+40))+rsp] lea rsp,QWORD PTR[((56+48))+rsp] $L$SEH_epilogue_mul_by_1_plus_i_mod_384x:: mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue mov rsi,QWORD PTR[16+rsp] ifdef __SGX_LVI_HARDENING__ pop rdx lfence jmp rdx ud2 else DB 0F3h,0C3h endif $L$SEH_end_mul_by_1_plus_i_mod_384x:: 
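; ---------------------------------------------------------------------------
; Illustrative C reference (not part of the generated file): the routine
; above exploits the Fp2 identity (a + b*i)*(1 + i) = (a - b) + (a + b)*i,
; so multiplying by 1+i costs one modular subtraction and one modular
; addition over the two 48-byte halves. With the hypothetical helpers
; sketched earlier (x[0] = real part, x[1] = imaginary part; the
; temporaries make ret == x aliasing safe):
;
;   void mul_by_1_plus_i_mod_384x_ref(limb_t ret[2][6],
;                                     const limb_t x[2][6],
;                                     const limb_t p[6])
;   {
;       limb_t re[6], im[6];
;       sub_mod_384_ref(re, x[0], x[1], p);   /* a - b mod p */
;       add_mod_384_ref(im, x[0], x[1], p);   /* a + b mod p */
;       for (int i = 0; i < 6; i++) {
;           ret[0][i] = re[i];
;           ret[1][i] = im[i];
;       }
;   }
; ---------------------------------------------------------------------------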
mul_by_1_plus_i_mod_384x ENDP PUBLIC sgn0_pty_mod_384 ALIGN 32 sgn0_pty_mod_384 PROC PUBLIC DB 243,15,30,250 mov QWORD PTR[8+rsp],rdi ;WIN64 prologue mov QWORD PTR[16+rsp],rsi mov r11,rsp $L$SEH_begin_sgn0_pty_mod_384:: mov rdi,rcx mov rsi,rdx $L$SEH_body_sgn0_pty_mod_384:: ifdef __SGX_LVI_HARDENING__ lfence endif mov r8,QWORD PTR[rdi] mov r9,QWORD PTR[8+rdi] mov r10,QWORD PTR[16+rdi] mov r11,QWORD PTR[24+rdi] mov rcx,QWORD PTR[32+rdi] mov rdx,QWORD PTR[40+rdi] xor rax,rax mov rdi,r8 add r8,r8 adc r9,r9 adc r10,r10 adc r11,r11 adc rcx,rcx adc rdx,rdx adc rax,0 sub r8,QWORD PTR[rsi] sbb r9,QWORD PTR[8+rsi] sbb r10,QWORD PTR[16+rsi] sbb r11,QWORD PTR[24+rsi] sbb rcx,QWORD PTR[32+rsi] sbb rdx,QWORD PTR[40+rsi] sbb rax,0 not rax and rdi,1 and rax,2 or rax,rdi $L$SEH_epilogue_sgn0_pty_mod_384:: mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue mov rsi,QWORD PTR[16+rsp] ifdef __SGX_LVI_HARDENING__ pop rdx lfence jmp rdx ud2 else DB 0F3h,0C3h endif $L$SEH_end_sgn0_pty_mod_384:: sgn0_pty_mod_384 ENDP PUBLIC sgn0_pty_mod_384x ALIGN 32 sgn0_pty_mod_384x PROC PUBLIC DB 243,15,30,250 mov QWORD PTR[8+rsp],rdi ;WIN64 prologue mov QWORD PTR[16+rsp],rsi mov r11,rsp $L$SEH_begin_sgn0_pty_mod_384x:: push rbp mov rdi,rcx mov rsi,rdx push rbx sub rsp,8 $L$SEH_body_sgn0_pty_mod_384x:: ifdef __SGX_LVI_HARDENING__ lfence endif mov r8,QWORD PTR[48+rdi] mov r9,QWORD PTR[56+rdi] mov r10,QWORD PTR[64+rdi] mov r11,QWORD PTR[72+rdi] mov rcx,QWORD PTR[80+rdi] mov rdx,QWORD PTR[88+rdi] mov rbx,r8 or r8,r9 or r8,r10 or r8,r11 or r8,rcx or r8,rdx lea rax,QWORD PTR[rdi] xor rdi,rdi mov rbp,rbx add rbx,rbx adc r9,r9 adc r10,r10 adc r11,r11 adc rcx,rcx adc rdx,rdx adc rdi,0 sub rbx,QWORD PTR[rsi] sbb r9,QWORD PTR[8+rsi] sbb r10,QWORD PTR[16+rsi] sbb r11,QWORD PTR[24+rsi] sbb rcx,QWORD PTR[32+rsi] sbb rdx,QWORD PTR[40+rsi] sbb rdi,0 mov QWORD PTR[rsp],r8 not rdi and rbp,1 and rdi,2 or rdi,rbp mov r8,QWORD PTR[rax] mov r9,QWORD PTR[8+rax] mov r10,QWORD PTR[16+rax] mov r11,QWORD PTR[24+rax] mov rcx,QWORD PTR[32+rax] mov rdx,QWORD PTR[40+rax] mov rbx,r8 or r8,r9 or r8,r10 or r8,r11 or r8,rcx or r8,rdx xor rax,rax mov rbp,rbx add rbx,rbx adc r9,r9 adc r10,r10 adc r11,r11 adc rcx,rcx adc rdx,rdx adc rax,0 sub rbx,QWORD PTR[rsi] sbb r9,QWORD PTR[8+rsi] sbb r10,QWORD PTR[16+rsi] sbb r11,QWORD PTR[24+rsi] sbb rcx,QWORD PTR[32+rsi] sbb rdx,QWORD PTR[40+rsi] sbb rax,0 mov rbx,QWORD PTR[rsp] not rax test r8,r8 cmovz rbp,rdi test rbx,rbx cmovnz rax,rdi and rbp,1 and rax,2 or rax,rbp mov rbx,QWORD PTR[8+rsp] mov rbp,QWORD PTR[16+rsp] lea rsp,QWORD PTR[24+rsp] $L$SEH_epilogue_sgn0_pty_mod_384x:: mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue mov rsi,QWORD PTR[16+rsp] ifdef __SGX_LVI_HARDENING__ pop rdx lfence jmp rdx ud2 else DB 0F3h,0C3h endif $L$SEH_end_sgn0_pty_mod_384x:: sgn0_pty_mod_384x ENDP PUBLIC vec_select_32 ALIGN 32 vec_select_32 PROC PUBLIC DB 243,15,30,250 movd xmm5,r9d pxor xmm4,xmm4 pshufd xmm5,xmm5,0 ifdef __SGX_LVI_HARDENING__ lfence endif movdqu xmm0,XMMWORD PTR[rdx] lea rdx,QWORD PTR[16+rdx] pcmpeqd xmm5,xmm4 movdqu xmm1,XMMWORD PTR[r8] lea r8,QWORD PTR[16+r8] pcmpeqd xmm4,xmm5 lea rcx,QWORD PTR[16+rcx] pand xmm0,xmm4 movdqu xmm2,XMMWORD PTR[((0+16-16))+rdx] pand xmm1,xmm5 movdqu xmm3,XMMWORD PTR[((0+16-16))+r8] por xmm0,xmm1 movdqu XMMWORD PTR[(0-16)+rcx],xmm0 pand xmm2,xmm4 pand xmm3,xmm5 por xmm2,xmm3 movdqu XMMWORD PTR[(16-16)+rcx],xmm2 ifdef __SGX_LVI_HARDENING__ pop rdx lfence jmp rdx ud2 else DB 0F3h,0C3h endif vec_select_32 ENDP PUBLIC vec_select_48 ALIGN 32 vec_select_48 PROC PUBLIC DB 243,15,30,250 movd xmm5,r9d pxor 
xmm4,xmm4 pshufd xmm5,xmm5,0 ifdef __SGX_LVI_HARDENING__ lfence endif movdqu xmm0,XMMWORD PTR[rdx] lea rdx,QWORD PTR[24+rdx] pcmpeqd xmm5,xmm4 movdqu xmm1,XMMWORD PTR[r8] lea r8,QWORD PTR[24+r8] pcmpeqd xmm4,xmm5 lea rcx,QWORD PTR[24+rcx] pand xmm0,xmm4 movdqu xmm2,XMMWORD PTR[((0+16-24))+rdx] pand xmm1,xmm5 movdqu xmm3,XMMWORD PTR[((0+16-24))+r8] por xmm0,xmm1 movdqu XMMWORD PTR[(0-24)+rcx],xmm0 pand xmm2,xmm4 movdqu xmm0,XMMWORD PTR[((16+16-24))+rdx] pand xmm3,xmm5 movdqu xmm1,XMMWORD PTR[((16+16-24))+r8] por xmm2,xmm3 movdqu XMMWORD PTR[(16-24)+rcx],xmm2 pand xmm0,xmm4 pand xmm1,xmm5 por xmm0,xmm1 movdqu XMMWORD PTR[(32-24)+rcx],xmm0 ifdef __SGX_LVI_HARDENING__ pop rdx lfence jmp rdx ud2 else DB 0F3h,0C3h endif vec_select_48 ENDP PUBLIC vec_select_96 ALIGN 32 vec_select_96 PROC PUBLIC DB 243,15,30,250 movd xmm5,r9d pxor xmm4,xmm4 pshufd xmm5,xmm5,0 ifdef __SGX_LVI_HARDENING__ lfence endif movdqu xmm0,XMMWORD PTR[rdx] lea rdx,QWORD PTR[48+rdx] pcmpeqd xmm5,xmm4 movdqu xmm1,XMMWORD PTR[r8] lea r8,QWORD PTR[48+r8] pcmpeqd xmm4,xmm5 lea rcx,QWORD PTR[48+rcx] pand xmm0,xmm4 movdqu xmm2,XMMWORD PTR[((0+16-48))+rdx] pand xmm1,xmm5 movdqu xmm3,XMMWORD PTR[((0+16-48))+r8] por xmm0,xmm1 movdqu XMMWORD PTR[(0-48)+rcx],xmm0 pand xmm2,xmm4 movdqu xmm0,XMMWORD PTR[((16+16-48))+rdx] pand xmm3,xmm5 movdqu xmm1,XMMWORD PTR[((16+16-48))+r8] por xmm2,xmm3 movdqu XMMWORD PTR[(16-48)+rcx],xmm2 pand xmm0,xmm4 movdqu xmm2,XMMWORD PTR[((32+16-48))+rdx] pand xmm1,xmm5 movdqu xmm3,XMMWORD PTR[((32+16-48))+r8] por xmm0,xmm1 movdqu XMMWORD PTR[(32-48)+rcx],xmm0 pand xmm2,xmm4 movdqu xmm0,XMMWORD PTR[((48+16-48))+rdx] pand xmm3,xmm5 movdqu xmm1,XMMWORD PTR[((48+16-48))+r8] por xmm2,xmm3 movdqu XMMWORD PTR[(48-48)+rcx],xmm2 pand xmm0,xmm4 movdqu xmm2,XMMWORD PTR[((64+16-48))+rdx] pand xmm1,xmm5 movdqu xmm3,XMMWORD PTR[((64+16-48))+r8] por xmm0,xmm1 movdqu XMMWORD PTR[(64-48)+rcx],xmm0 pand xmm2,xmm4 pand xmm3,xmm5 por xmm2,xmm3 movdqu XMMWORD PTR[(80-48)+rcx],xmm2 ifdef __SGX_LVI_HARDENING__ pop rdx lfence jmp rdx ud2 else DB 0F3h,0C3h endif vec_select_96 ENDP PUBLIC vec_select_192 ALIGN 32 vec_select_192 PROC PUBLIC DB 243,15,30,250 movd xmm5,r9d pxor xmm4,xmm4 pshufd xmm5,xmm5,0 ifdef __SGX_LVI_HARDENING__ lfence endif movdqu xmm0,XMMWORD PTR[rdx] lea rdx,QWORD PTR[96+rdx] pcmpeqd xmm5,xmm4 movdqu xmm1,XMMWORD PTR[r8] lea r8,QWORD PTR[96+r8] pcmpeqd xmm4,xmm5 lea rcx,QWORD PTR[96+rcx] pand xmm0,xmm4 movdqu xmm2,XMMWORD PTR[((0+16-96))+rdx] pand xmm1,xmm5 movdqu xmm3,XMMWORD PTR[((0+16-96))+r8] por xmm0,xmm1 movdqu XMMWORD PTR[(0-96)+rcx],xmm0 pand xmm2,xmm4 movdqu xmm0,XMMWORD PTR[((16+16-96))+rdx] pand xmm3,xmm5 movdqu xmm1,XMMWORD PTR[((16+16-96))+r8] por xmm2,xmm3 movdqu XMMWORD PTR[(16-96)+rcx],xmm2 pand xmm0,xmm4 movdqu xmm2,XMMWORD PTR[((32+16-96))+rdx] pand xmm1,xmm5 movdqu xmm3,XMMWORD PTR[((32+16-96))+r8] por xmm0,xmm1 movdqu XMMWORD PTR[(32-96)+rcx],xmm0 pand xmm2,xmm4 movdqu xmm0,XMMWORD PTR[((48+16-96))+rdx] pand xmm3,xmm5 movdqu xmm1,XMMWORD PTR[((48+16-96))+r8] por xmm2,xmm3 movdqu XMMWORD PTR[(48-96)+rcx],xmm2 pand xmm0,xmm4 movdqu xmm2,XMMWORD PTR[((64+16-96))+rdx] pand xmm1,xmm5 movdqu xmm3,XMMWORD PTR[((64+16-96))+r8] por xmm0,xmm1 movdqu XMMWORD PTR[(64-96)+rcx],xmm0 pand xmm2,xmm4 movdqu xmm0,XMMWORD PTR[((80+16-96))+rdx] pand xmm3,xmm5 movdqu xmm1,XMMWORD PTR[((80+16-96))+r8] por xmm2,xmm3 movdqu XMMWORD PTR[(80-96)+rcx],xmm2 pand xmm0,xmm4 movdqu xmm2,XMMWORD PTR[((96+16-96))+rdx] pand xmm1,xmm5 movdqu xmm3,XMMWORD PTR[((96+16-96))+r8] por xmm0,xmm1 movdqu XMMWORD PTR[(96-96)+rcx],xmm0 pand 
xmm2,xmm4 movdqu xmm0,XMMWORD PTR[((112+16-96))+rdx] pand xmm3,xmm5 movdqu xmm1,XMMWORD PTR[((112+16-96))+r8] por xmm2,xmm3 movdqu XMMWORD PTR[(112-96)+rcx],xmm2 pand xmm0,xmm4 movdqu xmm2,XMMWORD PTR[((128+16-96))+rdx] pand xmm1,xmm5 movdqu xmm3,XMMWORD PTR[((128+16-96))+r8] por xmm0,xmm1 movdqu XMMWORD PTR[(128-96)+rcx],xmm0 pand xmm2,xmm4 movdqu xmm0,XMMWORD PTR[((144+16-96))+rdx] pand xmm3,xmm5 movdqu xmm1,XMMWORD PTR[((144+16-96))+r8] por xmm2,xmm3 movdqu XMMWORD PTR[(144-96)+rcx],xmm2 pand xmm0,xmm4 movdqu xmm2,XMMWORD PTR[((160+16-96))+rdx] pand xmm1,xmm5 movdqu xmm3,XMMWORD PTR[((160+16-96))+r8] por xmm0,xmm1 movdqu XMMWORD PTR[(160-96)+rcx],xmm0 pand xmm2,xmm4 pand xmm3,xmm5 por xmm2,xmm3 movdqu XMMWORD PTR[(176-96)+rcx],xmm2 ifdef __SGX_LVI_HARDENING__ pop rdx lfence jmp rdx ud2 else DB 0F3h,0C3h endif vec_select_192 ENDP PUBLIC vec_select_144 ALIGN 32 vec_select_144 PROC PUBLIC DB 243,15,30,250 movd xmm5,r9d pxor xmm4,xmm4 pshufd xmm5,xmm5,0 ifdef __SGX_LVI_HARDENING__ lfence endif movdqu xmm0,XMMWORD PTR[rdx] lea rdx,QWORD PTR[72+rdx] pcmpeqd xmm5,xmm4 movdqu xmm1,XMMWORD PTR[r8] lea r8,QWORD PTR[72+r8] pcmpeqd xmm4,xmm5 lea rcx,QWORD PTR[72+rcx] pand xmm0,xmm4 movdqu xmm2,XMMWORD PTR[((0+16-72))+rdx] pand xmm1,xmm5 movdqu xmm3,XMMWORD PTR[((0+16-72))+r8] por xmm0,xmm1 movdqu XMMWORD PTR[(0-72)+rcx],xmm0 pand xmm2,xmm4 movdqu xmm0,XMMWORD PTR[((16+16-72))+rdx] pand xmm3,xmm5 movdqu xmm1,XMMWORD PTR[((16+16-72))+r8] por xmm2,xmm3 movdqu XMMWORD PTR[(16-72)+rcx],xmm2 pand xmm0,xmm4 movdqu xmm2,XMMWORD PTR[((32+16-72))+rdx] pand xmm1,xmm5 movdqu xmm3,XMMWORD PTR[((32+16-72))+r8] por xmm0,xmm1 movdqu XMMWORD PTR[(32-72)+rcx],xmm0 pand xmm2,xmm4 movdqu xmm0,XMMWORD PTR[((48+16-72))+rdx] pand xmm3,xmm5 movdqu xmm1,XMMWORD PTR[((48+16-72))+r8] por xmm2,xmm3 movdqu XMMWORD PTR[(48-72)+rcx],xmm2 pand xmm0,xmm4 movdqu xmm2,XMMWORD PTR[((64+16-72))+rdx] pand xmm1,xmm5 movdqu xmm3,XMMWORD PTR[((64+16-72))+r8] por xmm0,xmm1 movdqu XMMWORD PTR[(64-72)+rcx],xmm0 pand xmm2,xmm4 movdqu xmm0,XMMWORD PTR[((80+16-72))+rdx] pand xmm3,xmm5 movdqu xmm1,XMMWORD PTR[((80+16-72))+r8] por xmm2,xmm3 movdqu XMMWORD PTR[(80-72)+rcx],xmm2 pand xmm0,xmm4 movdqu xmm2,XMMWORD PTR[((96+16-72))+rdx] pand xmm1,xmm5 movdqu xmm3,XMMWORD PTR[((96+16-72))+r8] por xmm0,xmm1 movdqu XMMWORD PTR[(96-72)+rcx],xmm0 pand xmm2,xmm4 movdqu xmm0,XMMWORD PTR[((112+16-72))+rdx] pand xmm3,xmm5 movdqu xmm1,XMMWORD PTR[((112+16-72))+r8] por xmm2,xmm3 movdqu XMMWORD PTR[(112-72)+rcx],xmm2 pand xmm0,xmm4 pand xmm1,xmm5 por xmm0,xmm1 movdqu XMMWORD PTR[(128-72)+rcx],xmm0 ifdef __SGX_LVI_HARDENING__ pop rdx lfence jmp rdx ud2 else DB 0F3h,0C3h endif vec_select_144 ENDP PUBLIC vec_select_288 ALIGN 32 vec_select_288 PROC PUBLIC DB 243,15,30,250 movd xmm5,r9d pxor xmm4,xmm4 pshufd xmm5,xmm5,0 ifdef __SGX_LVI_HARDENING__ lfence endif movdqu xmm0,XMMWORD PTR[rdx] lea rdx,QWORD PTR[144+rdx] pcmpeqd xmm5,xmm4 movdqu xmm1,XMMWORD PTR[r8] lea r8,QWORD PTR[144+r8] pcmpeqd xmm4,xmm5 lea rcx,QWORD PTR[144+rcx] pand xmm0,xmm4 movdqu xmm2,XMMWORD PTR[((0+16-144))+rdx] pand xmm1,xmm5 movdqu xmm3,XMMWORD PTR[((0+16-144))+r8] por xmm0,xmm1 movdqu XMMWORD PTR[(0-144)+rcx],xmm0 pand xmm2,xmm4 movdqu xmm0,XMMWORD PTR[((16+16-144))+rdx] pand xmm3,xmm5 movdqu xmm1,XMMWORD PTR[((16+16-144))+r8] por xmm2,xmm3 movdqu XMMWORD PTR[(16-144)+rcx],xmm2 pand xmm0,xmm4 movdqu xmm2,XMMWORD PTR[((32+16-144))+rdx] pand xmm1,xmm5 movdqu xmm3,XMMWORD PTR[((32+16-144))+r8] por xmm0,xmm1 movdqu XMMWORD PTR[(32-144)+rcx],xmm0 pand xmm2,xmm4 movdqu xmm0,XMMWORD 
PTR[((48+16-144))+rdx] pand xmm3,xmm5 movdqu xmm1,XMMWORD PTR[((48+16-144))+r8] por xmm2,xmm3 movdqu XMMWORD PTR[(48-144)+rcx],xmm2 pand xmm0,xmm4 movdqu xmm2,XMMWORD PTR[((64+16-144))+rdx] pand xmm1,xmm5 movdqu xmm3,XMMWORD PTR[((64+16-144))+r8] por xmm0,xmm1 movdqu XMMWORD PTR[(64-144)+rcx],xmm0 pand xmm2,xmm4 movdqu xmm0,XMMWORD PTR[((80+16-144))+rdx] pand xmm3,xmm5 movdqu xmm1,XMMWORD PTR[((80+16-144))+r8] por xmm2,xmm3 movdqu XMMWORD PTR[(80-144)+rcx],xmm2 pand xmm0,xmm4 movdqu xmm2,XMMWORD PTR[((96+16-144))+rdx] pand xmm1,xmm5 movdqu xmm3,XMMWORD PTR[((96+16-144))+r8] por xmm0,xmm1 movdqu XMMWORD PTR[(96-144)+rcx],xmm0 pand xmm2,xmm4 movdqu xmm0,XMMWORD PTR[((112+16-144))+rdx] pand xmm3,xmm5 movdqu xmm1,XMMWORD PTR[((112+16-144))+r8] por xmm2,xmm3 movdqu XMMWORD PTR[(112-144)+rcx],xmm2 pand xmm0,xmm4 movdqu xmm2,XMMWORD PTR[((128+16-144))+rdx] pand xmm1,xmm5 movdqu xmm3,XMMWORD PTR[((128+16-144))+r8] por xmm0,xmm1 movdqu XMMWORD PTR[(128-144)+rcx],xmm0 pand xmm2,xmm4 movdqu xmm0,XMMWORD PTR[((144+16-144))+rdx] pand xmm3,xmm5 movdqu xmm1,XMMWORD PTR[((144+16-144))+r8] por xmm2,xmm3 movdqu XMMWORD PTR[(144-144)+rcx],xmm2 pand xmm0,xmm4 movdqu xmm2,XMMWORD PTR[((160+16-144))+rdx] pand xmm1,xmm5 movdqu xmm3,XMMWORD PTR[((160+16-144))+r8] por xmm0,xmm1 movdqu XMMWORD PTR[(160-144)+rcx],xmm0 pand xmm2,xmm4 movdqu xmm0,XMMWORD PTR[((176+16-144))+rdx] pand xmm3,xmm5 movdqu xmm1,XMMWORD PTR[((176+16-144))+r8] por xmm2,xmm3 movdqu XMMWORD PTR[(176-144)+rcx],xmm2 pand xmm0,xmm4 movdqu xmm2,XMMWORD PTR[((192+16-144))+rdx] pand xmm1,xmm5 movdqu xmm3,XMMWORD PTR[((192+16-144))+r8] por xmm0,xmm1 movdqu XMMWORD PTR[(192-144)+rcx],xmm0 pand xmm2,xmm4 movdqu xmm0,XMMWORD PTR[((208+16-144))+rdx] pand xmm3,xmm5 movdqu xmm1,XMMWORD PTR[((208+16-144))+r8] por xmm2,xmm3 movdqu XMMWORD PTR[(208-144)+rcx],xmm2 pand xmm0,xmm4 movdqu xmm2,XMMWORD PTR[((224+16-144))+rdx] pand xmm1,xmm5 movdqu xmm3,XMMWORD PTR[((224+16-144))+r8] por xmm0,xmm1 movdqu XMMWORD PTR[(224-144)+rcx],xmm0 pand xmm2,xmm4 movdqu xmm0,XMMWORD PTR[((240+16-144))+rdx] pand xmm3,xmm5 movdqu xmm1,XMMWORD PTR[((240+16-144))+r8] por xmm2,xmm3 movdqu XMMWORD PTR[(240-144)+rcx],xmm2 pand xmm0,xmm4 movdqu xmm2,XMMWORD PTR[((256+16-144))+rdx] pand xmm1,xmm5 movdqu xmm3,XMMWORD PTR[((256+16-144))+r8] por xmm0,xmm1 movdqu XMMWORD PTR[(256-144)+rcx],xmm0 pand xmm2,xmm4 pand xmm3,xmm5 por xmm2,xmm3 movdqu XMMWORD PTR[(272-144)+rcx],xmm2 ifdef __SGX_LVI_HARDENING__ pop rdx lfence jmp rdx ud2 else DB 0F3h,0C3h endif vec_select_288 ENDP PUBLIC vec_prefetch ALIGN 32 vec_prefetch PROC PUBLIC DB 243,15,30,250 lea rdx,QWORD PTR[((-1))+rdx*1+rcx] mov rax,64 xor r8,r8 ifdef __SGX_LVI_HARDENING__ lfence endif prefetchnta [rcx] lea rcx,QWORD PTR[rax*1+rcx] cmp rcx,rdx cmova rcx,rdx cmova rax,r8 prefetchnta [rcx] lea rcx,QWORD PTR[rax*1+rcx] cmp rcx,rdx cmova rcx,rdx cmova rax,r8 prefetchnta [rcx] lea rcx,QWORD PTR[rax*1+rcx] cmp rcx,rdx cmova rcx,rdx cmova rax,r8 prefetchnta [rcx] lea rcx,QWORD PTR[rax*1+rcx] cmp rcx,rdx cmova rcx,rdx cmova rax,r8 prefetchnta [rcx] lea rcx,QWORD PTR[rax*1+rcx] cmp rcx,rdx cmova rcx,rdx cmova rax,r8 prefetchnta [rcx] lea rcx,QWORD PTR[rax*1+rcx] cmp rcx,rdx cmova rcx,rdx prefetchnta [rcx] ifdef __SGX_LVI_HARDENING__ pop rdx lfence jmp rdx ud2 else DB 0F3h,0C3h endif vec_prefetch ENDP PUBLIC vec_is_zero_16x ALIGN 32 vec_is_zero_16x PROC PUBLIC DB 243,15,30,250 shr edx,4 ifdef __SGX_LVI_HARDENING__ lfence endif movdqu xmm0,XMMWORD PTR[rcx] lea rcx,QWORD PTR[16+rcx] $L$oop_is_zero:: dec edx jz $L$oop_is_zero_done movdqu xmm1,XMMWORD 
PTR[rcx] lea rcx,QWORD PTR[16+rcx] por xmm0,xmm1 jmp $L$oop_is_zero $L$oop_is_zero_done:: pshufd xmm1,xmm0,04eh por xmm0,xmm1 DB 102,72,15,126,192 inc edx test rax,rax cmovnz eax,edx xor eax,1 ifdef __SGX_LVI_HARDENING__ pop rdx lfence jmp rdx ud2 else DB 0F3h,0C3h endif vec_is_zero_16x ENDP PUBLIC vec_is_equal_16x ALIGN 32 vec_is_equal_16x PROC PUBLIC DB 243,15,30,250 shr r8d,4 ifdef __SGX_LVI_HARDENING__ lfence endif movdqu xmm0,XMMWORD PTR[rcx] movdqu xmm1,XMMWORD PTR[rdx] sub rdx,rcx lea rcx,QWORD PTR[16+rcx] pxor xmm0,xmm1 $L$oop_is_equal:: dec r8d jz $L$oop_is_equal_done movdqu xmm1,XMMWORD PTR[rcx] movdqu xmm2,XMMWORD PTR[rdx*1+rcx] lea rcx,QWORD PTR[16+rcx] pxor xmm1,xmm2 por xmm0,xmm1 jmp $L$oop_is_equal $L$oop_is_equal_done:: pshufd xmm1,xmm0,04eh por xmm0,xmm1 DB 102,72,15,126,192 inc r8d test rax,rax cmovnz eax,r8d xor eax,1 ifdef __SGX_LVI_HARDENING__ pop rdx lfence jmp rdx ud2 else DB 0F3h,0C3h endif vec_is_equal_16x ENDP .text$ ENDS .pdata SEGMENT READONLY ALIGN(4) ALIGN 4 DD imagerel $L$SEH_begin_add_mod_384 DD imagerel $L$SEH_body_add_mod_384 DD imagerel $L$SEH_info_add_mod_384_prologue DD imagerel $L$SEH_body_add_mod_384 DD imagerel $L$SEH_epilogue_add_mod_384 DD imagerel $L$SEH_info_add_mod_384_body DD imagerel $L$SEH_epilogue_add_mod_384 DD imagerel $L$SEH_end_add_mod_384 DD imagerel $L$SEH_info_add_mod_384_epilogue DD imagerel $L$SEH_begin_add_mod_384x DD imagerel $L$SEH_body_add_mod_384x DD imagerel $L$SEH_info_add_mod_384x_prologue DD imagerel $L$SEH_body_add_mod_384x DD imagerel $L$SEH_epilogue_add_mod_384x DD imagerel $L$SEH_info_add_mod_384x_body DD imagerel $L$SEH_epilogue_add_mod_384x DD imagerel $L$SEH_end_add_mod_384x DD imagerel $L$SEH_info_add_mod_384x_epilogue DD imagerel $L$SEH_begin_rshift_mod_384 DD imagerel $L$SEH_body_rshift_mod_384 DD imagerel $L$SEH_info_rshift_mod_384_prologue DD imagerel $L$SEH_body_rshift_mod_384 DD imagerel $L$SEH_epilogue_rshift_mod_384 DD imagerel $L$SEH_info_rshift_mod_384_body DD imagerel $L$SEH_epilogue_rshift_mod_384 DD imagerel $L$SEH_end_rshift_mod_384 DD imagerel $L$SEH_info_rshift_mod_384_epilogue DD imagerel $L$SEH_begin_div_by_2_mod_384 DD imagerel $L$SEH_body_div_by_2_mod_384 DD imagerel $L$SEH_info_div_by_2_mod_384_prologue DD imagerel $L$SEH_body_div_by_2_mod_384 DD imagerel $L$SEH_epilogue_div_by_2_mod_384 DD imagerel $L$SEH_info_div_by_2_mod_384_body DD imagerel $L$SEH_epilogue_div_by_2_mod_384 DD imagerel $L$SEH_end_div_by_2_mod_384 DD imagerel $L$SEH_info_div_by_2_mod_384_epilogue DD imagerel $L$SEH_begin_lshift_mod_384 DD imagerel $L$SEH_body_lshift_mod_384 DD imagerel $L$SEH_info_lshift_mod_384_prologue DD imagerel $L$SEH_body_lshift_mod_384 DD imagerel $L$SEH_epilogue_lshift_mod_384 DD imagerel $L$SEH_info_lshift_mod_384_body DD imagerel $L$SEH_epilogue_lshift_mod_384 DD imagerel $L$SEH_end_lshift_mod_384 DD imagerel $L$SEH_info_lshift_mod_384_epilogue DD imagerel $L$SEH_begin_mul_by_3_mod_384 DD imagerel $L$SEH_body_mul_by_3_mod_384 DD imagerel $L$SEH_info_mul_by_3_mod_384_prologue DD imagerel $L$SEH_body_mul_by_3_mod_384 DD imagerel $L$SEH_epilogue_mul_by_3_mod_384 DD imagerel $L$SEH_info_mul_by_3_mod_384_body DD imagerel $L$SEH_epilogue_mul_by_3_mod_384 DD imagerel $L$SEH_end_mul_by_3_mod_384 DD imagerel $L$SEH_info_mul_by_3_mod_384_epilogue DD imagerel $L$SEH_begin_mul_by_8_mod_384 DD imagerel $L$SEH_body_mul_by_8_mod_384 DD imagerel $L$SEH_info_mul_by_8_mod_384_prologue DD imagerel $L$SEH_body_mul_by_8_mod_384 DD imagerel $L$SEH_epilogue_mul_by_8_mod_384 DD imagerel $L$SEH_info_mul_by_8_mod_384_body DD 
imagerel $L$SEH_epilogue_mul_by_8_mod_384 DD imagerel $L$SEH_end_mul_by_8_mod_384 DD imagerel $L$SEH_info_mul_by_8_mod_384_epilogue DD imagerel $L$SEH_begin_mul_by_3_mod_384x DD imagerel $L$SEH_body_mul_by_3_mod_384x DD imagerel $L$SEH_info_mul_by_3_mod_384x_prologue DD imagerel $L$SEH_body_mul_by_3_mod_384x DD imagerel $L$SEH_epilogue_mul_by_3_mod_384x DD imagerel $L$SEH_info_mul_by_3_mod_384x_body DD imagerel $L$SEH_epilogue_mul_by_3_mod_384x DD imagerel $L$SEH_end_mul_by_3_mod_384x DD imagerel $L$SEH_info_mul_by_3_mod_384x_epilogue DD imagerel $L$SEH_begin_mul_by_8_mod_384x DD imagerel $L$SEH_body_mul_by_8_mod_384x DD imagerel $L$SEH_info_mul_by_8_mod_384x_prologue DD imagerel $L$SEH_body_mul_by_8_mod_384x DD imagerel $L$SEH_epilogue_mul_by_8_mod_384x DD imagerel $L$SEH_info_mul_by_8_mod_384x_body DD imagerel $L$SEH_epilogue_mul_by_8_mod_384x DD imagerel $L$SEH_end_mul_by_8_mod_384x DD imagerel $L$SEH_info_mul_by_8_mod_384x_epilogue DD imagerel $L$SEH_begin_cneg_mod_384 DD imagerel $L$SEH_body_cneg_mod_384 DD imagerel $L$SEH_info_cneg_mod_384_prologue DD imagerel $L$SEH_body_cneg_mod_384 DD imagerel $L$SEH_epilogue_cneg_mod_384 DD imagerel $L$SEH_info_cneg_mod_384_body DD imagerel $L$SEH_epilogue_cneg_mod_384 DD imagerel $L$SEH_end_cneg_mod_384 DD imagerel $L$SEH_info_cneg_mod_384_epilogue DD imagerel $L$SEH_begin_sub_mod_384 DD imagerel $L$SEH_body_sub_mod_384 DD imagerel $L$SEH_info_sub_mod_384_prologue DD imagerel $L$SEH_body_sub_mod_384 DD imagerel $L$SEH_epilogue_sub_mod_384 DD imagerel $L$SEH_info_sub_mod_384_body DD imagerel $L$SEH_epilogue_sub_mod_384 DD imagerel $L$SEH_end_sub_mod_384 DD imagerel $L$SEH_info_sub_mod_384_epilogue DD imagerel $L$SEH_begin_sub_mod_384x DD imagerel $L$SEH_body_sub_mod_384x DD imagerel $L$SEH_info_sub_mod_384x_prologue DD imagerel $L$SEH_body_sub_mod_384x DD imagerel $L$SEH_epilogue_sub_mod_384x DD imagerel $L$SEH_info_sub_mod_384x_body DD imagerel $L$SEH_epilogue_sub_mod_384x DD imagerel $L$SEH_end_sub_mod_384x DD imagerel $L$SEH_info_sub_mod_384x_epilogue DD imagerel $L$SEH_begin_mul_by_1_plus_i_mod_384x DD imagerel $L$SEH_body_mul_by_1_plus_i_mod_384x DD imagerel $L$SEH_info_mul_by_1_plus_i_mod_384x_prologue DD imagerel $L$SEH_body_mul_by_1_plus_i_mod_384x DD imagerel $L$SEH_epilogue_mul_by_1_plus_i_mod_384x DD imagerel $L$SEH_info_mul_by_1_plus_i_mod_384x_body DD imagerel $L$SEH_epilogue_mul_by_1_plus_i_mod_384x DD imagerel $L$SEH_end_mul_by_1_plus_i_mod_384x DD imagerel $L$SEH_info_mul_by_1_plus_i_mod_384x_epilogue DD imagerel $L$SEH_begin_sgn0_pty_mod_384 DD imagerel $L$SEH_body_sgn0_pty_mod_384 DD imagerel $L$SEH_info_sgn0_pty_mod_384_prologue DD imagerel $L$SEH_body_sgn0_pty_mod_384 DD imagerel $L$SEH_epilogue_sgn0_pty_mod_384 DD imagerel $L$SEH_info_sgn0_pty_mod_384_body DD imagerel $L$SEH_epilogue_sgn0_pty_mod_384 DD imagerel $L$SEH_end_sgn0_pty_mod_384 DD imagerel $L$SEH_info_sgn0_pty_mod_384_epilogue DD imagerel $L$SEH_begin_sgn0_pty_mod_384x DD imagerel $L$SEH_body_sgn0_pty_mod_384x DD imagerel $L$SEH_info_sgn0_pty_mod_384x_prologue DD imagerel $L$SEH_body_sgn0_pty_mod_384x DD imagerel $L$SEH_epilogue_sgn0_pty_mod_384x DD imagerel $L$SEH_info_sgn0_pty_mod_384x_body DD imagerel $L$SEH_epilogue_sgn0_pty_mod_384x DD imagerel $L$SEH_end_sgn0_pty_mod_384x DD imagerel $L$SEH_info_sgn0_pty_mod_384x_epilogue .pdata ENDS .xdata SEGMENT READONLY ALIGN(8) ALIGN 8 $L$SEH_info_add_mod_384_prologue:: DB 1,0,5,00bh DB 0,074h,1,0 DB 0,064h,2,0 DB 0,0b3h DB 0,0 DD 0,0 $L$SEH_info_add_mod_384_body:: DB 1,0,17,0 DB 000h,0f4h,001h,000h DB 
000h,0e4h,002h,000h DB 000h,0d4h,003h,000h DB 000h,0c4h,004h,000h DB 000h,034h,005h,000h DB 000h,054h,006h,000h DB 000h,074h,008h,000h DB 000h,064h,009h,000h DB 000h,062h DB 000h,000h,000h,000h,000h,000h DB 000h,000h,000h,000h $L$SEH_info_add_mod_384_epilogue:: DB 1,0,4,0 DB 000h,074h,001h,000h DB 000h,064h,002h,000h DB 000h,000h,000h,000h $L$SEH_info_add_mod_384x_prologue:: DB 1,0,5,00bh DB 0,074h,1,0 DB 0,064h,2,0 DB 0,0b3h DB 0,0 DD 0,0 $L$SEH_info_add_mod_384x_body:: DB 1,0,17,0 DB 000h,0f4h,003h,000h DB 000h,0e4h,004h,000h DB 000h,0d4h,005h,000h DB 000h,0c4h,006h,000h DB 000h,034h,007h,000h DB 000h,054h,008h,000h DB 000h,074h,00ah,000h DB 000h,064h,00bh,000h DB 000h,082h DB 000h,000h,000h,000h,000h,000h DB 000h,000h,000h,000h $L$SEH_info_add_mod_384x_epilogue:: DB 1,0,4,0 DB 000h,074h,001h,000h DB 000h,064h,002h,000h DB 000h,000h,000h,000h $L$SEH_info_rshift_mod_384_prologue:: DB 1,0,5,00bh DB 0,074h,1,0 DB 0,064h,2,0 DB 0,0b3h DB 0,0 DD 0,0 $L$SEH_info_rshift_mod_384_body:: DB 1,0,17,0 DB 000h,0f4h,001h,000h DB 000h,0e4h,002h,000h DB 000h,0d4h,003h,000h DB 000h,0c4h,004h,000h DB 000h,034h,005h,000h DB 000h,054h,006h,000h DB 000h,074h,008h,000h DB 000h,064h,009h,000h DB 000h,062h DB 000h,000h,000h,000h,000h,000h DB 000h,000h,000h,000h $L$SEH_info_rshift_mod_384_epilogue:: DB 1,0,4,0 DB 000h,074h,001h,000h DB 000h,064h,002h,000h DB 000h,000h,000h,000h $L$SEH_info_div_by_2_mod_384_prologue:: DB 1,0,5,00bh DB 0,074h,1,0 DB 0,064h,2,0 DB 0,0b3h DB 0,0 DD 0,0 $L$SEH_info_div_by_2_mod_384_body:: DB 1,0,17,0 DB 000h,0f4h,001h,000h DB 000h,0e4h,002h,000h DB 000h,0d4h,003h,000h DB 000h,0c4h,004h,000h DB 000h,034h,005h,000h DB 000h,054h,006h,000h DB 000h,074h,008h,000h DB 000h,064h,009h,000h DB 000h,062h DB 000h,000h,000h,000h,000h,000h DB 000h,000h,000h,000h $L$SEH_info_div_by_2_mod_384_epilogue:: DB 1,0,4,0 DB 000h,074h,001h,000h DB 000h,064h,002h,000h DB 000h,000h,000h,000h $L$SEH_info_lshift_mod_384_prologue:: DB 1,0,5,00bh DB 0,074h,1,0 DB 0,064h,2,0 DB 0,0b3h DB 0,0 DD 0,0 $L$SEH_info_lshift_mod_384_body:: DB 1,0,17,0 DB 000h,0f4h,001h,000h DB 000h,0e4h,002h,000h DB 000h,0d4h,003h,000h DB 000h,0c4h,004h,000h DB 000h,034h,005h,000h DB 000h,054h,006h,000h DB 000h,074h,008h,000h DB 000h,064h,009h,000h DB 000h,062h DB 000h,000h,000h,000h,000h,000h DB 000h,000h,000h,000h $L$SEH_info_lshift_mod_384_epilogue:: DB 1,0,4,0 DB 000h,074h,001h,000h DB 000h,064h,002h,000h DB 000h,000h,000h,000h $L$SEH_info_mul_by_3_mod_384_prologue:: DB 1,0,5,00bh DB 0,074h,1,0 DB 0,064h,2,0 DB 0,0b3h DB 0,0 DD 0,0 $L$SEH_info_mul_by_3_mod_384_body:: DB 1,0,17,0 DB 000h,0f4h,001h,000h DB 000h,0e4h,002h,000h DB 000h,0d4h,003h,000h DB 000h,0c4h,004h,000h DB 000h,034h,005h,000h DB 000h,054h,006h,000h DB 000h,074h,008h,000h DB 000h,064h,009h,000h DB 000h,062h DB 000h,000h,000h,000h,000h,000h DB 000h,000h,000h,000h $L$SEH_info_mul_by_3_mod_384_epilogue:: DB 1,0,4,0 DB 000h,074h,001h,000h DB 000h,064h,002h,000h DB 000h,000h,000h,000h $L$SEH_info_mul_by_8_mod_384_prologue:: DB 1,0,5,00bh DB 0,074h,1,0 DB 0,064h,2,0 DB 0,0b3h DB 0,0 DD 0,0 $L$SEH_info_mul_by_8_mod_384_body:: DB 1,0,17,0 DB 000h,0f4h,001h,000h DB 000h,0e4h,002h,000h DB 000h,0d4h,003h,000h DB 000h,0c4h,004h,000h DB 000h,034h,005h,000h DB 000h,054h,006h,000h DB 000h,074h,008h,000h DB 000h,064h,009h,000h DB 000h,062h DB 000h,000h,000h,000h,000h,000h DB 000h,000h,000h,000h $L$SEH_info_mul_by_8_mod_384_epilogue:: DB 1,0,4,0 DB 000h,074h,001h,000h DB 000h,064h,002h,000h DB 000h,000h,000h,000h $L$SEH_info_mul_by_3_mod_384x_prologue:: DB 1,0,5,00bh DB 0,074h,1,0 DB 
0,064h,2,0 DB 0,0b3h DB 0,0 DD 0,0 $L$SEH_info_mul_by_3_mod_384x_body:: DB 1,0,17,0 DB 000h,0f4h,001h,000h DB 000h,0e4h,002h,000h DB 000h,0d4h,003h,000h DB 000h,0c4h,004h,000h DB 000h,034h,005h,000h DB 000h,054h,006h,000h DB 000h,074h,008h,000h DB 000h,064h,009h,000h DB 000h,062h DB 000h,000h,000h,000h,000h,000h DB 000h,000h,000h,000h $L$SEH_info_mul_by_3_mod_384x_epilogue:: DB 1,0,4,0 DB 000h,074h,001h,000h DB 000h,064h,002h,000h DB 000h,000h,000h,000h $L$SEH_info_mul_by_8_mod_384x_prologue:: DB 1,0,5,00bh DB 0,074h,1,0 DB 0,064h,2,0 DB 0,0b3h DB 0,0 DD 0,0 $L$SEH_info_mul_by_8_mod_384x_body:: DB 1,0,17,0 DB 000h,0f4h,001h,000h DB 000h,0e4h,002h,000h DB 000h,0d4h,003h,000h DB 000h,0c4h,004h,000h DB 000h,034h,005h,000h DB 000h,054h,006h,000h DB 000h,074h,008h,000h DB 000h,064h,009h,000h DB 000h,062h DB 000h,000h,000h,000h,000h,000h DB 000h,000h,000h,000h $L$SEH_info_mul_by_8_mod_384x_epilogue:: DB 1,0,4,0 DB 000h,074h,001h,000h DB 000h,064h,002h,000h DB 000h,000h,000h,000h $L$SEH_info_cneg_mod_384_prologue:: DB 1,0,5,00bh DB 0,074h,1,0 DB 0,064h,2,0 DB 0,0b3h DB 0,0 DD 0,0 $L$SEH_info_cneg_mod_384_body:: DB 1,0,17,0 DB 000h,0f4h,001h,000h DB 000h,0e4h,002h,000h DB 000h,0d4h,003h,000h DB 000h,0c4h,004h,000h DB 000h,034h,005h,000h DB 000h,054h,006h,000h DB 000h,074h,008h,000h DB 000h,064h,009h,000h DB 000h,062h DB 000h,000h,000h,000h,000h,000h DB 000h,000h,000h,000h $L$SEH_info_cneg_mod_384_epilogue:: DB 1,0,4,0 DB 000h,074h,001h,000h DB 000h,064h,002h,000h DB 000h,000h,000h,000h $L$SEH_info_sub_mod_384_prologue:: DB 1,0,5,00bh DB 0,074h,1,0 DB 0,064h,2,0 DB 0,0b3h DB 0,0 DD 0,0 $L$SEH_info_sub_mod_384_body:: DB 1,0,17,0 DB 000h,0f4h,001h,000h DB 000h,0e4h,002h,000h DB 000h,0d4h,003h,000h DB 000h,0c4h,004h,000h DB 000h,034h,005h,000h DB 000h,054h,006h,000h DB 000h,074h,008h,000h DB 000h,064h,009h,000h DB 000h,062h DB 000h,000h,000h,000h,000h,000h DB 000h,000h,000h,000h $L$SEH_info_sub_mod_384_epilogue:: DB 1,0,4,0 DB 000h,074h,001h,000h DB 000h,064h,002h,000h DB 000h,000h,000h,000h $L$SEH_info_sub_mod_384x_prologue:: DB 1,0,5,00bh DB 0,074h,1,0 DB 0,064h,2,0 DB 0,0b3h DB 0,0 DD 0,0 $L$SEH_info_sub_mod_384x_body:: DB 1,0,17,0 DB 000h,0f4h,003h,000h DB 000h,0e4h,004h,000h DB 000h,0d4h,005h,000h DB 000h,0c4h,006h,000h DB 000h,034h,007h,000h DB 000h,054h,008h,000h DB 000h,074h,00ah,000h DB 000h,064h,00bh,000h DB 000h,082h DB 000h,000h,000h,000h,000h,000h DB 000h,000h,000h,000h $L$SEH_info_sub_mod_384x_epilogue:: DB 1,0,4,0 DB 000h,074h,001h,000h DB 000h,064h,002h,000h DB 000h,000h,000h,000h $L$SEH_info_mul_by_1_plus_i_mod_384x_prologue:: DB 1,0,5,00bh DB 0,074h,1,0 DB 0,064h,2,0 DB 0,0b3h DB 0,0 DD 0,0 $L$SEH_info_mul_by_1_plus_i_mod_384x_body:: DB 1,0,17,0 DB 000h,0f4h,007h,000h DB 000h,0e4h,008h,000h DB 000h,0d4h,009h,000h DB 000h,0c4h,00ah,000h DB 000h,034h,00bh,000h DB 000h,054h,00ch,000h DB 000h,074h,00eh,000h DB 000h,064h,00fh,000h DB 000h,0c2h DB 000h,000h,000h,000h,000h,000h DB 000h,000h,000h,000h $L$SEH_info_mul_by_1_plus_i_mod_384x_epilogue:: DB 1,0,4,0 DB 000h,074h,001h,000h DB 000h,064h,002h,000h DB 000h,000h,000h,000h $L$SEH_info_sgn0_pty_mod_384_prologue:: DB 1,0,5,00bh DB 0,074h,1,0 DB 0,064h,2,0 DB 0,0b3h DB 0,0 DD 0,0 $L$SEH_info_sgn0_pty_mod_384_body:: DB 1,0,4,0 DB 000h,074h,001h,000h DB 000h,064h,002h,000h DB 000h,000h,000h,000h $L$SEH_info_sgn0_pty_mod_384_epilogue:: DB 1,0,4,0 DB 000h,074h,001h,000h DB 000h,064h,002h,000h DB 000h,000h,000h,000h $L$SEH_info_sgn0_pty_mod_384x_prologue:: DB 1,0,5,00bh DB 0,074h,1,0 DB 0,064h,2,0 DB 0,0b3h DB 0,0 DD 0,0 
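; The $L$SEH_info_* records in this .xdata section are Win64 UNWIND_INFO
; structures, paired three-per-function with the .pdata RUNTIME_FUNCTION
; ranges above (prologue/body/epilogue). Per the Windows x64 unwind
; format, the leading bytes give version+flags, prologue size,
; unwind-code count and frame register; the DB pairs that follow are
; UNWIND_CODE slots -- 074h/064h, for instance, are UWOP_SAVE_NONVOL
; records for the caller's rdi/rsi, which every routine here spills to
; [8+rsp]/[16+rsp] in its WIN64 prologue.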
$L$SEH_info_sgn0_pty_mod_384x_body:: DB 1,0,9,0 DB 000h,034h,001h,000h DB 000h,054h,002h,000h DB 000h,074h,004h,000h DB 000h,064h,005h,000h DB 000h,022h DB 000h,000h,000h,000h,000h,000h DB 000h,000h,000h,000h $L$SEH_info_sgn0_pty_mod_384x_epilogue:: DB 1,0,4,0 DB 000h,074h,001h,000h DB 000h,064h,002h,000h DB 000h,000h,000h,000h .xdata ENDS END ================================================ FILE: build/win64/add_mod_384x384-x86_64.asm ================================================ OPTION DOTNAME .text$ SEGMENT ALIGN(256) 'CODE' PUBLIC add_mod_384x384 ALIGN 32 add_mod_384x384 PROC PUBLIC DB 243,15,30,250 mov QWORD PTR[8+rsp],rdi ;WIN64 prologue mov QWORD PTR[16+rsp],rsi mov r11,rsp $L$SEH_begin_add_mod_384x384:: push rbp mov rdi,rcx mov rsi,rdx mov rdx,r8 mov rcx,r9 push rbx push r12 push r13 push r14 push r15 sub rsp,8 $L$SEH_body_add_mod_384x384:: ifdef __SGX_LVI_HARDENING__ lfence endif mov r8,QWORD PTR[rsi] mov r9,QWORD PTR[8+rsi] mov r10,QWORD PTR[16+rsi] mov r11,QWORD PTR[24+rsi] mov r12,QWORD PTR[32+rsi] mov r13,QWORD PTR[40+rsi] mov r14,QWORD PTR[48+rsi] add r8,QWORD PTR[rdx] mov r15,QWORD PTR[56+rsi] adc r9,QWORD PTR[8+rdx] mov rax,QWORD PTR[64+rsi] adc r10,QWORD PTR[16+rdx] mov rbx,QWORD PTR[72+rsi] adc r11,QWORD PTR[24+rdx] mov rbp,QWORD PTR[80+rsi] adc r12,QWORD PTR[32+rdx] mov rsi,QWORD PTR[88+rsi] adc r13,QWORD PTR[40+rdx] mov QWORD PTR[rdi],r8 adc r14,QWORD PTR[48+rdx] mov QWORD PTR[8+rdi],r9 adc r15,QWORD PTR[56+rdx] mov QWORD PTR[16+rdi],r10 adc rax,QWORD PTR[64+rdx] mov QWORD PTR[32+rdi],r12 mov r8,r14 adc rbx,QWORD PTR[72+rdx] mov QWORD PTR[24+rdi],r11 mov r9,r15 adc rbp,QWORD PTR[80+rdx] mov QWORD PTR[40+rdi],r13 mov r10,rax adc rsi,QWORD PTR[88+rdx] mov r11,rbx sbb rdx,rdx sub r14,QWORD PTR[rcx] sbb r15,QWORD PTR[8+rcx] mov r12,rbp sbb rax,QWORD PTR[16+rcx] sbb rbx,QWORD PTR[24+rcx] sbb rbp,QWORD PTR[32+rcx] mov r13,rsi sbb rsi,QWORD PTR[40+rcx] sbb rdx,0 cmovc r14,r8 cmovc r15,r9 cmovc rax,r10 mov QWORD PTR[48+rdi],r14 cmovc rbx,r11 mov QWORD PTR[56+rdi],r15 cmovc rbp,r12 mov QWORD PTR[64+rdi],rax cmovc rsi,r13 mov QWORD PTR[72+rdi],rbx mov QWORD PTR[80+rdi],rbp mov QWORD PTR[88+rdi],rsi mov r15,QWORD PTR[8+rsp] mov r14,QWORD PTR[16+rsp] mov r13,QWORD PTR[24+rsp] mov r12,QWORD PTR[32+rsp] mov rbx,QWORD PTR[40+rsp] mov rbp,QWORD PTR[48+rsp] lea rsp,QWORD PTR[56+rsp] $L$SEH_epilogue_add_mod_384x384:: mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue mov rsi,QWORD PTR[16+rsp] ifdef __SGX_LVI_HARDENING__ pop rdx lfence jmp rdx ud2 else DB 0F3h,0C3h endif $L$SEH_end_add_mod_384x384:: add_mod_384x384 ENDP PUBLIC sub_mod_384x384 ALIGN 32 sub_mod_384x384 PROC PUBLIC DB 243,15,30,250 mov QWORD PTR[8+rsp],rdi ;WIN64 prologue mov QWORD PTR[16+rsp],rsi mov r11,rsp $L$SEH_begin_sub_mod_384x384:: push rbp mov rdi,rcx mov rsi,rdx mov rdx,r8 mov rcx,r9 push rbx push r12 push r13 push r14 push r15 sub rsp,8 $L$SEH_body_sub_mod_384x384:: ifdef __SGX_LVI_HARDENING__ lfence endif mov r8,QWORD PTR[rsi] mov r9,QWORD PTR[8+rsi] mov r10,QWORD PTR[16+rsi] mov r11,QWORD PTR[24+rsi] mov r12,QWORD PTR[32+rsi] mov r13,QWORD PTR[40+rsi] mov r14,QWORD PTR[48+rsi] sub r8,QWORD PTR[rdx] mov r15,QWORD PTR[56+rsi] sbb r9,QWORD PTR[8+rdx] mov rax,QWORD PTR[64+rsi] sbb r10,QWORD PTR[16+rdx] mov rbx,QWORD PTR[72+rsi] sbb r11,QWORD PTR[24+rdx] mov rbp,QWORD PTR[80+rsi] sbb r12,QWORD PTR[32+rdx] mov rsi,QWORD PTR[88+rsi] sbb r13,QWORD PTR[40+rdx] mov QWORD PTR[rdi],r8 sbb r14,QWORD PTR[48+rdx] mov r8,QWORD PTR[rcx] mov QWORD PTR[8+rdi],r9 sbb r15,QWORD PTR[56+rdx] mov r9,QWORD PTR[8+rcx] mov QWORD PTR[16+rdi],r10 
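; sub_mod_384x384 subtracts two 768-bit (12-limb) numbers: the low six
; limbs are stored as-is, and the final borrow (sbb rdx,rdx below) turns
; into an all-ones mask that gates adding the 384-bit modulus back into
; the upper six limbs, keeping results in the redundant range
; [0, p*2^384) used by the double-width code paths. A rough C sketch of
; the idea (sub_n/cond_add_n are illustrative helpers, not blst API):
;
;   limb_t borrow = sub_n(ret, a, b, 12);   /* full 768-bit subtract */
;   limb_t mask   = (limb_t)0 - borrow;     /* all-ones iff borrow   */
;   cond_add_n(ret + 6, mod, mask, 6);      /* fix up the high half  */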
sbb rax,QWORD PTR[64+rdx] mov r10,QWORD PTR[16+rcx] mov QWORD PTR[24+rdi],r11 sbb rbx,QWORD PTR[72+rdx] mov r11,QWORD PTR[24+rcx] mov QWORD PTR[32+rdi],r12 sbb rbp,QWORD PTR[80+rdx] mov r12,QWORD PTR[32+rcx] mov QWORD PTR[40+rdi],r13 sbb rsi,QWORD PTR[88+rdx] mov r13,QWORD PTR[40+rcx] sbb rdx,rdx and r8,rdx and r9,rdx and r10,rdx and r11,rdx and r12,rdx and r13,rdx add r14,r8 adc r15,r9 mov QWORD PTR[48+rdi],r14 adc rax,r10 mov QWORD PTR[56+rdi],r15 adc rbx,r11 mov QWORD PTR[64+rdi],rax adc rbp,r12 mov QWORD PTR[72+rdi],rbx adc rsi,r13 mov QWORD PTR[80+rdi],rbp mov QWORD PTR[88+rdi],rsi mov r15,QWORD PTR[8+rsp] mov r14,QWORD PTR[16+rsp] mov r13,QWORD PTR[24+rsp] mov r12,QWORD PTR[32+rsp] mov rbx,QWORD PTR[40+rsp] mov rbp,QWORD PTR[48+rsp] lea rsp,QWORD PTR[56+rsp] $L$SEH_epilogue_sub_mod_384x384:: mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue mov rsi,QWORD PTR[16+rsp] ifdef __SGX_LVI_HARDENING__ pop rdx lfence jmp rdx ud2 else DB 0F3h,0C3h endif $L$SEH_end_sub_mod_384x384:: sub_mod_384x384 ENDP .text$ ENDS .pdata SEGMENT READONLY ALIGN(4) ALIGN 4 DD imagerel $L$SEH_begin_add_mod_384x384 DD imagerel $L$SEH_body_add_mod_384x384 DD imagerel $L$SEH_info_add_mod_384x384_prologue DD imagerel $L$SEH_body_add_mod_384x384 DD imagerel $L$SEH_epilogue_add_mod_384x384 DD imagerel $L$SEH_info_add_mod_384x384_body DD imagerel $L$SEH_epilogue_add_mod_384x384 DD imagerel $L$SEH_end_add_mod_384x384 DD imagerel $L$SEH_info_add_mod_384x384_epilogue DD imagerel $L$SEH_begin_sub_mod_384x384 DD imagerel $L$SEH_body_sub_mod_384x384 DD imagerel $L$SEH_info_sub_mod_384x384_prologue DD imagerel $L$SEH_body_sub_mod_384x384 DD imagerel $L$SEH_epilogue_sub_mod_384x384 DD imagerel $L$SEH_info_sub_mod_384x384_body DD imagerel $L$SEH_epilogue_sub_mod_384x384 DD imagerel $L$SEH_end_sub_mod_384x384 DD imagerel $L$SEH_info_sub_mod_384x384_epilogue .pdata ENDS .xdata SEGMENT READONLY ALIGN(8) ALIGN 8 $L$SEH_info_add_mod_384x384_prologue:: DB 1,0,5,00bh DB 0,074h,1,0 DB 0,064h,2,0 DB 0,0b3h DB 0,0 DD 0,0 $L$SEH_info_add_mod_384x384_body:: DB 1,0,17,0 DB 000h,0f4h,001h,000h DB 000h,0e4h,002h,000h DB 000h,0d4h,003h,000h DB 000h,0c4h,004h,000h DB 000h,034h,005h,000h DB 000h,054h,006h,000h DB 000h,074h,008h,000h DB 000h,064h,009h,000h DB 000h,062h DB 000h,000h,000h,000h,000h,000h DB 000h,000h,000h,000h $L$SEH_info_add_mod_384x384_epilogue:: DB 1,0,4,0 DB 000h,074h,001h,000h DB 000h,064h,002h,000h DB 000h,000h,000h,000h $L$SEH_info_sub_mod_384x384_prologue:: DB 1,0,5,00bh DB 0,074h,1,0 DB 0,064h,2,0 DB 0,0b3h DB 0,0 DD 0,0 $L$SEH_info_sub_mod_384x384_body:: DB 1,0,17,0 DB 000h,0f4h,001h,000h DB 000h,0e4h,002h,000h DB 000h,0d4h,003h,000h DB 000h,0c4h,004h,000h DB 000h,034h,005h,000h DB 000h,054h,006h,000h DB 000h,074h,008h,000h DB 000h,064h,009h,000h DB 000h,062h DB 000h,000h,000h,000h,000h,000h DB 000h,000h,000h,000h $L$SEH_info_sub_mod_384x384_epilogue:: DB 1,0,4,0 DB 000h,074h,001h,000h DB 000h,064h,002h,000h DB 000h,000h,000h,000h .xdata ENDS END ================================================ FILE: build/win64/blst.def ================================================ LIBRARY blst EXPORTS blst_scalar_from_uint32 blst_uint32_from_scalar blst_scalar_from_uint64 blst_uint64_from_scalar blst_scalar_from_bendian blst_bendian_from_scalar blst_scalar_from_lendian blst_lendian_from_scalar blst_scalar_fr_check blst_sk_check blst_sk_add_n_check blst_sk_sub_n_check blst_sk_mul_n_check blst_sk_inverse blst_scalar_from_le_bytes blst_scalar_from_be_bytes blst_fr_add blst_fr_sub blst_fr_mul_by_3 blst_fr_lshift blst_fr_rshift blst_fr_mul 
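; The exports from blst_scalar_from_uint32 up to this point cover
; scalar, secret-key and fr-field arithmetic; the prototypes live in
; bindings/blst.h, e.g. (per that header)
;   void blst_fr_add(blst_fr *ret, const blst_fr *a, const blst_fr *b);
; fr/fp values are, as a rule, kept in Montgomery form
; (blst_fr_to/blst_fr_from and blst_fp_to/blst_fp_from convert).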
blst_fr_sqr
blst_fr_cneg
blst_fr_eucl_inverse
blst_fr_inverse
blst_fr_from_uint64
blst_uint64_from_fr
blst_fr_from_scalar
blst_scalar_from_fr
blst_fp_add
blst_fp_sub
blst_fp_mul_by_3
blst_fp_mul_by_8
blst_fp_lshift
blst_fp_mul
blst_fp_sqr
blst_fp_cneg
blst_fp_eucl_inverse
blst_fp_inverse
blst_fp_sqrt
blst_fp_from_uint32
blst_uint32_from_fp
blst_fp_from_uint64
blst_uint64_from_fp
blst_fp_from_bendian
blst_bendian_from_fp
blst_fp_from_lendian
blst_lendian_from_fp
blst_fp2_add
blst_fp2_sub
blst_fp2_mul_by_3
blst_fp2_mul_by_8
blst_fp2_lshift
blst_fp2_mul
blst_fp2_sqr
blst_fp2_cneg
blst_fp2_eucl_inverse
blst_fp2_inverse
blst_fp2_sqrt
blst_fp12_sqr
blst_fp12_cyclotomic_sqr
blst_fp12_mul
blst_fp12_mul_by_xy00z0
blst_fp12_conjugate
blst_fp12_inverse
blst_fp12_frobenius_map
blst_fp12_is_equal
blst_fp12_is_one
blst_fp12_in_group
blst_fp12_one
blst_p1_add
blst_p1_add_or_double
blst_p1_add_affine
blst_p1_add_or_double_affine
blst_p1_double
blst_p1_mult
blst_p1_cneg
blst_p1_to_affine
blst_p1_from_affine
blst_p1_on_curve
blst_p1_in_g1
blst_p1_is_equal
blst_p1_is_inf
blst_p1_generator
blst_p1_affine_on_curve
blst_p1_affine_in_g1
blst_p1_affine_is_equal
blst_p1_affine_is_inf
blst_p1_affine_generator
blst_p2_add
blst_p2_add_or_double
blst_p2_add_affine
blst_p2_add_or_double_affine
blst_p2_double
blst_p2_mult
blst_p2_cneg
blst_p2_to_affine
blst_p2_from_affine
blst_p2_on_curve
blst_p2_in_g2
blst_p2_is_equal
blst_p2_is_inf
blst_p2_generator
blst_p2_affine_on_curve
blst_p2_affine_in_g2
blst_p2_affine_is_equal
blst_p2_affine_is_inf
blst_p2_affine_generator
blst_p1s_to_affine
blst_p1s_add
blst_p1s_mult_wbits_precompute_sizeof
blst_p1s_mult_wbits_precompute
blst_p1s_mult_wbits_scratch_sizeof
blst_p1s_mult_wbits
blst_p1s_mult_pippenger_scratch_sizeof
blst_p1s_mult_pippenger
blst_p1s_tile_pippenger
blst_p2s_to_affine
blst_p2s_add
blst_p2s_mult_wbits_precompute_sizeof
blst_p2s_mult_wbits_precompute
blst_p2s_mult_wbits_scratch_sizeof
blst_p2s_mult_wbits
blst_p2s_mult_pippenger_scratch_sizeof
blst_p2s_mult_pippenger
blst_p2s_tile_pippenger
blst_map_to_g1
blst_map_to_g2
blst_encode_to_g1
blst_hash_to_g1
blst_encode_to_g2
blst_hash_to_g2
blst_p1_serialize
blst_p1_compress
blst_p1_affine_serialize
blst_p1_affine_compress
blst_p1_uncompress
blst_p1_deserialize
blst_p2_serialize
blst_p2_compress
blst_p2_affine_serialize
blst_p2_affine_compress
blst_p2_uncompress
blst_p2_deserialize
blst_keygen
blst_sk_to_pk_in_g1
blst_sign_pk_in_g1
blst_sk_to_pk_in_g2
blst_sign_pk_in_g2
blst_miller_loop
blst_miller_loop_n
blst_final_exp
blst_precompute_lines
blst_miller_loop_lines
blst_fp12_finalverify
blst_pairing_sizeof
blst_pairing_init
blst_pairing_get_dst
blst_pairing_commit
blst_pairing_aggregate_pk_in_g2
blst_pairing_chk_n_aggr_pk_in_g2
blst_pairing_mul_n_aggregate_pk_in_g2
blst_pairing_chk_n_mul_n_aggr_pk_in_g2
blst_pairing_aggregate_pk_in_g1
blst_pairing_chk_n_aggr_pk_in_g1
blst_pairing_mul_n_aggregate_pk_in_g1
blst_pairing_chk_n_mul_n_aggr_pk_in_g1
blst_pairing_merge
blst_pairing_finalverify
blst_aggregate_in_g1
blst_aggregate_in_g2
blst_aggregated_in_g1
blst_aggregated_in_g2
blst_core_verify_pk_in_g1
blst_core_verify_pk_in_g2
BLS12_381_G1
BLS12_381_NEG_G1
BLS12_381_G2
BLS12_381_NEG_G2
blst_fr_ct_bfly
blst_fr_gs_bfly
blst_fr_to
blst_fr_from
blst_fp_to
blst_fp_from
blst_fp_is_square
blst_fp2_is_square
blst_p1_from_jacobian
blst_p2_from_jacobian
blst_sk_to_pk2_in_g1
blst_sign_pk2_in_g1
blst_sk_to_pk2_in_g2
blst_sign_pk2_in_g2
blst_uniq_sizeof
blst_uniq_init
blst_uniq_test
blst_expand_message_xmd
blst_p1_unchecked_mult
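; The trailing group, blst_fr_ct_bfly through blst_sha256, appears to
; correspond to bindings/blst_aux.h (auxiliary entry points documented
; there as subject to change). Note the multi-point convention exported
; above: each blst_p1s/p2s Pippenger or wbits routine is paired with a
; *_scratch_sizeof helper that returns the scratch size in bytes --
; roughly, per bindings/blst.h (illustrative variables):
;   limb_t *scratch = malloc(blst_p1s_mult_pippenger_scratch_sizeof(n));
;   blst_p1s_mult_pippenger(&ret, points, n, scalars, 255, scratch);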
blst_p2_unchecked_mult blst_pairing_raw_aggregate blst_pairing_as_fp12 blst_bendian_from_fp12 blst_keygen_v3 blst_keygen_v4_5 blst_keygen_v5 blst_derive_master_eip2333 blst_derive_child_eip2333 blst_scalar_from_hexascii blst_fr_from_hexascii blst_fp_from_hexascii blst_p1_sizeof blst_p1_affine_sizeof blst_p2_sizeof blst_p2_affine_sizeof blst_fp12_sizeof blst_fp_from_le_bytes blst_fp_from_be_bytes blst_sha256 ================================================ FILE: build/win64/ct_inverse_mod_256-armv8.asm ================================================ GBLA __SIZEOF_POINTER__ __SIZEOF_POINTER__ SETA 64/8 AREA |.text|,CODE,ALIGN=8,ARM64 EXPORT |ct_inverse_mod_256|[FUNC] ALIGN 32 |ct_inverse_mod_256| PROC hint #25 stp x29, x30, [sp,#-10*__SIZEOF_POINTER__]! add x29, sp, #0 stp x19, x20, [sp,#2*__SIZEOF_POINTER__] stp x21, x22, [sp,#4*__SIZEOF_POINTER__] stp x23, x24, [sp,#6*__SIZEOF_POINTER__] stp x25, x26, [sp,#8*__SIZEOF_POINTER__] sub sp, sp, #1040 ldp x4, x5, [x1,#8*0] ldp x6, x7, [x1,#8*2] if :def: __CHERI_PURE_CAPABILITY__ add x1,sp,#16+511 alignd c1,c1,#9 scbnds c1,c1,#512 else add x1, sp, #16+511 and x1, x1, #-512 endif str x0, [sp] ldp x8, x9, [x2,#8*0] ldp x10, x11, [x2,#8*2] stp x4, x5, [x1,#8*0] stp x6, x7, [x1,#8*2] stp x8, x9, [x1,#8*4] stp x10, x11, [x1,#8*6] bl |$Lab_approximation_31_256_loaded| eor x0, x1, #256 if :def: __CHERI_PURE_CAPABILITY__ scvalue c0,c1,x0 endif bl __smul_256_n_shift_by_31 str x12,[x0,#8*8] mov x12, x14 mov x13, x15 add x0,x0,#8*4 bl __smul_256_n_shift_by_31 str x12, [x0,#8*10] eor x1, x1, #256 if :def: __CHERI_PURE_CAPABILITY__ scvalue c1,c0,x1 endif bl __ab_approximation_31_256 eor x0, x1, #256 if :def: __CHERI_PURE_CAPABILITY__ scvalue c0,c1,x0 endif bl __smul_256_n_shift_by_31 mov x16, x12 mov x17, x13 mov x12, x14 mov x13, x15 add x0,x0,#8*4 bl __smul_256_n_shift_by_31 ldr x8, [x1,#8*8] ldr x9, [x1,#8*14] madd x4, x16, x8, xzr madd x4, x17, x9, x4 asr x5, x4, #63 stp x4, x5, [x0,#8*4] stp x5, x5, [x0,#8*6] madd x4, x12, x8, xzr madd x4, x13, x9, x4 asr x5, x4, #63 stp x4, x5, [x0,#8*10] stp x5, x5, [x0,#8*12] eor x1, x1, #256 if :def: __CHERI_PURE_CAPABILITY__ scvalue c1,c0,x1 endif bl __ab_approximation_31_256 eor x0, x1, #256 if :def: __CHERI_PURE_CAPABILITY__ scvalue c0,c1,x0 endif bl __smul_256_n_shift_by_31 mov x16, x12 mov x17, x13 mov x12, x14 mov x13, x15 add x0,x0,#8*4 bl __smul_256_n_shift_by_31 add x0,x0,#8*4 bl __smul_256x63 mov x16, x12 mov x17, x13 add x0,x0,#8*6 bl __smul_256x63 eor x1, x1, #256 if :def: __CHERI_PURE_CAPABILITY__ scvalue c1,c0,x1 endif bl __ab_approximation_31_256 eor x0, x1, #256 if :def: __CHERI_PURE_CAPABILITY__ scvalue c0,c1,x0 endif bl __smul_256_n_shift_by_31 mov x16, x12 mov x17, x13 mov x12, x14 mov x13, x15 add x0,x0,#8*4 bl __smul_256_n_shift_by_31 add x0,x0,#8*4 bl __smul_256x63 mov x16, x12 mov x17, x13 add x0,x0,#8*6 bl __smul_256x63 eor x1, x1, #256 if :def: __CHERI_PURE_CAPABILITY__ scvalue c1,c0,x1 endif bl __ab_approximation_31_256 eor x0, x1, #256 if :def: __CHERI_PURE_CAPABILITY__ scvalue c0,c1,x0 endif bl __smul_256_n_shift_by_31 mov x16, x12 mov x17, x13 mov x12, x14 mov x13, x15 add x0,x0,#8*4 bl __smul_256_n_shift_by_31 add x0,x0,#8*4 bl __smul_256x63 mov x16, x12 mov x17, x13 add x0,x0,#8*6 bl __smul_256x63 eor x1, x1, #256 if :def: __CHERI_PURE_CAPABILITY__ scvalue c1,c0,x1 endif bl __ab_approximation_31_256 eor x0, x1, #256 if :def: __CHERI_PURE_CAPABILITY__ scvalue c0,c1,x0 endif bl __smul_256_n_shift_by_31 mov x16, x12 mov x17, x13 mov x12, x14 mov x13, x15 add x0,x0,#8*4 bl 
__smul_256_n_shift_by_31 add x0,x0,#8*4 bl __smul_256x63 mov x16, x12 mov x17, x13 add x0,x0,#8*6 bl __smul_256x63 eor x1, x1, #256 if :def: __CHERI_PURE_CAPABILITY__ scvalue c1,c0,x1 endif bl __ab_approximation_31_256 eor x0, x1, #256 if :def: __CHERI_PURE_CAPABILITY__ scvalue c0,c1,x0 endif bl __smul_256_n_shift_by_31 mov x16, x12 mov x17, x13 mov x12, x14 mov x13, x15 add x0,x0,#8*4 bl __smul_256_n_shift_by_31 add x0,x0,#8*4 bl __smul_256x63 mov x16, x12 mov x17, x13 add x0,x0,#8*6 bl __smul_256x63 eor x1, x1, #256 if :def: __CHERI_PURE_CAPABILITY__ scvalue c1,c0,x1 endif bl __ab_approximation_31_256 eor x0, x1, #256 if :def: __CHERI_PURE_CAPABILITY__ scvalue c0,c1,x0 endif bl __smul_256_n_shift_by_31 mov x16, x12 mov x17, x13 mov x12, x14 mov x13, x15 add x0,x0,#8*4 bl __smul_256_n_shift_by_31 add x0,x0,#8*4 bl __smul_256x63 asr x24, x24, #63 str x24, [x0,#8*4] mov x16, x12 mov x17, x13 add x0,x0,#8*6 bl __smul_256x63 asr x24, x24, #63 stp x24, x24, [x0,#8*4] stp x24, x24, [x0,#8*6] eor x1, x1, #256 if :def: __CHERI_PURE_CAPABILITY__ scvalue c1,c0,x1 endif bl __ab_approximation_31_256 eor x0, x1, #256 if :def: __CHERI_PURE_CAPABILITY__ scvalue c0,c1,x0 endif bl __smul_256_n_shift_by_31 mov x16, x12 mov x17, x13 mov x12, x14 mov x13, x15 add x0,x0,#8*4 bl __smul_256_n_shift_by_31 add x0,x0,#8*4 bl __smul_256x63 adc x22, x22, x23 str x22, [x0,#8*4] mov x16, x12 mov x17, x13 add x0,x0,#8*6 bl __smul_256x63 bl __smul_512x63_tail eor x1, x1, #256 if :def: __CHERI_PURE_CAPABILITY__ scvalue c1,c0,x1 endif bl __ab_approximation_31_256 eor x0, x1, #256 if :def: __CHERI_PURE_CAPABILITY__ scvalue c0,c1,x0 endif bl __smul_256_n_shift_by_31 mov x16, x12 mov x17, x13 mov x12, x14 mov x13, x15 add x0,x0,#8*4 bl __smul_256_n_shift_by_31 add x0,x0,#8*4 bl __smul_256x63 adc x22, x22, x23 str x22, [x0,#8*4] mov x16, x12 mov x17, x13 add x0,x0,#8*6 bl __smul_256x63 bl __smul_512x63_tail eor x1, x1, #256 if :def: __CHERI_PURE_CAPABILITY__ scvalue c1,c0,x1 endif bl __ab_approximation_31_256 eor x0, x1, #256 if :def: __CHERI_PURE_CAPABILITY__ scvalue c0,c1,x0 endif bl __smul_256_n_shift_by_31 mov x16, x12 mov x17, x13 mov x12, x14 mov x13, x15 add x0,x0,#8*4 bl __smul_256_n_shift_by_31 add x0,x0,#8*4 bl __smul_256x63 adc x22, x22, x23 str x22, [x0,#8*4] mov x16, x12 mov x17, x13 add x0,x0,#8*6 bl __smul_256x63 bl __smul_512x63_tail eor x1, x1, #256 if :def: __CHERI_PURE_CAPABILITY__ scvalue c1,c0,x1 endif bl __ab_approximation_31_256 eor x0, x1, #256 if :def: __CHERI_PURE_CAPABILITY__ scvalue c0,c1,x0 endif bl __smul_256_n_shift_by_31 mov x16, x12 mov x17, x13 mov x12, x14 mov x13, x15 add x0,x0,#8*4 bl __smul_256_n_shift_by_31 add x0,x0,#8*4 bl __smul_256x63 adc x22, x22, x23 str x22, [x0,#8*4] mov x16, x12 mov x17, x13 add x0,x0,#8*6 bl __smul_256x63 bl __smul_512x63_tail eor x1, x1, #256 if :def: __CHERI_PURE_CAPABILITY__ scvalue c1,c0,x1 endif bl __ab_approximation_31_256 eor x0, x1, #256 if :def: __CHERI_PURE_CAPABILITY__ scvalue c0,c1,x0 endif bl __smul_256_n_shift_by_31 mov x16, x12 mov x17, x13 mov x12, x14 mov x13, x15 add x0,x0,#8*4 bl __smul_256_n_shift_by_31 add x0,x0,#8*4 bl __smul_256x63 adc x22, x22, x23 str x22, [x0,#8*4] mov x16, x12 mov x17, x13 add x0,x0,#8*6 bl __smul_256x63 bl __smul_512x63_tail eor x1, x1, #256 if :def: __CHERI_PURE_CAPABILITY__ scvalue c1,c0,x1 endif bl __ab_approximation_31_256 eor x0, x1, #256 if :def: __CHERI_PURE_CAPABILITY__ scvalue c0,c1,x0 endif bl __smul_256_n_shift_by_31 mov x16, x12 mov x17, x13 mov x12, x14 mov x13, x15 add x0,x0,#8*4 bl 
__smul_256_n_shift_by_31 add x0,x0,#8*4 bl __smul_256x63 adc x22, x22, x23 str x22, [x0,#8*4] mov x16, x12 mov x17, x13 add x0,x0,#8*6 bl __smul_256x63 bl __smul_512x63_tail eor x1, x1, #256 if :def: __CHERI_PURE_CAPABILITY__ scvalue c1,c0,x1 endif bl __ab_approximation_31_256 eor x0, x1, #256 if :def: __CHERI_PURE_CAPABILITY__ scvalue c0,c1,x0 endif bl __smul_256_n_shift_by_31 mov x16, x12 mov x17, x13 mov x12, x14 mov x13, x15 add x0,x0,#8*4 bl __smul_256_n_shift_by_31 add x0,x0,#8*4 bl __smul_256x63 adc x22, x22, x23 str x22, [x0,#8*4] mov x16, x12 mov x17, x13 add x0,x0,#8*6 bl __smul_256x63 bl __smul_512x63_tail eor x1, x1, #256 if :def: __CHERI_PURE_CAPABILITY__ scvalue c1,c0,x1 endif mov x2, #47 ldr x7, [x1,#8*0] ldr x11, [x1,#8*4] bl __inner_loop_62_256 mov x16, x14 mov x17, x15 ldr x0, [sp] bl __smul_256x63 bl __smul_512x63_tail ldr x30, [x29,#__SIZEOF_POINTER__] smulh x20, x7, x17 ldp x8, x9, [x3,#8*0] adc x23, x23, x25 ldp x10, x11, [x3,#8*2] add x20, x20, x23 asr x19, x20, #63 and x23, x8, x19 and x24, x9, x19 adds x4, x4, x23 and x25, x10, x19 adcs x5, x5, x24 and x26, x11, x19 adcs x6, x6, x25 adcs x7, x22, x26 adc x20, x20, xzr neg x19, x20 orr x20, x20, x19 asr x19, x19, #63 and x8, x8, x20 and x9, x9, x20 and x10, x10, x20 and x11, x11, x20 eor x8, x8, x19 eor x9, x9, x19 adds x8, x8, x19, lsr#63 eor x10, x10, x19 adcs x9, x9, xzr eor x11, x11, x19 adcs x10, x10, xzr adc x11, x11, xzr adds x4, x4, x8 adcs x5, x5, x9 adcs x6, x6, x10 stp x4, x5, [x0,#8*4] adc x7, x7, x11 stp x6, x7, [x0,#8*6] add sp, sp, #1040 ldp x19, x20, [x29,#2*__SIZEOF_POINTER__] ldp x21, x22, [x29,#4*__SIZEOF_POINTER__] ldp x23, x24, [x29,#6*__SIZEOF_POINTER__] ldp x25, x26, [x29,#8*__SIZEOF_POINTER__] ldr x29, [sp],#10*__SIZEOF_POINTER__ hint #29 ret ENDP ALIGN 32 |__smul_256x63| PROC ldp x4, x5, [x1,#8*0+64] asr x14, x16, #63 ldp x6, x7, [x1,#8*2+64] eor x16, x16, x14 ldr x22, [x1,#8*4+64] eor x4, x4, x14 sub x16, x16, x14 eor x5, x5, x14 adds x4, x4, x14, lsr#63 eor x6, x6, x14 adcs x5, x5, xzr eor x7, x7, x14 adcs x6, x6, xzr eor x22, x22, x14 umulh x19, x4, x16 adcs x7, x7, xzr umulh x20, x5, x16 adcs x22, x22, xzr umulh x21, x6, x16 mul x4, x4, x16 cmp x16, #0 mul x5, x5, x16 cselne x22,x22,xzr mul x6, x6, x16 adds x5, x5, x19 mul x24, x7, x16 adcs x6, x6, x20 adcs x24, x24, x21 adc x26, xzr, xzr ldp x8, x9, [x1,#8*0+112] asr x14, x17, #63 ldp x10, x11, [x1,#8*2+112] eor x17, x17, x14 ldr x23, [x1,#8*4+112] eor x8, x8, x14 sub x17, x17, x14 eor x9, x9, x14 adds x8, x8, x14, lsr#63 eor x10, x10, x14 adcs x9, x9, xzr eor x11, x11, x14 adcs x10, x10, xzr eor x23, x23, x14 umulh x19, x8, x17 adcs x11, x11, xzr umulh x20, x9, x17 adcs x23, x23, xzr umulh x21, x10, x17 adc x15, xzr, xzr mul x8, x8, x17 cmp x17, #0 mul x9, x9, x17 cselne x23,x23,xzr mul x10, x10, x17 adds x9, x9, x19 mul x25, x11, x17 adcs x10, x10, x20 adcs x25, x25, x21 adc x26, x26, xzr adds x4, x4, x8 adcs x5, x5, x9 adcs x6, x6, x10 stp x4, x5, [x0,#8*0] adcs x24, x24, x25 stp x6, x24, [x0,#8*2] ret ENDP ALIGN 32 |__smul_512x63_tail| PROC umulh x24, x7, x16 ldr x5, [x1,#8*19] adc x26, x26, xzr ldp x6, x7, [x1,#8*20] and x22, x22, x16 umulh x11, x11, x17 sub x24, x24, x22 asr x25, x24, #63 eor x5, x5, x14 eor x6, x6, x14 adds x5, x5, x15 eor x7, x7, x14 adcs x6, x6, xzr umulh x19, x23, x17 adc x7, x7, xzr umulh x20, x5, x17 add x11, x11, x26 umulh x21, x6, x17 mul x4, x23, x17 mul x5, x5, x17 adds x4, x4, x11 mul x6, x6, x17 adcs x5, x5, x19 mul x22, x7, x17 adcs x6, x6, x20 adcs x22, x22, x21 adc x23, xzr, xzr adds x4, x4, x24 
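; ct_inverse_mod_256 is a constant-time Euclidean inversion in the
; Bernstein-Yang style: __ab_approximation_31_256 condenses |a| and |b|
; into 64-bit approximations, __inner_loop_31_256 runs 31 branch-free
; iterations yielding signed transition factors, and the __smul_*
; helpers apply those factors to the full-width values and to the
; accumulated Bezout coefficients. Here the sign words computed above
; (x24/x25) fold the truncated low half back into the upper limbs of
; the 512-bit accumulator.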
adcs x5, x5, x25 adcs x6, x6, x25 stp x4, x5, [x0,#8*4] adcs x22, x22, x25 stp x6, x22, [x0,#8*6] ret ENDP ALIGN 32 |__smul_256_n_shift_by_31| PROC ldp x4, x5, [x1,#8*0+0] asr x24, x12, #63 ldp x6, x7, [x1,#8*2+0] eor x25, x12, x24 eor x4, x4, x24 sub x25, x25, x24 eor x5, x5, x24 adds x4, x4, x24, lsr#63 eor x6, x6, x24 adcs x5, x5, xzr eor x7, x7, x24 umulh x19, x4, x25 adcs x6, x6, xzr umulh x20, x5, x25 adc x7, x7, xzr umulh x21, x6, x25 and x24, x24, x25 umulh x22, x7, x25 neg x24, x24 mul x4, x4, x25 mul x5, x5, x25 mul x6, x6, x25 adds x5, x5, x19 mul x7, x7, x25 adcs x6, x6, x20 adcs x7, x7, x21 adc x22, x22, x24 ldp x8, x9, [x1,#8*0+32] asr x24, x13, #63 ldp x10, x11, [x1,#8*2+32] eor x25, x13, x24 eor x8, x8, x24 sub x25, x25, x24 eor x9, x9, x24 adds x8, x8, x24, lsr#63 eor x10, x10, x24 adcs x9, x9, xzr eor x11, x11, x24 umulh x19, x8, x25 adcs x10, x10, xzr umulh x20, x9, x25 adc x11, x11, xzr umulh x21, x10, x25 and x24, x24, x25 umulh x23, x11, x25 neg x24, x24 mul x8, x8, x25 mul x9, x9, x25 mul x10, x10, x25 adds x9, x9, x19 mul x11, x11, x25 adcs x10, x10, x20 adcs x11, x11, x21 adc x23, x23, x24 adds x4, x4, x8 adcs x5, x5, x9 adcs x6, x6, x10 adcs x7, x7, x11 adc x8, x22, x23 extr x4, x5, x4, #31 extr x5, x6, x5, #31 extr x6, x7, x6, #31 asr x23, x8, #63 extr x7, x8, x7, #31 eor x4, x4, x23 eor x5, x5, x23 adds x4, x4, x23, lsr#63 eor x6, x6, x23 adcs x5, x5, xzr eor x7, x7, x23 adcs x6, x6, xzr stp x4, x5, [x0,#8*0] adc x7, x7, xzr stp x6, x7, [x0,#8*2] eor x12, x12, x23 eor x13, x13, x23 sub x12, x12, x23 sub x13, x13, x23 ret ENDP ALIGN 16 |__ab_approximation_31_256| PROC ldp x6, x7, [x1,#8*2] ldp x10, x11, [x1,#8*6] ldp x4, x5, [x1,#8*0] ldp x8, x9, [x1,#8*4] |$Lab_approximation_31_256_loaded| orr x19, x7, x11 cmp x19, #0 cselne x7,x7,x6 cselne x11,x11,x10 cselne x6,x6,x5 orr x19, x7, x11 cselne x10,x10,x9 cmp x19, #0 cselne x7,x7,x6 cselne x11,x11,x10 cselne x6,x6,x4 orr x19, x7, x11 cselne x10,x10,x8 clz x19, x19 cmp x19, #64 cselne x19,x19,xzr cselne x7,x7,x6 cselne x11,x11,x10 neg x20, x19 lslv x7, x7, x19 lslv x11, x11, x19 lsrv x6, x6, x20 lsrv x10, x10, x20 and x6, x6, x20, asr#6 and x10, x10, x20, asr#6 orr x7, x7, x6 orr x11, x11, x10 bfxil x7, x4, #0, #31 bfxil x11, x8, #0, #31 b __inner_loop_31_256 ret ENDP ALIGN 16 |__inner_loop_31_256| PROC mov x2, #31 mov x13, #0x7FFFFFFF80000000 mov x15, #0x800000007FFFFFFF mov x23,#0x7FFFFFFF7FFFFFFF |$Loop_31_256| sbfx x22, x7, #0, #1 sub x2, x2, #1 and x19, x11, x22 sub x20, x11, x7 subs x21, x7, x19 mov x19, x15 cselhs x11,x11,x7 cselhs x7,x21,x20 cselhs x15,x15,x13 cselhs x13,x13,x19 lsr x7, x7, #1 and x19, x15, x22 and x20, x23, x22 sub x13, x13, x19 add x15, x15, x15 add x13, x13, x20 sub x15, x15, x23 cbnz x2, |$Loop_31_256| mov x23, #0x7FFFFFFF ubfx x12, x13, #0, #32 ubfx x13, x13, #32, #32 ubfx x14, x15, #0, #32 ubfx x15, x15, #32, #32 sub x12, x12, x23 sub x13, x13, x23 sub x14, x14, x23 sub x15, x15, x23 ret ENDP ALIGN 16 |__inner_loop_62_256| PROC mov x12, #1 mov x13, #0 mov x14, #0 mov x15, #1 |$Loop_62_256| sbfx x22, x7, #0, #1 sub x2, x2, #1 and x19, x11, x22 sub x20, x11, x7 subs x21, x7, x19 mov x19, x12 cselhs x11,x11,x7 cselhs x7,x21,x20 mov x20, x13 cselhs x12,x12,x14 cselhs x14,x14,x19 cselhs x13,x13,x15 cselhs x15,x15,x20 lsr x7, x7, #1 and x19, x14, x22 and x20, x15, x22 add x14, x14, x14 add x15, x15, x15 sub x12, x12, x19 sub x13, x13, x20 cbnz x2, |$Loop_62_256| ret ENDP END ================================================ FILE: build/win64/ct_inverse_mod_256-x86_64.asm 
================================================ OPTION DOTNAME .text$ SEGMENT ALIGN(256) 'CODE' PUBLIC ct_inverse_mod_256 ALIGN 32 ct_inverse_mod_256 PROC PUBLIC DB 243,15,30,250 mov QWORD PTR[8+rsp],rdi ;WIN64 prologue mov QWORD PTR[16+rsp],rsi mov r11,rsp $L$SEH_begin_ct_inverse_mod_256:: push rbp mov rdi,rcx mov rsi,rdx mov rdx,r8 mov rcx,r9 push rbx push r12 push r13 push r14 push r15 sub rsp,1072 $L$SEH_body_ct_inverse_mod_256:: lea rax,QWORD PTR[((48+511))+rsp] and rax,-512 mov QWORD PTR[32+rsp],rdi mov QWORD PTR[40+rsp],rcx ifdef __SGX_LVI_HARDENING__ lfence endif mov r8,QWORD PTR[rsi] mov r9,QWORD PTR[8+rsi] mov r10,QWORD PTR[16+rsi] mov r11,QWORD PTR[24+rsi] mov r12,QWORD PTR[rdx] mov r13,QWORD PTR[8+rdx] mov r14,QWORD PTR[16+rdx] mov r15,QWORD PTR[24+rdx] mov QWORD PTR[rax],r8 mov QWORD PTR[8+rax],r9 mov QWORD PTR[16+rax],r10 mov QWORD PTR[24+rax],r11 mov QWORD PTR[32+rax],r12 mov QWORD PTR[40+rax],r13 mov QWORD PTR[48+rax],r14 mov QWORD PTR[56+rax],r15 mov rsi,rax mov edx,31 call __ab_approximation_31_256 mov QWORD PTR[16+rsp],r12 mov QWORD PTR[24+rsp],r13 mov rdi,256 xor rdi,rsi call __smulq_256_n_shift_by_31 mov QWORD PTR[64+rdi],rdx mov rdx,QWORD PTR[16+rsp] mov rcx,QWORD PTR[24+rsp] lea rdi,QWORD PTR[32+rdi] call __smulq_256_n_shift_by_31 mov QWORD PTR[72+rdi],rdx xor rsi,256 mov edx,31 call __ab_approximation_31_256 mov QWORD PTR[16+rsp],r12 mov QWORD PTR[24+rsp],r13 mov rdi,256 xor rdi,rsi call __smulq_256_n_shift_by_31 mov QWORD PTR[rsp],rdx mov QWORD PTR[8+rsp],rcx mov rdx,QWORD PTR[16+rsp] mov rcx,QWORD PTR[24+rsp] lea rdi,QWORD PTR[32+rdi] call __smulq_256_n_shift_by_31 mov r8,QWORD PTR[64+rsi] mov r12,QWORD PTR[104+rsi] mov r9,r8 imul r8,QWORD PTR[rsp] mov r13,r12 imul r12,QWORD PTR[8+rsp] add r8,r12 mov QWORD PTR[32+rdi],r8 sar r8,63 mov QWORD PTR[40+rdi],r8 mov QWORD PTR[48+rdi],r8 mov QWORD PTR[56+rdi],r8 mov QWORD PTR[64+rdi],r8 lea rsi,QWORD PTR[64+rsi] imul r9,rdx imul r13,rcx add r9,r13 mov QWORD PTR[72+rdi],r9 sar r9,63 mov QWORD PTR[80+rdi],r9 mov QWORD PTR[88+rdi],r9 mov QWORD PTR[96+rdi],r9 mov QWORD PTR[104+rdi],r9 xor rsi,256+8*8 mov edx,31 call __ab_approximation_31_256 mov QWORD PTR[16+rsp],r12 mov QWORD PTR[24+rsp],r13 mov rdi,256 xor rdi,rsi call __smulq_256_n_shift_by_31 mov QWORD PTR[rsp],rdx mov QWORD PTR[8+rsp],rcx mov rdx,QWORD PTR[16+rsp] mov rcx,QWORD PTR[24+rsp] lea rdi,QWORD PTR[32+rdi] call __smulq_256_n_shift_by_31 mov QWORD PTR[16+rsp],rdx mov QWORD PTR[24+rsp],rcx mov rdx,QWORD PTR[rsp] mov rcx,QWORD PTR[8+rsp] lea rsi,QWORD PTR[64+rsi] lea rdi,QWORD PTR[32+rdi] call __smulq_256x63 mov rdx,QWORD PTR[16+rsp] mov rcx,QWORD PTR[24+rsp] lea rdi,QWORD PTR[40+rdi] call __smulq_256x63 xor rsi,256+8*8 mov edx,31 call __ab_approximation_31_256 mov QWORD PTR[16+rsp],r12 mov QWORD PTR[24+rsp],r13 mov rdi,256 xor rdi,rsi call __smulq_256_n_shift_by_31 mov QWORD PTR[rsp],rdx mov QWORD PTR[8+rsp],rcx mov rdx,QWORD PTR[16+rsp] mov rcx,QWORD PTR[24+rsp] lea rdi,QWORD PTR[32+rdi] call __smulq_256_n_shift_by_31 mov QWORD PTR[16+rsp],rdx mov QWORD PTR[24+rsp],rcx mov rdx,QWORD PTR[rsp] mov rcx,QWORD PTR[8+rsp] lea rsi,QWORD PTR[64+rsi] lea rdi,QWORD PTR[32+rdi] call __smulq_256x63 mov rdx,QWORD PTR[16+rsp] mov rcx,QWORD PTR[24+rsp] lea rdi,QWORD PTR[40+rdi] call __smulq_256x63 xor rsi,256+8*8 mov edx,31 call __ab_approximation_31_256 mov QWORD PTR[16+rsp],r12 mov QWORD PTR[24+rsp],r13 mov rdi,256 xor rdi,rsi call __smulq_256_n_shift_by_31 mov QWORD PTR[rsp],rdx mov QWORD PTR[8+rsp],rcx mov rdx,QWORD PTR[16+rsp] mov rcx,QWORD PTR[24+rsp] lea rdi,QWORD 
PTR[32+rdi] call __smulq_256_n_shift_by_31 mov QWORD PTR[16+rsp],rdx mov QWORD PTR[24+rsp],rcx mov rdx,QWORD PTR[rsp] mov rcx,QWORD PTR[8+rsp] lea rsi,QWORD PTR[64+rsi] lea rdi,QWORD PTR[32+rdi] call __smulq_256x63 mov rdx,QWORD PTR[16+rsp] mov rcx,QWORD PTR[24+rsp] lea rdi,QWORD PTR[40+rdi] call __smulq_256x63 xor rsi,256+8*8 mov edx,31 call __ab_approximation_31_256 mov QWORD PTR[16+rsp],r12 mov QWORD PTR[24+rsp],r13 mov rdi,256 xor rdi,rsi call __smulq_256_n_shift_by_31 mov QWORD PTR[rsp],rdx mov QWORD PTR[8+rsp],rcx mov rdx,QWORD PTR[16+rsp] mov rcx,QWORD PTR[24+rsp] lea rdi,QWORD PTR[32+rdi] call __smulq_256_n_shift_by_31 mov QWORD PTR[16+rsp],rdx mov QWORD PTR[24+rsp],rcx mov rdx,QWORD PTR[rsp] mov rcx,QWORD PTR[8+rsp] lea rsi,QWORD PTR[64+rsi] lea rdi,QWORD PTR[32+rdi] call __smulq_256x63 mov rdx,QWORD PTR[16+rsp] mov rcx,QWORD PTR[24+rsp] lea rdi,QWORD PTR[40+rdi] call __smulq_256x63 xor rsi,256+8*8 mov edx,31 call __ab_approximation_31_256 mov QWORD PTR[16+rsp],r12 mov QWORD PTR[24+rsp],r13 mov rdi,256 xor rdi,rsi call __smulq_256_n_shift_by_31 mov QWORD PTR[rsp],rdx mov QWORD PTR[8+rsp],rcx mov rdx,QWORD PTR[16+rsp] mov rcx,QWORD PTR[24+rsp] lea rdi,QWORD PTR[32+rdi] call __smulq_256_n_shift_by_31 mov QWORD PTR[16+rsp],rdx mov QWORD PTR[24+rsp],rcx mov rdx,QWORD PTR[rsp] mov rcx,QWORD PTR[8+rsp] lea rsi,QWORD PTR[64+rsi] lea rdi,QWORD PTR[32+rdi] call __smulq_256x63 mov rdx,QWORD PTR[16+rsp] mov rcx,QWORD PTR[24+rsp] lea rdi,QWORD PTR[40+rdi] call __smulq_256x63 xor rsi,256+8*8 mov edx,31 call __ab_approximation_31_256 mov QWORD PTR[16+rsp],r12 mov QWORD PTR[24+rsp],r13 mov rdi,256 xor rdi,rsi call __smulq_256_n_shift_by_31 mov QWORD PTR[rsp],rdx mov QWORD PTR[8+rsp],rcx mov rdx,QWORD PTR[16+rsp] mov rcx,QWORD PTR[24+rsp] lea rdi,QWORD PTR[32+rdi] call __smulq_256_n_shift_by_31 mov QWORD PTR[16+rsp],rdx mov QWORD PTR[24+rsp],rcx mov rdx,QWORD PTR[rsp] mov rcx,QWORD PTR[8+rsp] lea rsi,QWORD PTR[64+rsi] lea rdi,QWORD PTR[32+rdi] call __smulq_256x63 mov rdx,QWORD PTR[16+rsp] mov rcx,QWORD PTR[24+rsp] lea rdi,QWORD PTR[40+rdi] call __smulq_256x63 xor rsi,256+8*8 mov edx,31 call __ab_approximation_31_256 mov QWORD PTR[16+rsp],r12 mov QWORD PTR[24+rsp],r13 mov rdi,256 xor rdi,rsi call __smulq_256_n_shift_by_31 mov QWORD PTR[rsp],rdx mov QWORD PTR[8+rsp],rcx mov rdx,QWORD PTR[16+rsp] mov rcx,QWORD PTR[24+rsp] lea rdi,QWORD PTR[32+rdi] call __smulq_256_n_shift_by_31 mov QWORD PTR[16+rsp],rdx mov QWORD PTR[24+rsp],rcx mov rdx,QWORD PTR[rsp] mov rcx,QWORD PTR[8+rsp] lea rsi,QWORD PTR[64+rsi] lea rdi,QWORD PTR[32+rdi] call __smulq_256x63 mov rdx,QWORD PTR[16+rsp] mov rcx,QWORD PTR[24+rsp] lea rdi,QWORD PTR[40+rdi] call __smulq_256x63 sar rbp,63 mov QWORD PTR[40+rdi],rbp mov QWORD PTR[48+rdi],rbp mov QWORD PTR[56+rdi],rbp xor rsi,256+8*8 mov edx,31 call __ab_approximation_31_256 mov QWORD PTR[16+rsp],r12 mov QWORD PTR[24+rsp],r13 mov rdi,256 xor rdi,rsi call __smulq_256_n_shift_by_31 mov QWORD PTR[rsp],rdx mov QWORD PTR[8+rsp],rcx mov rdx,QWORD PTR[16+rsp] mov rcx,QWORD PTR[24+rsp] lea rdi,QWORD PTR[32+rdi] call __smulq_256_n_shift_by_31 mov QWORD PTR[16+rsp],rdx mov QWORD PTR[24+rsp],rcx mov rdx,QWORD PTR[rsp] mov rcx,QWORD PTR[8+rsp] lea rsi,QWORD PTR[64+rsi] lea rdi,QWORD PTR[32+rdi] call __smulq_256x63 mov rdx,QWORD PTR[16+rsp] mov rcx,QWORD PTR[24+rsp] lea rdi,QWORD PTR[40+rdi] call __smulq_512x63 xor rsi,256+8*8 mov edx,31 call __ab_approximation_31_256 mov QWORD PTR[16+rsp],r12 mov QWORD PTR[24+rsp],r13 mov rdi,256 xor rdi,rsi call __smulq_256_n_shift_by_31 mov QWORD PTR[rsp],rdx mov 
QWORD PTR[8+rsp],rcx mov rdx,QWORD PTR[16+rsp] mov rcx,QWORD PTR[24+rsp] lea rdi,QWORD PTR[32+rdi] call __smulq_256_n_shift_by_31 mov QWORD PTR[16+rsp],rdx mov QWORD PTR[24+rsp],rcx mov rdx,QWORD PTR[rsp] mov rcx,QWORD PTR[8+rsp] lea rsi,QWORD PTR[64+rsi] lea rdi,QWORD PTR[32+rdi] call __smulq_256x63 mov rdx,QWORD PTR[16+rsp] mov rcx,QWORD PTR[24+rsp] lea rdi,QWORD PTR[40+rdi] call __smulq_512x63 xor rsi,256+8*8 mov edx,31 call __ab_approximation_31_256 mov QWORD PTR[16+rsp],r12 mov QWORD PTR[24+rsp],r13 mov rdi,256 xor rdi,rsi call __smulq_256_n_shift_by_31 mov QWORD PTR[rsp],rdx mov QWORD PTR[8+rsp],rcx mov rdx,QWORD PTR[16+rsp] mov rcx,QWORD PTR[24+rsp] lea rdi,QWORD PTR[32+rdi] call __smulq_256_n_shift_by_31 mov QWORD PTR[16+rsp],rdx mov QWORD PTR[24+rsp],rcx mov rdx,QWORD PTR[rsp] mov rcx,QWORD PTR[8+rsp] lea rsi,QWORD PTR[64+rsi] lea rdi,QWORD PTR[32+rdi] call __smulq_256x63 mov rdx,QWORD PTR[16+rsp] mov rcx,QWORD PTR[24+rsp] lea rdi,QWORD PTR[40+rdi] call __smulq_512x63 xor rsi,256+8*8 mov edx,31 call __ab_approximation_31_256 mov QWORD PTR[16+rsp],r12 mov QWORD PTR[24+rsp],r13 mov rdi,256 xor rdi,rsi call __smulq_256_n_shift_by_31 mov QWORD PTR[rsp],rdx mov QWORD PTR[8+rsp],rcx mov rdx,QWORD PTR[16+rsp] mov rcx,QWORD PTR[24+rsp] lea rdi,QWORD PTR[32+rdi] call __smulq_256_n_shift_by_31 mov QWORD PTR[16+rsp],rdx mov QWORD PTR[24+rsp],rcx mov rdx,QWORD PTR[rsp] mov rcx,QWORD PTR[8+rsp] lea rsi,QWORD PTR[64+rsi] lea rdi,QWORD PTR[32+rdi] call __smulq_256x63 mov rdx,QWORD PTR[16+rsp] mov rcx,QWORD PTR[24+rsp] lea rdi,QWORD PTR[40+rdi] call __smulq_512x63 xor rsi,256+8*8 mov edx,31 call __ab_approximation_31_256 mov QWORD PTR[16+rsp],r12 mov QWORD PTR[24+rsp],r13 mov rdi,256 xor rdi,rsi call __smulq_256_n_shift_by_31 mov QWORD PTR[rsp],rdx mov QWORD PTR[8+rsp],rcx mov rdx,QWORD PTR[16+rsp] mov rcx,QWORD PTR[24+rsp] lea rdi,QWORD PTR[32+rdi] call __smulq_256_n_shift_by_31 mov QWORD PTR[16+rsp],rdx mov QWORD PTR[24+rsp],rcx mov rdx,QWORD PTR[rsp] mov rcx,QWORD PTR[8+rsp] lea rsi,QWORD PTR[64+rsi] lea rdi,QWORD PTR[32+rdi] call __smulq_256x63 mov rdx,QWORD PTR[16+rsp] mov rcx,QWORD PTR[24+rsp] lea rdi,QWORD PTR[40+rdi] call __smulq_512x63 xor rsi,256+8*8 mov edx,31 call __ab_approximation_31_256 mov QWORD PTR[16+rsp],r12 mov QWORD PTR[24+rsp],r13 mov rdi,256 xor rdi,rsi call __smulq_256_n_shift_by_31 mov QWORD PTR[rsp],rdx mov QWORD PTR[8+rsp],rcx mov rdx,QWORD PTR[16+rsp] mov rcx,QWORD PTR[24+rsp] lea rdi,QWORD PTR[32+rdi] call __smulq_256_n_shift_by_31 mov QWORD PTR[16+rsp],rdx mov QWORD PTR[24+rsp],rcx mov rdx,QWORD PTR[rsp] mov rcx,QWORD PTR[8+rsp] lea rsi,QWORD PTR[64+rsi] lea rdi,QWORD PTR[32+rdi] call __smulq_256x63 mov rdx,QWORD PTR[16+rsp] mov rcx,QWORD PTR[24+rsp] lea rdi,QWORD PTR[40+rdi] call __smulq_512x63 xor rsi,256+8*8 mov edx,47 mov r8,QWORD PTR[rsi] mov r10,QWORD PTR[32+rsi] call __inner_loop_62_256 lea rsi,QWORD PTR[64+rsi] mov rdx,r12 mov rcx,r13 mov rdi,QWORD PTR[32+rsp] call __smulq_512x63 adc rdx,rbp mov rsi,QWORD PTR[40+rsp] mov rax,rdx sar rdx,63 mov r8,rdx mov r9,rdx ifdef __SGX_LVI_HARDENING__ lfence endif and r8,QWORD PTR[rsi] mov r10,rdx and r9,QWORD PTR[8+rsi] and r10,QWORD PTR[16+rsi] and rdx,QWORD PTR[24+rsi] add r12,r8 adc r13,r9 adc r14,r10 adc r15,rdx adc rax,0 mov rdx,rax neg rax or rdx,rax sar rax,63 mov r8,rdx mov r9,rdx and r8,QWORD PTR[rsi] mov r10,rdx and r9,QWORD PTR[8+rsi] and r10,QWORD PTR[16+rsi] and rdx,QWORD PTR[24+rsi] xor r8,rax xor rcx,rcx xor r9,rax sub rcx,rax xor r10,rax xor rdx,rax add r8,rcx adc r9,0 adc r10,0 adc rdx,0 add r12,r8 adc 
r13,r9 adc r14,r10 adc r15,rdx mov QWORD PTR[32+rdi],r12 mov QWORD PTR[40+rdi],r13 mov QWORD PTR[48+rdi],r14 mov QWORD PTR[56+rdi],r15 lea r8,QWORD PTR[1072+rsp] mov r15,QWORD PTR[r8] mov r14,QWORD PTR[8+r8] mov r13,QWORD PTR[16+r8] mov r12,QWORD PTR[24+r8] mov rbx,QWORD PTR[32+r8] mov rbp,QWORD PTR[40+r8] lea rsp,QWORD PTR[48+r8] $L$SEH_epilogue_ct_inverse_mod_256:: mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue mov rsi,QWORD PTR[16+rsp] ifdef __SGX_LVI_HARDENING__ pop rdx lfence jmp rdx ud2 else DB 0F3h,0C3h endif $L$SEH_end_ct_inverse_mod_256:: ct_inverse_mod_256 ENDP ALIGN 32 __smulq_512x63 PROC PRIVATE DB 243,15,30,250 mov r8,QWORD PTR[rsi] mov r9,QWORD PTR[8+rsi] mov r10,QWORD PTR[16+rsi] mov r11,QWORD PTR[24+rsi] mov rbp,QWORD PTR[32+rsi] mov rbx,rdx sar rdx,63 xor rax,rax sub rax,rdx xor rbx,rdx add rbx,rax xor r8,rdx xor r9,rdx xor r10,rdx xor r11,rdx xor rbp,rdx add rax,r8 adc r9,0 adc r10,0 adc r11,0 adc rbp,0 mul rbx mov QWORD PTR[rdi],rax mov rax,r9 mov r9,rdx mul rbx add r9,rax mov rax,r10 adc rdx,0 mov QWORD PTR[8+rdi],r9 mov r10,rdx mul rbx add r10,rax mov rax,r11 adc rdx,0 mov QWORD PTR[16+rdi],r10 mov r11,rdx and rbp,rbx neg rbp mul rbx add r11,rax adc rbp,rdx mov QWORD PTR[24+rdi],r11 mov r8,QWORD PTR[40+rsi] mov r9,QWORD PTR[48+rsi] mov r10,QWORD PTR[56+rsi] mov r11,QWORD PTR[64+rsi] mov r12,QWORD PTR[72+rsi] mov r13,QWORD PTR[80+rsi] mov r14,QWORD PTR[88+rsi] mov r15,QWORD PTR[96+rsi] mov rdx,rcx sar rdx,63 xor rax,rax sub rax,rdx xor rcx,rdx add rcx,rax xor r8,rdx xor r9,rdx xor r10,rdx xor r11,rdx xor r12,rdx xor r13,rdx xor r14,rdx xor r15,rdx add rax,r8 adc r9,0 adc r10,0 adc r11,0 adc r12,0 adc r13,0 adc r14,0 adc r15,0 mul rcx mov r8,rax mov rax,r9 mov r9,rdx mul rcx add r9,rax mov rax,r10 adc rdx,0 mov r10,rdx mul rcx add r10,rax mov rax,r11 adc rdx,0 mov r11,rdx mul rcx add r11,rax mov rax,r12 adc rdx,0 mov r12,rdx mul rcx add r12,rax mov rax,r13 adc rdx,0 mov r13,rdx mul rcx add r13,rax mov rax,r14 adc rdx,0 mov r14,rdx mul rcx add r14,rax mov rax,r15 adc rdx,0 mov r15,rdx imul rcx add r15,rax adc rdx,0 mov rbx,rbp sar rbp,63 add r8,QWORD PTR[rdi] adc r9,QWORD PTR[8+rdi] adc r10,QWORD PTR[16+rdi] adc r11,QWORD PTR[24+rdi] adc r12,rbx adc r13,rbp adc r14,rbp adc r15,rbp mov QWORD PTR[rdi],r8 mov QWORD PTR[8+rdi],r9 mov QWORD PTR[16+rdi],r10 mov QWORD PTR[24+rdi],r11 mov QWORD PTR[32+rdi],r12 mov QWORD PTR[40+rdi],r13 mov QWORD PTR[48+rdi],r14 mov QWORD PTR[56+rdi],r15 ifdef __SGX_LVI_HARDENING__ pop r8 lfence jmp r8 ud2 else DB 0F3h,0C3h endif __smulq_512x63 ENDP ALIGN 32 __smulq_256x63 PROC PRIVATE DB 243,15,30,250 mov r8,QWORD PTR[((0+0))+rsi] mov r9,QWORD PTR[((0+8))+rsi] mov r10,QWORD PTR[((0+16))+rsi] mov r11,QWORD PTR[((0+24))+rsi] mov rbp,QWORD PTR[((0+32))+rsi] mov rbx,rdx sar rdx,63 xor rax,rax sub rax,rdx xor rbx,rdx add rbx,rax xor r8,rdx xor r9,rdx xor r10,rdx xor r11,rdx xor rbp,rdx add rax,r8 adc r9,0 adc r10,0 adc r11,0 adc rbp,0 mul rbx mov r8,rax mov rax,r9 mov r9,rdx mul rbx add r9,rax mov rax,r10 adc rdx,0 mov r10,rdx mul rbx add r10,rax mov rax,r11 adc rdx,0 mov r11,rdx and rbp,rbx neg rbp mul rbx add r11,rax adc rbp,rdx mov rdx,rcx mov r12,QWORD PTR[((40+0))+rsi] mov r13,QWORD PTR[((40+8))+rsi] mov r14,QWORD PTR[((40+16))+rsi] mov r15,QWORD PTR[((40+24))+rsi] mov rcx,QWORD PTR[((40+32))+rsi] mov rbx,rdx sar rdx,63 xor rax,rax sub rax,rdx xor rbx,rdx add rbx,rax xor r12,rdx xor r13,rdx xor r14,rdx xor r15,rdx xor rcx,rdx add rax,r12 adc r13,0 adc r14,0 adc r15,0 adc rcx,0 mul rbx mov r12,rax mov rax,r13 mov r13,rdx mul rbx add r13,rax mov rax,r14 
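; Branch-free sign handling, used throughout these __smulq_* helpers:
; sar 63 broadcasts the 63-bit factor's sign into a mask, the operand
; limbs are xor-ed with it and the borrow re-added (two's-complement
; conditional negation), so the widening mul chain only ever sees
; non-negative inputs and the execution trace stays independent of the
; secret operands.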
adc rdx,0 mov r14,rdx mul rbx add r14,rax mov rax,r15 adc rdx,0 mov r15,rdx and rcx,rbx neg rcx mul rbx add r15,rax adc rcx,rdx add r8,r12 adc r9,r13 adc r10,r14 adc r11,r15 adc rbp,rcx mov QWORD PTR[rdi],r8 mov QWORD PTR[8+rdi],r9 mov QWORD PTR[16+rdi],r10 mov QWORD PTR[24+rdi],r11 mov QWORD PTR[32+rdi],rbp ifdef __SGX_LVI_HARDENING__ pop rdx lfence jmp rdx ud2 else DB 0F3h,0C3h endif __smulq_256x63 ENDP ALIGN 32 __smulq_256_n_shift_by_31 PROC PRIVATE DB 243,15,30,250 mov QWORD PTR[rdi],rdx mov QWORD PTR[8+rdi],rcx mov rbp,rdx mov r8,QWORD PTR[((0+0))+rsi] mov r9,QWORD PTR[((0+8))+rsi] mov r10,QWORD PTR[((0+16))+rsi] mov r11,QWORD PTR[((0+24))+rsi] mov rbx,rbp sar rbp,63 xor rax,rax sub rax,rbp xor rbx,rbp add rbx,rax xor r8,rbp xor r9,rbp xor r10,rbp xor r11,rbp add rax,r8 adc r9,0 adc r10,0 adc r11,0 mul rbx mov r8,rax mov rax,r9 and rbp,rbx neg rbp mov r9,rdx mul rbx add r9,rax mov rax,r10 adc rdx,0 mov r10,rdx mul rbx add r10,rax mov rax,r11 adc rdx,0 mov r11,rdx mul rbx add r11,rax adc rbp,rdx mov r12,QWORD PTR[((32+0))+rsi] mov r13,QWORD PTR[((32+8))+rsi] mov r14,QWORD PTR[((32+16))+rsi] mov r15,QWORD PTR[((32+24))+rsi] mov rbx,rcx sar rcx,63 xor rax,rax sub rax,rcx xor rbx,rcx add rbx,rax xor r12,rcx xor r13,rcx xor r14,rcx xor r15,rcx add rax,r12 adc r13,0 adc r14,0 adc r15,0 mul rbx mov r12,rax mov rax,r13 and rcx,rbx neg rcx mov r13,rdx mul rbx add r13,rax mov rax,r14 adc rdx,0 mov r14,rdx mul rbx add r14,rax mov rax,r15 adc rdx,0 mov r15,rdx mul rbx add r15,rax adc rcx,rdx add r8,r12 adc r9,r13 adc r10,r14 adc r11,r15 adc rbp,rcx mov rdx,QWORD PTR[rdi] mov rcx,QWORD PTR[8+rdi] shrd r8,r9,31 shrd r9,r10,31 shrd r10,r11,31 shrd r11,rbp,31 sar rbp,63 xor rax,rax sub rax,rbp xor r8,rbp xor r9,rbp xor r10,rbp xor r11,rbp add r8,rax adc r9,0 adc r10,0 adc r11,0 mov QWORD PTR[rdi],r8 mov QWORD PTR[8+rdi],r9 mov QWORD PTR[16+rdi],r10 mov QWORD PTR[24+rdi],r11 xor rdx,rbp xor rcx,rbp add rdx,rax add rcx,rax ifdef __SGX_LVI_HARDENING__ pop r8 lfence jmp r8 ud2 else DB 0F3h,0C3h endif __smulq_256_n_shift_by_31 ENDP ALIGN 32 __ab_approximation_31_256 PROC PRIVATE DB 243,15,30,250 mov r9,QWORD PTR[24+rsi] mov r11,QWORD PTR[56+rsi] mov rbx,QWORD PTR[16+rsi] mov rbp,QWORD PTR[48+rsi] mov r8,QWORD PTR[8+rsi] mov r10,QWORD PTR[40+rsi] mov rax,r9 or rax,r11 cmovz r9,rbx cmovz r11,rbp cmovz rbx,r8 mov r8,QWORD PTR[rsi] cmovz rbp,r10 mov r10,QWORD PTR[32+rsi] mov rax,r9 or rax,r11 cmovz r9,rbx cmovz r11,rbp cmovz rbx,r8 cmovz rbp,r10 mov rax,r9 or rax,r11 bsr rcx,rax lea rcx,QWORD PTR[1+rcx] cmovz r9,r8 cmovz r11,r10 cmovz rcx,rax neg rcx shld r9,rbx,cl shld r11,rbp,cl mov eax,07FFFFFFFh and r8,rax and r10,rax not rax and r9,rax and r11,rax or r8,r9 or r10,r11 jmp __inner_loop_31_256 ifdef __SGX_LVI_HARDENING__ pop rdx lfence jmp rdx ud2 else DB 0F3h,0C3h endif __ab_approximation_31_256 ENDP ALIGN 32 __inner_loop_31_256 PROC PRIVATE DB 243,15,30,250 mov rcx,07FFFFFFF80000000h mov r13,0800000007FFFFFFFh mov r15,07FFFFFFF7FFFFFFFh $L$oop_31_256:: cmp r8,r10 mov rax,r8 mov rbx,r10 mov rbp,rcx mov r14,r13 cmovb r8,r10 cmovb r10,rax cmovb rcx,r13 cmovb r13,rbp sub r8,r10 sub rcx,r13 add rcx,r15 test rax,1 cmovz r8,rax cmovz r10,rbx cmovz rcx,rbp cmovz r13,r14 shr r8,1 add r13,r13 sub r13,r15 sub edx,1 jnz $L$oop_31_256 shr r15,32 mov edx,ecx mov r12d,r13d shr rcx,32 shr r13,32 sub rdx,r15 sub rcx,r15 sub r12,r15 sub r13,r15 ifdef __SGX_LVI_HARDENING__ pop r8 lfence jmp r8 ud2 else DB 0F3h,0C3h endif __inner_loop_31_256 ENDP ALIGN 32 __inner_loop_62_256 PROC PRIVATE DB 243,15,30,250 mov r15d,edx mov 
rdx,1 xor rcx,rcx xor r12,r12 mov r13,rdx mov r14,rdx $L$oop_62_256:: xor rax,rax test r8,r14 mov rbx,r10 cmovnz rax,r10 sub rbx,r8 mov rbp,r8 sub r8,rax cmovc r8,rbx cmovc r10,rbp mov rax,rdx cmovc rdx,r12 cmovc r12,rax mov rbx,rcx cmovc rcx,r13 cmovc r13,rbx xor rax,rax xor rbx,rbx shr r8,1 test rbp,r14 cmovnz rax,r12 cmovnz rbx,r13 add r12,r12 add r13,r13 sub rdx,rax sub rcx,rbx sub r15d,1 jnz $L$oop_62_256 ifdef __SGX_LVI_HARDENING__ pop r8 lfence jmp r8 ud2 else DB 0F3h,0C3h endif __inner_loop_62_256 ENDP .text$ ENDS .pdata SEGMENT READONLY ALIGN(4) ALIGN 4 DD imagerel $L$SEH_begin_ct_inverse_mod_256 DD imagerel $L$SEH_body_ct_inverse_mod_256 DD imagerel $L$SEH_info_ct_inverse_mod_256_prologue DD imagerel $L$SEH_body_ct_inverse_mod_256 DD imagerel $L$SEH_epilogue_ct_inverse_mod_256 DD imagerel $L$SEH_info_ct_inverse_mod_256_body DD imagerel $L$SEH_epilogue_ct_inverse_mod_256 DD imagerel $L$SEH_end_ct_inverse_mod_256 DD imagerel $L$SEH_info_ct_inverse_mod_256_epilogue .pdata ENDS .xdata SEGMENT READONLY ALIGN(8) ALIGN 8 $L$SEH_info_ct_inverse_mod_256_prologue:: DB 1,0,5,00bh DB 0,074h,1,0 DB 0,064h,2,0 DB 0,0b3h DB 0,0 DD 0,0 $L$SEH_info_ct_inverse_mod_256_body:: DB 1,0,18,0 DB 000h,0f4h,086h,000h DB 000h,0e4h,087h,000h DB 000h,0d4h,088h,000h DB 000h,0c4h,089h,000h DB 000h,034h,08ah,000h DB 000h,054h,08bh,000h DB 000h,074h,08dh,000h DB 000h,064h,08eh,000h DB 000h,001h,08ch,000h DB 000h,000h,000h,000h DB 000h,000h,000h,000h $L$SEH_info_ct_inverse_mod_256_epilogue:: DB 1,0,4,0 DB 000h,074h,001h,000h DB 000h,064h,002h,000h DB 000h,000h,000h,000h .xdata ENDS END ================================================ FILE: build/win64/ct_inverse_mod_384-armv8.asm ================================================ GBLA __SIZEOF_POINTER__ __SIZEOF_POINTER__ SETA 64/8 AREA |.text|,CODE,ALIGN=8,ARM64 EXPORT |ct_inverse_mod_384|[FUNC] ALIGN 32 |ct_inverse_mod_384| PROC hint #25 stp x29, x30, [sp,#-16*__SIZEOF_POINTER__]! 
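; hint #25 here and hint #29 in the epilogue encode PACIASP/AUTIASP:
; return-address signing on cores with pointer authentication, plain
; NOPs elsewhere. The prologue saves x19-x28, carves out just over 1KB
; of stack, and derives a 512-byte-aligned work area for the running
; |a|,|b| values and Bezout-coefficient vectors -- under
; __CHERI_PURE_CAPABILITY__ via alignd/scbnds on a capability rather
; than by masking the address.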
add x29, sp, #0 stp x19, x20, [sp,#2*__SIZEOF_POINTER__] stp x21, x22, [sp,#4*__SIZEOF_POINTER__] stp x23, x24, [sp,#6*__SIZEOF_POINTER__] stp x25, x26, [sp,#8*__SIZEOF_POINTER__] stp x27, x28, [sp,#10*__SIZEOF_POINTER__] sub sp, sp, #1056 ldp x22, x4, [x1,#8*0] ldp x5, x6, [x1,#8*2] ldp x7, x8, [x1,#8*4] if :def: __CHERI_PURE_CAPABILITY__ add x1,sp,#32+511 alignd c1,c1,#9 scbnds c1,c1,#512 else add x1, sp, #32+511 and x1, x1, #-512 endif stp x0, x3, [sp] ldp x9, x10, [x2,#8*0] ldp x11, x12, [x2,#8*2] ldp x13, x14, [x2,#8*4] stp x22, x4, [x1,#8*0] stp x5, x6, [x1,#8*2] stp x7, x8, [x1,#8*4] stp x9, x10, [x1,#8*6] stp x11, x12, [x1,#8*8] stp x13, x14, [x1,#8*10] mov x2, #62 bl |$Lab_approximation_62_loaded| eor x0, x1, #256 if :def: __CHERI_PURE_CAPABILITY__ scvalue c0,c1,x0 endif bl __smul_384_n_shift_by_62 str x15,[x0,#8*12] mov x15, x17 mov x16, x19 add x0,x0,#8*6 bl __smul_384_n_shift_by_62 str x15, [x0,#8*14] eor x1, x1, #256 if :def: __CHERI_PURE_CAPABILITY__ scvalue c1,c0,x1 endif mov x2, #62 bl __ab_approximation_62 eor x0, x1, #256 if :def: __CHERI_PURE_CAPABILITY__ scvalue c0,c1,x0 endif bl __smul_384_n_shift_by_62 mov x20, x15 mov x21, x16 mov x15, x17 mov x16, x19 add x0,x0,#8*6 bl __smul_384_n_shift_by_62 ldr x7, [x1,#8*12] ldr x8, [x1,#8*20] mul x3, x20, x7 smulh x4, x20, x7 mul x5, x21, x8 smulh x6, x21, x8 adds x3, x3, x5 adc x4, x4, x6 stp x3, x4, [x0,#8*6] asr x5, x4, #63 stp x5, x5, [x0,#8*8] stp x5, x5, [x0,#8*10] mul x3, x15, x7 smulh x4, x15, x7 mul x5, x16, x8 smulh x6, x16, x8 adds x3, x3, x5 adc x4, x4, x6 stp x3, x4, [x0,#8*14] asr x5, x4, #63 stp x5, x5, [x0,#8*16] stp x5, x5, [x0,#8*18] eor x1, x1, #256 if :def: __CHERI_PURE_CAPABILITY__ scvalue c1,c0,x1 endif mov x2, #62 bl __ab_approximation_62 eor x0, x1, #256 if :def: __CHERI_PURE_CAPABILITY__ scvalue c0,c1,x0 endif bl __smul_384_n_shift_by_62 mov x20, x15 mov x21, x16 mov x15, x17 mov x16, x19 add x0,x0,#8*6 bl __smul_384_n_shift_by_62 add x0,x0,#8*6 bl __smul_384x63 mov x20, x15 mov x21, x16 add x0,x0,#8*8 bl __smul_384x63 eor x1, x1, #256 if :def: __CHERI_PURE_CAPABILITY__ scvalue c1,c0,x1 endif mov x2, #62 bl __ab_approximation_62 eor x0, x1, #256 if :def: __CHERI_PURE_CAPABILITY__ scvalue c0,c1,x0 endif bl __smul_384_n_shift_by_62 mov x20, x15 mov x21, x16 mov x15, x17 mov x16, x19 add x0,x0,#8*6 bl __smul_384_n_shift_by_62 add x0,x0,#8*6 bl __smul_384x63 mov x20, x15 mov x21, x16 add x0,x0,#8*8 bl __smul_384x63 eor x1, x1, #256 if :def: __CHERI_PURE_CAPABILITY__ scvalue c1,c0,x1 endif mov x2, #62 bl __ab_approximation_62 eor x0, x1, #256 if :def: __CHERI_PURE_CAPABILITY__ scvalue c0,c1,x0 endif bl __smul_384_n_shift_by_62 mov x20, x15 mov x21, x16 mov x15, x17 mov x16, x19 add x0,x0,#8*6 bl __smul_384_n_shift_by_62 add x0,x0,#8*6 bl __smul_384x63 mov x20, x15 mov x21, x16 add x0,x0,#8*8 bl __smul_384x63 eor x1, x1, #256 if :def: __CHERI_PURE_CAPABILITY__ scvalue c1,c0,x1 endif mov x2, #62 bl __ab_approximation_62 eor x0, x1, #256 if :def: __CHERI_PURE_CAPABILITY__ scvalue c0,c1,x0 endif bl __smul_384_n_shift_by_62 mov x20, x15 mov x21, x16 mov x15, x17 mov x16, x19 add x0,x0,#8*6 bl __smul_384_n_shift_by_62 add x0,x0,#8*6 bl __smul_384x63 asr x27, x27, #63 str x27, [x0,#8*6] mov x20, x15 mov x21, x16 add x0,x0,#8*8 bl __smul_384x63 asr x27, x27, #63 stp x27, x27, [x0,#8*6] stp x27, x27, [x0,#8*8] stp x27, x27, [x0,#8*10] eor x1, x1, #256 if :def: __CHERI_PURE_CAPABILITY__ scvalue c1,c0,x1 endif mov x2, #62 bl __ab_approximation_62 eor x0, x1, #256 if :def: __CHERI_PURE_CAPABILITY__ scvalue c0,c1,x0 
endif bl __smul_384_n_shift_by_62 mov x20, x15 mov x21, x16 mov x15, x17 mov x16, x19 add x0,x0,#8*6 bl __smul_384_n_shift_by_62 add x0,x0,#8*6 bl __smul_384x63 adc x25, x25, x26 str x25, [x0,#8*6] mov x20, x15 mov x21, x16 add x0,x0,#8*8 bl __smul_384x63 bl __smul_768x63_tail eor x1, x1, #256 if :def: __CHERI_PURE_CAPABILITY__ scvalue c1,c0,x1 endif mov x2, #62 bl __ab_approximation_62 eor x0, x1, #256 if :def: __CHERI_PURE_CAPABILITY__ scvalue c0,c1,x0 endif bl __smul_384_n_shift_by_62 mov x20, x15 mov x21, x16 mov x15, x17 mov x16, x19 add x0,x0,#8*6 bl __smul_384_n_shift_by_62 add x0,x0,#8*6 bl __smul_384x63 adc x25, x25, x26 str x25, [x0,#8*6] mov x20, x15 mov x21, x16 add x0,x0,#8*8 bl __smul_384x63 bl __smul_768x63_tail eor x1, x1, #256 if :def: __CHERI_PURE_CAPABILITY__ scvalue c1,c0,x1 endif mov x2, #62 bl __ab_approximation_62 eor x0, x1, #256 if :def: __CHERI_PURE_CAPABILITY__ scvalue c0,c1,x0 endif bl __smul_384_n_shift_by_62 mov x20, x15 mov x21, x16 mov x15, x17 mov x16, x19 add x0,x0,#8*6 bl __smul_384_n_shift_by_62 add x0,x0,#8*6 bl __smul_384x63 adc x25, x25, x26 str x25, [x0,#8*6] mov x20, x15 mov x21, x16 add x0,x0,#8*8 bl __smul_384x63 bl __smul_768x63_tail eor x1, x1, #256 if :def: __CHERI_PURE_CAPABILITY__ scvalue c1,c0,x1 endif mov x2, #62 bl __ab_approximation_62 eor x0, x1, #256 if :def: __CHERI_PURE_CAPABILITY__ scvalue c0,c1,x0 endif bl __smul_384_n_shift_by_62 mov x20, x15 mov x21, x16 mov x15, x17 mov x16, x19 add x0,x0,#8*6 bl __smul_384_n_shift_by_62 add x0,x0,#8*6 bl __smul_384x63 adc x25, x25, x26 str x25, [x0,#8*6] mov x20, x15 mov x21, x16 add x0,x0,#8*8 bl __smul_384x63 bl __smul_768x63_tail eor x1, x1, #256 if :def: __CHERI_PURE_CAPABILITY__ scvalue c1,c0,x1 endif mov x2, #62 bl __ab_approximation_62 eor x0, x1, #256 if :def: __CHERI_PURE_CAPABILITY__ scvalue c0,c1,x0 endif bl __smul_384_n_shift_by_62 mov x20, x15 mov x21, x16 mov x15, x17 mov x16, x19 add x0,x0,#8*6 bl __smul_384_n_shift_by_62 add x0,x0,#8*6 bl __smul_384x63 adc x25, x25, x26 str x25, [x0,#8*6] mov x20, x15 mov x21, x16 add x0,x0,#8*8 bl __smul_384x63 bl __smul_768x63_tail eor x1, x1, #256 if :def: __CHERI_PURE_CAPABILITY__ scvalue c1,c0,x1 endif mov x2, #62 ldp x3, x8, [x1,#8*0] ldp x9, x14, [x1,#8*6] bl __inner_loop_62 eor x0, x1, #256 if :def: __CHERI_PURE_CAPABILITY__ scvalue c0,c1,x0 endif str x3, [x0,#8*0] str x9, [x0,#8*6] mov x20, x15 mov x21, x16 mov x15, x17 mov x16, x19 add x0,x0,#8*12 bl __smul_384x63 adc x25, x25, x26 str x25, [x0,#8*6] mov x20, x15 mov x21, x16 add x0,x0,#8*8 bl __smul_384x63 bl __smul_768x63_tail eor x1, x1, #256 if :def: __CHERI_PURE_CAPABILITY__ scvalue c1,c0,x1 endif mov x2, #24 ldr x3, [x1,#8*0] eor x8, x8, x8 ldr x9, [x1,#8*6] eor x14, x14, x14 bl __inner_loop_62 mov x20, x17 mov x21, x19 ldp x0, x15, [sp] bl __smul_384x63 bl __smul_768x63_tail ldr x30, [x29,#__SIZEOF_POINTER__] smulh x23, x8, x21 adc x26, x26, x28 ldp x9, x10, [x15,#8*0] add x23, x23, x26 ldp x11, x12, [x15,#8*2] asr x22, x23, #63 ldp x13, x14, [x15,#8*4] and x26, x9, x22 and x27, x10, x22 adds x3, x3, x26 and x28, x11, x22 adcs x4, x4, x27 and x2, x12, x22 adcs x5, x5, x28 and x26, x13, x22 adcs x6, x6, x2 and x27, x14, x22 adcs x7, x7, x26 adcs x8, x25, x27 adc x23, x23, xzr neg x22, x23 orr x23, x23, x22 asr x22, x22, #63 and x9, x9, x23 and x10, x10, x23 and x11, x11, x23 and x12, x12, x23 and x13, x13, x23 and x14, x14, x23 eor x9, x9, x22 eor x10, x10, x22 adds x9, x9, x22, lsr#63 eor x11, x11, x22 adcs x10, x10, xzr eor x12, x12, x22 adcs x11, x11, xzr eor x13, x13, x22 adcs 
x12, x12, xzr eor x14, x14, x22 adcs x13, x13, xzr adc x14, x14, xzr adds x3, x3, x9 adcs x4, x4, x10 adcs x5, x5, x11 adcs x6, x6, x12 stp x3, x4, [x0,#8*6] adcs x7, x7, x13 stp x5, x6, [x0,#8*8] adc x8, x8, x14 stp x7, x8, [x0,#8*10] add sp, sp, #1056 ldp x19, x20, [x29,#2*__SIZEOF_POINTER__] ldp x21, x22, [x29,#4*__SIZEOF_POINTER__] ldp x23, x24, [x29,#6*__SIZEOF_POINTER__] ldp x25, x26, [x29,#8*__SIZEOF_POINTER__] ldp x27, x28, [x29,#10*__SIZEOF_POINTER__] ldr x29, [sp],#16*__SIZEOF_POINTER__ hint #29 ret ENDP ALIGN 32 |__smul_384x63| PROC ldp x3, x4, [x1,#8*0+96] asr x17, x20, #63 ldp x5, x6, [x1,#8*2+96] eor x20, x20, x17 ldp x7, x8, [x1,#8*4+96] eor x3, x3, x17 ldr x25, [x1,#8*6+96] sub x20, x20, x17 eor x4, x4, x17 adds x3, x3, x17, lsr#63 eor x5, x5, x17 adcs x4, x4, xzr eor x6, x6, x17 adcs x5, x5, xzr eor x7, x7, x17 adcs x6, x6, xzr umulh x22, x3, x20 eor x8, x8, x17 umulh x23, x4, x20 adcs x7, x7, xzr umulh x24, x5, x20 eor x25, x25, x17 mul x3, x3, x20 adcs x8, x8, xzr mul x4, x4, x20 adcs x25, x25, xzr cmp x20, #0 mul x5, x5, x20 cselne x25,x25,xzr adds x4, x4, x22 umulh x22, x6, x20 adcs x5, x5, x23 umulh x23, x7, x20 mul x6, x6, x20 mul x7, x7, x20 adcs x6, x6, x24 mul x27,x8, x20 adcs x7, x7, x22 adcs x27,x27,x23 adc x2, xzr, xzr ldp x9, x10, [x1,#8*0+160] asr x17, x21, #63 ldp x11, x12, [x1,#8*2+160] eor x21, x21, x17 ldp x13, x14, [x1,#8*4+160] eor x9, x9, x17 ldr x26, [x1,#8*6+160] sub x21, x21, x17 eor x10, x10, x17 adds x9, x9, x17, lsr#63 eor x11, x11, x17 adcs x10, x10, xzr eor x12, x12, x17 adcs x11, x11, xzr eor x13, x13, x17 adcs x12, x12, xzr umulh x22, x9, x21 eor x14, x14, x17 umulh x23, x10, x21 adcs x13, x13, xzr umulh x24, x11, x21 eor x26, x26, x17 mul x9, x9, x21 adcs x14, x14, xzr mul x10, x10, x21 adcs x26, x26, xzr adc x19, xzr, xzr cmp x21, #0 mul x11, x11, x21 cselne x26,x26,xzr adds x10, x10, x22 umulh x22, x12, x21 adcs x11, x11, x23 umulh x23, x13, x21 mul x12, x12, x21 mul x13, x13, x21 adcs x12, x12, x24 mul x28,x14, x21 adcs x13, x13, x22 adcs x28,x28,x23 adc x2, x2, xzr adds x3, x3, x9 adcs x4, x4, x10 adcs x5, x5, x11 adcs x6, x6, x12 stp x3, x4, [x0,#8*0] adcs x7, x7, x13 stp x5, x6, [x0,#8*2] adcs x27, x27, x28 stp x7, x27, [x0,#8*4] ret ENDP ALIGN 32 |__smul_768x63_tail| PROC umulh x27, x8, x20 ldr x4, [x1,#8*27] adc x2, x2, xzr ldp x5, x6, [x1,#8*28] and x25, x25, x20 ldp x7, x8, [x1,#8*30] sub x27, x27, x25 umulh x14, x14, x21 eor x4, x4, x17 eor x5, x5, x17 eor x6, x6, x17 adds x4, x4, x19 eor x7, x7, x17 adcs x5, x5, xzr eor x8, x8, x17 adcs x6, x6, xzr umulh x22, x26, x21 adcs x7, x7, xzr umulh x23, x4, x21 adc x8, x8, xzr umulh x24, x5, x21 add x14, x14, x2 umulh x25, x6, x21 asr x28, x27, #63 umulh x2, x7, x21 mul x3, x26, x21 mul x4, x4, x21 mul x5, x5, x21 adds x3, x3, x14 mul x6, x6, x21 adcs x4, x4, x22 mul x7, x7, x21 adcs x5, x5, x23 mul x22, x8, x21 adcs x6, x6, x24 adcs x7, x7, x25 adcs x25, x22, x2 adc x26, xzr, xzr adds x3, x3, x27 adcs x4, x4, x28 adcs x5, x5, x28 adcs x6, x6, x28 stp x3, x4, [x0,#8*6] adcs x7, x7, x28 stp x5, x6, [x0,#8*8] adcs x25, x25, x28 stp x7, x25, [x0,#8*10] ret ENDP ALIGN 32 |__smul_384_n_shift_by_62| PROC ldp x3, x4, [x1,#8*0+0] asr x28, x15, #63 ldp x5, x6, [x1,#8*2+0] eor x2, x15, x28 ldp x7, x8, [x1,#8*4+0] eor x3, x3, x28 sub x2, x2, x28 eor x4, x4, x28 adds x3, x3, x28, lsr#63 eor x5, x5, x28 adcs x4, x4, xzr eor x6, x6, x28 adcs x5, x5, xzr eor x7, x7, x28 umulh x22, x3, x2 adcs x6, x6, xzr umulh x23, x4, x2 eor x8, x8, x28 mul x3, x3, x2 adcs x7, x7, xzr mul x4, x4, x2 adc x8, x8, xzr 
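; editorial note, hedged -- inferred from the routine's structure, since the
; generated source carries no comments: __smul_384_n_shift_by_62 forms
; (a*f + b*g) >> 62 for the signed ~63-bit factors held in x15/x16. Each
; 384-bit operand is conditionally negated to its absolute value (the
; eor/adds chains), multiplied by |f| resp. |g| with mul/umulh, the two
; products are summed, shifted right by 62 via the extr chain further down,
; and the result's sign is folded back into x15/x16 before returning.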
umulh x24, x5, x2 and x28, x28, x2 umulh x25, x6, x2 adds x4, x4, x22 mul x5, x5, x2 umulh x22, x7, x2 neg x28, x28 mul x6, x6, x2 adcs x5, x5, x23 umulh x23, x8, x2 mul x7, x7, x2 adcs x6, x6, x24 mul x8, x8, x2 adcs x7, x7, x25 adcs x8, x8, x22 adc x27, x23, x28 ldp x9, x10, [x1,#8*0+48] asr x28, x16, #63 ldp x11, x12, [x1,#8*2+48] eor x2, x16, x28 ldp x13, x14, [x1,#8*4+48] eor x9, x9, x28 sub x2, x2, x28 eor x10, x10, x28 adds x9, x9, x28, lsr#63 eor x11, x11, x28 adcs x10, x10, xzr eor x12, x12, x28 adcs x11, x11, xzr eor x13, x13, x28 umulh x22, x9, x2 adcs x12, x12, xzr umulh x23, x10, x2 eor x14, x14, x28 mul x9, x9, x2 adcs x13, x13, xzr mul x10, x10, x2 adc x14, x14, xzr umulh x24, x11, x2 and x28, x28, x2 umulh x25, x12, x2 adds x10, x10, x22 mul x11, x11, x2 umulh x22, x13, x2 neg x28, x28 mul x12, x12, x2 adcs x11, x11, x23 umulh x23, x14, x2 mul x13, x13, x2 adcs x12, x12, x24 mul x14, x14, x2 adcs x13, x13, x25 adcs x14, x14, x22 adc x28, x23, x28 adds x3, x3, x9 adcs x4, x4, x10 adcs x5, x5, x11 adcs x6, x6, x12 adcs x7, x7, x13 adcs x8, x8, x14 adc x9, x27, x28 extr x3, x4, x3, #62 extr x4, x5, x4, #62 extr x5, x6, x5, #62 asr x28, x9, #63 extr x6, x7, x6, #62 extr x7, x8, x7, #62 extr x8, x9, x8, #62 eor x3, x3, x28 eor x4, x4, x28 adds x3, x3, x28, lsr#63 eor x5, x5, x28 adcs x4, x4, xzr eor x6, x6, x28 adcs x5, x5, xzr eor x7, x7, x28 adcs x6, x6, xzr eor x8, x8, x28 stp x3, x4, [x0,#8*0] adcs x7, x7, xzr stp x5, x6, [x0,#8*2] adc x8, x8, xzr stp x7, x8, [x0,#8*4] eor x15, x15, x28 eor x16, x16, x28 sub x15, x15, x28 sub x16, x16, x28 ret ENDP ALIGN 16 |__ab_approximation_62| PROC ldp x7, x8, [x1,#8*4] ldp x13, x14, [x1,#8*10] ldp x5, x6, [x1,#8*2] ldp x11, x12, [x1,#8*8] |$Lab_approximation_62_loaded| orr x22, x8, x14 cmp x22, #0 cselne x8,x8,x7 cselne x14,x14,x13 cselne x7,x7,x6 orr x22, x8, x14 cselne x13,x13,x12 ldp x3, x4, [x1,#8*0] ldp x9, x10, [x1,#8*6] cmp x22, #0 cselne x8,x8,x7 cselne x14,x14,x13 cselne x7,x7,x5 orr x22, x8, x14 cselne x13,x13,x11 cmp x22, #0 cselne x8,x8,x7 cselne x14,x14,x13 cselne x7,x7,x4 orr x22, x8, x14 cselne x13,x13,x10 clz x22, x22 cmp x22, #64 cselne x22,x22,xzr cselne x8,x8,x7 cselne x14,x14,x13 neg x23, x22 lslv x8, x8, x22 lslv x14, x14, x22 lsrv x7, x7, x23 lsrv x13, x13, x23 and x7, x7, x23, asr#6 and x13, x13, x23, asr#6 orr x8, x8, x7 orr x14, x14, x13 b __inner_loop_62 ret ENDP ALIGN 16 |__inner_loop_62| PROC mov x15, #1 mov x16, #0 mov x17, #0 mov x19, #1 |$Loop_62| sbfx x28, x3, #0, #1 sub x2, x2, #1 subs x24, x9, x3 and x22, x9, x28 sbc x25, x14, x8 and x23, x14, x28 subs x26, x3, x22 mov x22, x15 sbcs x27, x8, x23 mov x23, x16 cselhs x9,x9,x3 cselhs x14,x14,x8 cselhs x3,x26,x24 cselhs x8,x27,x25 cselhs x15,x15,x17 cselhs x17,x17,x22 cselhs x16,x16,x19 cselhs x19,x19,x23 extr x3, x8, x3, #1 lsr x8, x8, #1 and x22, x17, x28 and x23, x19, x28 add x17, x17, x17 add x19, x19, x19 sub x15, x15, x22 sub x16, x16, x23 cbnz x2, |$Loop_62| ret ENDP END ================================================ FILE: build/win64/ct_is_square_mod_384-armv8.asm ================================================ GBLA __SIZEOF_POINTER__ __SIZEOF_POINTER__ SETA 64/8 AREA |.text|,CODE,ALIGN=8,ARM64 EXPORT |ct_is_square_mod_384|[FUNC] ALIGN 32 |ct_is_square_mod_384| PROC hint #25 stp x29, x30, [sp,#-16*__SIZEOF_POINTER__]! 
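; editorial note, hedged -- semantics inferred from the code, not stated in
; the file: ct_is_square_mod_384(a, m) returns 1 iff the 384-bit input a is
; a quadratic residue mod m, computed as a constant-time, branch-free
; Jacobi-symbol walk: 24 outer rounds of 30 divsteps each
; (|$Loop_is_square|, __ab_approximation_30, __inner_loop_30) plus a
; 48-step __inner_loop_48 tail -- 24*30 + 48 = 768 = 2*384 steps in all --
; with the symbol's sign bit accumulated in x2. The hint #25 / hint #29 in
; prologue and epilogue are PACIASP/AUTIASP return-address signing.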
add x29, sp, #0 stp x19, x20, [sp,#2*__SIZEOF_POINTER__] stp x21, x22, [sp,#4*__SIZEOF_POINTER__] stp x23, x24, [sp,#6*__SIZEOF_POINTER__] stp x25, x26, [sp,#8*__SIZEOF_POINTER__] stp x27, x28, [sp,#10*__SIZEOF_POINTER__] sub sp, sp, #512 ldp x3, x4, [x0,#8*0] ldp x5, x6, [x0,#8*2] ldp x7, x8, [x0,#8*4] add x0, sp, #255 and x0, x0, #-256 if :def: __CHERI_PURE_CAPABILITY__ scvalue c0,csp,x0 endif ldp x9, x10, [x1,#8*0] ldp x11, x12, [x1,#8*2] ldp x13, x14, [x1,#8*4] stp x3, x4, [x0,#8*6] stp x5, x6, [x0,#8*8] stp x7, x8, [x0,#8*10] stp x9, x10, [x0,#8*0] stp x11, x12, [x0,#8*2] stp x13, x14, [x0,#8*4] eor x2, x2, x2 mov x15, #24 b |$Loop_is_square| ALIGN 16 |$Loop_is_square| bl __ab_approximation_30 sub x15, x15, #1 eor x1, x0, #128 if :def: __CHERI_PURE_CAPABILITY__ scvalue c1,csp,x1 endif bl __smul_384_n_shift_by_30 mov x19, x16 mov x20, x17 add x1,x1,#8*6 bl __smul_384_n_shift_by_30 ldp x9, x10, [x1,#-8*6] eor x0, x0, #128 if :def: __CHERI_PURE_CAPABILITY__ scvalue c0,csp,x0 endif and x27, x27, x9 add x2, x2, x27, lsr#1 cbnz x15, |$Loop_is_square| mov x15, #48 bl __inner_loop_48 ldr x30, [x29,#__SIZEOF_POINTER__] and x0, x2, #1 eor x0, x0, #1 add sp, sp, #512 ldp x19, x20, [x29,#2*__SIZEOF_POINTER__] ldp x21, x22, [x29,#4*__SIZEOF_POINTER__] ldp x23, x24, [x29,#6*__SIZEOF_POINTER__] ldp x25, x26, [x29,#8*__SIZEOF_POINTER__] ldp x27, x28, [x29,#10*__SIZEOF_POINTER__] ldr x29, [sp],#16*__SIZEOF_POINTER__ hint #29 ret ENDP ALIGN 32 |__smul_384_n_shift_by_30| PROC ldp x3, x4, [x0,#8*0+0] asr x27, x20, #63 ldp x5, x6, [x0,#8*2+0] eor x20, x20, x27 ldp x7, x8, [x0,#8*4+0] eor x3, x3, x27 sub x20, x20, x27 eor x4, x4, x27 adds x3, x3, x27, lsr#63 eor x5, x5, x27 adcs x4, x4, xzr eor x6, x6, x27 adcs x5, x5, xzr eor x7, x7, x27 umulh x21, x3, x20 adcs x6, x6, xzr umulh x22, x4, x20 eor x8, x8, x27 umulh x23, x5, x20 adcs x7, x7, xzr umulh x24, x6, x20 adc x8, x8, xzr umulh x25, x7, x20 and x28, x20, x27 umulh x26, x8, x20 neg x28, x28 mul x3, x3, x20 mul x4, x4, x20 mul x5, x5, x20 adds x4, x4, x21 mul x6, x6, x20 adcs x5, x5, x22 mul x7, x7, x20 adcs x6, x6, x23 mul x8, x8, x20 adcs x7, x7, x24 adcs x8, x8 ,x25 adc x26, x26, x28 ldp x9, x10, [x0,#8*0+48] asr x27, x19, #63 ldp x11, x12, [x0,#8*2+48] eor x19, x19, x27 ldp x13, x14, [x0,#8*4+48] eor x9, x9, x27 sub x19, x19, x27 eor x10, x10, x27 adds x9, x9, x27, lsr#63 eor x11, x11, x27 adcs x10, x10, xzr eor x12, x12, x27 adcs x11, x11, xzr eor x13, x13, x27 umulh x21, x9, x19 adcs x12, x12, xzr umulh x22, x10, x19 eor x14, x14, x27 umulh x23, x11, x19 adcs x13, x13, xzr umulh x24, x12, x19 adc x14, x14, xzr umulh x25, x13, x19 and x28, x19, x27 umulh x27, x14, x19 neg x28, x28 mul x9, x9, x19 mul x10, x10, x19 mul x11, x11, x19 adds x10, x10, x21 mul x12, x12, x19 adcs x11, x11, x22 mul x13, x13, x19 adcs x12, x12, x23 mul x14, x14, x19 adcs x13, x13, x24 adcs x14, x14 ,x25 adc x27, x27, x28 adds x3, x3, x9 adcs x4, x4, x10 adcs x5, x5, x11 adcs x6, x6, x12 adcs x7, x7, x13 adcs x8, x8, x14 adc x9, x26, x27 extr x3, x4, x3, #30 extr x4, x5, x4, #30 extr x5, x6, x5, #30 asr x27, x9, #63 extr x6, x7, x6, #30 extr x7, x8, x7, #30 extr x8, x9, x8, #30 eor x3, x3, x27 eor x4, x4, x27 adds x3, x3, x27, lsr#63 eor x5, x5, x27 adcs x4, x4, xzr eor x6, x6, x27 adcs x5, x5, xzr eor x7, x7, x27 adcs x6, x6, xzr eor x8, x8, x27 stp x3, x4, [x1,#8*0] adcs x7, x7, xzr stp x5, x6, [x1,#8*2] adc x8, x8, xzr stp x7, x8, [x1,#8*4] ret ENDP ALIGN 16 |__ab_approximation_30| PROC ldp x13, x14, [x0,#8*4] ldp x11, x12, [x0,#8*2] orr x21, x8, x14 cmp x21, #0 cselne 
x8,x8,x7 cselne x14,x14,x13 cselne x7,x7,x6 orr x21, x8, x14 cselne x13,x13,x12 cmp x21, #0 cselne x8,x8,x7 cselne x14,x14,x13 cselne x7,x7,x5 orr x21, x8, x14 cselne x13,x13,x11 cmp x21, #0 cselne x8,x8,x7 cselne x14,x14,x13 cselne x7,x7,x4 orr x21, x8, x14 cselne x13,x13,x10 cmp x21, #0 cselne x8,x8,x7 cselne x14,x14,x13 cselne x7,x7,x3 orr x21, x8, x14 cselne x13,x13,x9 clz x21, x21 cmp x21, #64 cselne x21,x21,xzr cselne x8,x8,x7 cselne x14,x14,x13 neg x22, x21 lslv x8, x8, x21 lslv x14, x14, x21 lsrv x7, x7, x22 lsrv x13, x13, x22 and x7, x7, x22, asr#6 and x13, x13, x22, asr#6 orr x8, x8, x7 orr x14, x14, x13 bfxil x8, x3, #0, #32 bfxil x14, x9, #0, #32 b __inner_loop_30 ret ENDP ALIGN 16 |__inner_loop_30| PROC mov x28, #30 mov x17, #0x7FFFFFFF80000000 mov x20, #0x800000007FFFFFFF mov x27,#0x7FFFFFFF7FFFFFFF |$Loop_30| sbfx x24, x8, #0, #1 and x25, x8, x14 sub x28, x28, #1 and x21, x14, x24 sub x22, x14, x8 subs x23, x8, x21 add x25, x2, x25, lsr#1 mov x21, x20 cselhs x14,x14,x8 cselhs x8,x23,x22 cselhs x20,x20,x17 cselhs x17,x17,x21 cselhs x2,x2,x25 lsr x8, x8, #1 and x21, x20, x24 and x22, x27, x24 add x23, x14, #2 sub x17, x17, x21 add x20, x20, x20 add x2, x2, x23, lsr#2 add x17, x17, x22 sub x20, x20, x27 cbnz x28, |$Loop_30| mov x27, #0x7FFFFFFF ubfx x16, x17, #0, #32 ubfx x17, x17, #32, #32 ubfx x19, x20, #0, #32 ubfx x20, x20, #32, #32 sub x16, x16, x27 sub x17, x17, x27 sub x19, x19, x27 sub x20, x20, x27 ret ENDP ALIGN 16 |__inner_loop_48| PROC |$Loop_48| sbfx x24, x3, #0, #1 and x25, x3, x9 sub x15, x15, #1 and x21, x9, x24 sub x22, x9, x3 subs x23, x3, x21 add x25, x2, x25, lsr#1 cselhs x9,x9,x3 cselhs x3,x23,x22 cselhs x2,x2,x25 add x23, x9, #2 lsr x3, x3, #1 add x2, x2, x23, lsr#2 cbnz x15, |$Loop_48| ret ENDP END ================================================ FILE: build/win64/ct_is_square_mod_384-x86_64.asm ================================================ OPTION DOTNAME .text$ SEGMENT ALIGN(256) 'CODE' PUBLIC ct_is_square_mod_384 ALIGN 32 ct_is_square_mod_384 PROC PUBLIC DB 243,15,30,250 mov QWORD PTR[8+rsp],rdi ;WIN64 prologue mov QWORD PTR[16+rsp],rsi mov r11,rsp $L$SEH_begin_ct_is_square_mod_384:: push rbp mov rdi,rcx mov rsi,rdx push rbx push r12 push r13 push r14 push r15 sub rsp,536 $L$SEH_body_ct_is_square_mod_384:: lea rax,QWORD PTR[((24+255))+rsp] and rax,-256 ifdef __SGX_LVI_HARDENING__ lfence endif mov r8,QWORD PTR[rdi] mov r9,QWORD PTR[8+rdi] mov r10,QWORD PTR[16+rdi] mov r11,QWORD PTR[24+rdi] mov r12,QWORD PTR[32+rdi] mov r13,QWORD PTR[40+rdi] mov r14,QWORD PTR[rsi] mov r15,QWORD PTR[8+rsi] mov rbx,QWORD PTR[16+rsi] mov rcx,QWORD PTR[24+rsi] mov rdx,QWORD PTR[32+rsi] mov rdi,QWORD PTR[40+rsi] mov rsi,rax mov QWORD PTR[rax],r8 mov QWORD PTR[8+rax],r9 mov QWORD PTR[16+rax],r10 mov QWORD PTR[24+rax],r11 mov QWORD PTR[32+rax],r12 mov QWORD PTR[40+rax],r13 mov QWORD PTR[48+rax],r14 mov QWORD PTR[56+rax],r15 mov QWORD PTR[64+rax],rbx mov QWORD PTR[72+rax],rcx mov QWORD PTR[80+rax],rdx mov QWORD PTR[88+rax],rdi xor rbp,rbp mov ecx,24 jmp $L$oop_is_square ALIGN 32 $L$oop_is_square:: mov DWORD PTR[16+rsp],ecx call __ab_approximation_30 mov QWORD PTR[rsp],rax mov QWORD PTR[8+rsp],rbx mov rdi,128+8*6 xor rdi,rsi call __smulq_384_n_shift_by_30 mov rdx,QWORD PTR[rsp] mov rcx,QWORD PTR[8+rsp] lea rdi,QWORD PTR[((-48))+rdi] call __smulq_384_n_shift_by_30 mov ecx,DWORD PTR[16+rsp] xor rsi,128 and r14,QWORD PTR[48+rdi] shr r14,1 add rbp,r14 sub ecx,1 jnz $L$oop_is_square mov r9,QWORD PTR[48+rsi] call __inner_loop_48 mov rax,1 and rax,rbp xor rax,1 lea r8,QWORD 
PTR[536+rsp] mov r15,QWORD PTR[r8] mov r14,QWORD PTR[8+r8] mov r13,QWORD PTR[16+r8] mov r12,QWORD PTR[24+r8] mov rbx,QWORD PTR[32+r8] mov rbp,QWORD PTR[40+r8] lea rsp,QWORD PTR[48+r8] $L$SEH_epilogue_ct_is_square_mod_384:: mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue mov rsi,QWORD PTR[16+rsp] ifdef __SGX_LVI_HARDENING__ pop rdx lfence jmp rdx ud2 else DB 0F3h,0C3h endif $L$SEH_end_ct_is_square_mod_384:: ct_is_square_mod_384 ENDP ALIGN 32 __smulq_384_n_shift_by_30 PROC PRIVATE DB 243,15,30,250 mov r8,QWORD PTR[rsi] mov r9,QWORD PTR[8+rsi] mov r10,QWORD PTR[16+rsi] mov r11,QWORD PTR[24+rsi] mov r12,QWORD PTR[32+rsi] mov r13,QWORD PTR[40+rsi] mov rbx,rdx sar rdx,63 xor rax,rax sub rax,rdx xor rbx,rdx add rbx,rax xor r8,rdx xor r9,rdx xor r10,rdx xor r11,rdx xor r12,rdx xor r13,rdx add rax,r8 adc r9,0 adc r10,0 adc r11,0 adc r12,0 adc r13,0 mov r14,rdx and r14,rbx mul rbx mov r8,rax mov rax,r9 mov r9,rdx mul rbx add r9,rax mov rax,r10 adc rdx,0 mov r10,rdx mul rbx add r10,rax mov rax,r11 adc rdx,0 mov r11,rdx mul rbx add r11,rax mov rax,r12 adc rdx,0 mov r12,rdx mul rbx add r12,rax mov rax,r13 adc rdx,0 mov r13,rdx neg r14 mul rbx add r13,rax adc r14,rdx lea rsi,QWORD PTR[48+rsi] mov rdx,rcx mov QWORD PTR[rdi],r8 mov QWORD PTR[8+rdi],r9 mov QWORD PTR[16+rdi],r10 mov QWORD PTR[24+rdi],r11 mov QWORD PTR[32+rdi],r12 mov QWORD PTR[40+rdi],r13 mov r8,QWORD PTR[rsi] mov r9,QWORD PTR[8+rsi] mov r10,QWORD PTR[16+rsi] mov r11,QWORD PTR[24+rsi] mov r12,QWORD PTR[32+rsi] mov r13,QWORD PTR[40+rsi] mov rbx,rdx sar rdx,63 xor rax,rax sub rax,rdx xor rbx,rdx add rbx,rax xor r8,rdx xor r9,rdx xor r10,rdx xor r11,rdx xor r12,rdx xor r13,rdx add rax,r8 adc r9,0 adc r10,0 adc r11,0 adc r12,0 adc r13,0 mov r15,rdx and r15,rbx mul rbx mov r8,rax mov rax,r9 mov r9,rdx mul rbx add r9,rax mov rax,r10 adc rdx,0 mov r10,rdx mul rbx add r10,rax mov rax,r11 adc rdx,0 mov r11,rdx mul rbx add r11,rax mov rax,r12 adc rdx,0 mov r12,rdx mul rbx add r12,rax mov rax,r13 adc rdx,0 mov r13,rdx neg r15 mul rbx add r13,rax adc r15,rdx lea rsi,QWORD PTR[((-48))+rsi] add r8,QWORD PTR[rdi] adc r9,QWORD PTR[8+rdi] adc r10,QWORD PTR[16+rdi] adc r11,QWORD PTR[24+rdi] adc r12,QWORD PTR[32+rdi] adc r13,QWORD PTR[40+rdi] adc r14,r15 shrd r8,r9,30 shrd r9,r10,30 shrd r10,r11,30 shrd r11,r12,30 shrd r12,r13,30 shrd r13,r14,30 sar r14,63 xor rbx,rbx sub rbx,r14 xor r8,r14 xor r9,r14 xor r10,r14 xor r11,r14 xor r12,r14 xor r13,r14 add r8,rbx adc r9,0 adc r10,0 adc r11,0 adc r12,0 adc r13,0 mov QWORD PTR[rdi],r8 mov QWORD PTR[8+rdi],r9 mov QWORD PTR[16+rdi],r10 mov QWORD PTR[24+rdi],r11 mov QWORD PTR[32+rdi],r12 mov QWORD PTR[40+rdi],r13 ifdef __SGX_LVI_HARDENING__ pop rdx lfence jmp rdx ud2 else DB 0F3h,0C3h endif __smulq_384_n_shift_by_30 ENDP ALIGN 32 __ab_approximation_30 PROC PRIVATE DB 243,15,30,250 mov rbx,QWORD PTR[88+rsi] mov r15,QWORD PTR[80+rsi] mov r14,QWORD PTR[72+rsi] mov rax,r13 or rax,rbx cmovz r13,r12 cmovz rbx,r15 cmovz r12,r11 mov r11,QWORD PTR[64+rsi] cmovz r15,r14 mov rax,r13 or rax,rbx cmovz r13,r12 cmovz rbx,r15 cmovz r12,r10 mov r10,QWORD PTR[56+rsi] cmovz r15,r11 mov rax,r13 or rax,rbx cmovz r13,r12 cmovz rbx,r15 cmovz r12,r9 mov r9,QWORD PTR[48+rsi] cmovz r15,r10 mov rax,r13 or rax,rbx cmovz r13,r12 cmovz rbx,r15 cmovz r12,r8 cmovz r15,r9 mov rax,r13 or rax,rbx bsr rcx,rax lea rcx,QWORD PTR[1+rcx] cmovz r13,r8 cmovz rbx,r9 cmovz rcx,rax neg rcx shld r13,r12,cl shld rbx,r15,cl mov rax,0FFFFFFFF00000000h mov r8d,r8d mov r9d,r9d and r13,rax and rbx,rax or r8,r13 or r9,rbx jmp __inner_loop_30 ifdef __SGX_LVI_HARDENING__ pop rdx 
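; editorial note: under __SGX_LVI_HARDENING__ every return in these files is
; rewritten as pop / lfence / jmp-through-register (a load-value-injection
; mitigation, with ud2 stopping straight-line speculation); the non-hardened
; build instead emits DB 0F3h,0C3h, a 2-byte "rep ret". The DB 243,15,30,250
; at each entry point is likewise a raw endbr64 (CET indirect-branch target).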
lfence jmp rdx ud2 else DB 0F3h,0C3h endif __ab_approximation_30 ENDP ALIGN 32 __inner_loop_30 PROC PRIVATE DB 243,15,30,250 mov rbx,07FFFFFFF80000000h mov rcx,0800000007FFFFFFFh lea r15,QWORD PTR[((-1))+rbx] mov edi,30 $L$oop_30:: mov rax,r8 and rax,r9 shr rax,1 cmp r8,r9 mov r10,r8 mov r11,r9 lea rax,QWORD PTR[rbp*1+rax] mov r12,rbx mov r13,rcx mov r14,rbp cmovb r8,r9 cmovb r9,r10 cmovb rbx,rcx cmovb rcx,r12 cmovb rbp,rax sub r8,r9 sub rbx,rcx add rbx,r15 test r10,1 cmovz r8,r10 cmovz r9,r11 cmovz rbx,r12 cmovz rcx,r13 cmovz rbp,r14 lea rax,QWORD PTR[2+r9] shr r8,1 shr rax,2 add rcx,rcx lea rbp,QWORD PTR[rbp*1+rax] sub rcx,r15 sub edi,1 jnz $L$oop_30 shr r15,32 mov eax,ebx shr rbx,32 mov edx,ecx shr rcx,32 sub rax,r15 sub rbx,r15 sub rdx,r15 sub rcx,r15 ifdef __SGX_LVI_HARDENING__ pop r8 lfence jmp r8 ud2 else DB 0F3h,0C3h endif __inner_loop_30 ENDP ALIGN 32 __inner_loop_48 PROC PRIVATE DB 243,15,30,250 mov edi,48 $L$oop_48:: mov rax,r8 and rax,r9 shr rax,1 cmp r8,r9 mov r10,r8 mov r11,r9 lea rax,QWORD PTR[rbp*1+rax] mov r12,rbp cmovb r8,r9 cmovb r9,r10 cmovb rbp,rax sub r8,r9 test r10,1 cmovz r8,r10 cmovz r9,r11 cmovz rbp,r12 lea rax,QWORD PTR[2+r9] shr r8,1 shr rax,2 add rbp,rax sub edi,1 jnz $L$oop_48 ifdef __SGX_LVI_HARDENING__ pop rdx lfence jmp rdx ud2 else DB 0F3h,0C3h endif __inner_loop_48 ENDP .text$ ENDS .pdata SEGMENT READONLY ALIGN(4) ALIGN 4 DD imagerel $L$SEH_begin_ct_is_square_mod_384 DD imagerel $L$SEH_body_ct_is_square_mod_384 DD imagerel $L$SEH_info_ct_is_square_mod_384_prologue DD imagerel $L$SEH_body_ct_is_square_mod_384 DD imagerel $L$SEH_epilogue_ct_is_square_mod_384 DD imagerel $L$SEH_info_ct_is_square_mod_384_body DD imagerel $L$SEH_epilogue_ct_is_square_mod_384 DD imagerel $L$SEH_end_ct_is_square_mod_384 DD imagerel $L$SEH_info_ct_is_square_mod_384_epilogue .pdata ENDS .xdata SEGMENT READONLY ALIGN(8) ALIGN 8 $L$SEH_info_ct_is_square_mod_384_prologue:: DB 1,0,5,00bh DB 0,074h,1,0 DB 0,064h,2,0 DB 0,0b3h DB 0,0 DD 0,0 $L$SEH_info_ct_is_square_mod_384_body:: DB 1,0,18,0 DB 000h,0f4h,043h,000h DB 000h,0e4h,044h,000h DB 000h,0d4h,045h,000h DB 000h,0c4h,046h,000h DB 000h,034h,047h,000h DB 000h,054h,048h,000h DB 000h,074h,04ah,000h DB 000h,064h,04bh,000h DB 000h,001h,049h,000h DB 000h,000h,000h,000h DB 000h,000h,000h,000h $L$SEH_info_ct_is_square_mod_384_epilogue:: DB 1,0,4,0 DB 000h,074h,001h,000h DB 000h,064h,002h,000h DB 000h,000h,000h,000h .xdata ENDS END ================================================ FILE: build/win64/ctq_inverse_mod_384-x86_64.asm ================================================ OPTION DOTNAME EXTERN ct_inverse_mod_384$1:NEAR _DATA SEGMENT COMM __blst_platform_cap:DWORD:1 _DATA ENDS .text$ SEGMENT ALIGN(256) 'CODE' PUBLIC ct_inverse_mod_384 ALIGN 32 ct_inverse_mod_384 PROC PUBLIC DB 243,15,30,250 mov QWORD PTR[8+rsp],rdi ;WIN64 prologue mov QWORD PTR[16+rsp],rsi mov r11,rsp $L$SEH_begin_ct_inverse_mod_384:: mov rdi,rcx mov rsi,rdx mov rdx,r8 mov rcx,r9 ifdef __BLST_PORTABLE__ test DWORD PTR[__blst_platform_cap],1 jnz ct_inverse_mod_384$1 endif push rbp push rbx push r12 push r13 push r14 push r15 sub rsp,1112 $L$SEH_body_ct_inverse_mod_384:: lea rax,QWORD PTR[((88+511))+rsp] and rax,-512 mov QWORD PTR[32+rsp],rdi mov QWORD PTR[40+rsp],rcx mov r8,QWORD PTR[rsi] mov r9,QWORD PTR[8+rsi] mov r10,QWORD PTR[16+rsi] mov r11,QWORD PTR[24+rsi] mov r12,QWORD PTR[32+rsi] mov r13,QWORD PTR[40+rsi] mov r14,QWORD PTR[rdx] mov r15,QWORD PTR[8+rdx] mov rbx,QWORD PTR[16+rdx] mov rbp,QWORD PTR[24+rdx] mov rsi,QWORD PTR[32+rdx] mov rdi,QWORD PTR[40+rdx] mov QWORD 
PTR[rax],r8 mov QWORD PTR[8+rax],r9 mov QWORD PTR[16+rax],r10 mov QWORD PTR[24+rax],r11 mov QWORD PTR[32+rax],r12 mov QWORD PTR[40+rax],r13 mov QWORD PTR[48+rax],r14 mov QWORD PTR[56+rax],r15 mov QWORD PTR[64+rax],rbx mov QWORD PTR[72+rax],rbp mov QWORD PTR[80+rax],rsi mov rsi,rax mov QWORD PTR[88+rax],rdi mov edi,62 call __ab_approximation_62 mov QWORD PTR[72+rsp],r12 mov QWORD PTR[80+rsp],r13 mov rdi,256 xor rdi,rsi call __smulq_384_n_shift_by_62 mov QWORD PTR[96+rdi],rdx mov rdx,QWORD PTR[72+rsp] mov rcx,QWORD PTR[80+rsp] lea rdi,QWORD PTR[48+rdi] call __smulq_384_n_shift_by_62 mov QWORD PTR[104+rdi],rdx xor rsi,256 mov edi,62 call __ab_approximation_62 mov QWORD PTR[72+rsp],r12 mov QWORD PTR[80+rsp],r13 mov rdi,256 xor rdi,rsi call __smulq_384_n_shift_by_62 mov QWORD PTR[56+rsp],rdx mov QWORD PTR[64+rsp],rcx mov rdx,QWORD PTR[72+rsp] mov rcx,QWORD PTR[80+rsp] lea rdi,QWORD PTR[48+rdi] call __smulq_384_n_shift_by_62 mov rax,QWORD PTR[96+rsi] mov r11,QWORD PTR[152+rsi] mov rbx,rdx mov r10,rax imul QWORD PTR[56+rsp] mov r8,rax mov rax,r11 mov r9,rdx imul QWORD PTR[64+rsp] add r8,rax adc r9,rdx mov QWORD PTR[48+rdi],r8 mov QWORD PTR[56+rdi],r9 sar r9,63 mov QWORD PTR[64+rdi],r9 mov QWORD PTR[72+rdi],r9 mov QWORD PTR[80+rdi],r9 mov QWORD PTR[88+rdi],r9 mov QWORD PTR[96+rdi],r9 lea rsi,QWORD PTR[96+rsi] mov rax,r10 imul rbx mov r8,rax mov rax,r11 mov r9,rdx imul rcx add r8,rax adc r9,rdx mov QWORD PTR[104+rdi],r8 mov QWORD PTR[112+rdi],r9 sar r9,63 mov QWORD PTR[120+rdi],r9 mov QWORD PTR[128+rdi],r9 mov QWORD PTR[136+rdi],r9 mov QWORD PTR[144+rdi],r9 mov QWORD PTR[152+rdi],r9 xor rsi,256+8*12 mov edi,62 call __ab_approximation_62 mov QWORD PTR[72+rsp],r12 mov QWORD PTR[80+rsp],r13 mov rdi,256 xor rdi,rsi call __smulq_384_n_shift_by_62 mov QWORD PTR[56+rsp],rdx mov QWORD PTR[64+rsp],rcx mov rdx,QWORD PTR[72+rsp] mov rcx,QWORD PTR[80+rsp] lea rdi,QWORD PTR[48+rdi] call __smulq_384_n_shift_by_62 mov QWORD PTR[72+rsp],rdx mov QWORD PTR[80+rsp],rcx mov rdx,QWORD PTR[56+rsp] mov rcx,QWORD PTR[64+rsp] lea rsi,QWORD PTR[96+rsi] lea rdi,QWORD PTR[48+rdi] call __smulq_384x63 mov rdx,QWORD PTR[72+rsp] mov rcx,QWORD PTR[80+rsp] lea rdi,QWORD PTR[56+rdi] call __smulq_384x63 xor rsi,256+8*12 mov edi,62 call __ab_approximation_62 mov QWORD PTR[72+rsp],r12 mov QWORD PTR[80+rsp],r13 mov rdi,256 xor rdi,rsi call __smulq_384_n_shift_by_62 mov QWORD PTR[56+rsp],rdx mov QWORD PTR[64+rsp],rcx mov rdx,QWORD PTR[72+rsp] mov rcx,QWORD PTR[80+rsp] lea rdi,QWORD PTR[48+rdi] call __smulq_384_n_shift_by_62 mov QWORD PTR[72+rsp],rdx mov QWORD PTR[80+rsp],rcx mov rdx,QWORD PTR[56+rsp] mov rcx,QWORD PTR[64+rsp] lea rsi,QWORD PTR[96+rsi] lea rdi,QWORD PTR[48+rdi] call __smulq_384x63 mov rdx,QWORD PTR[72+rsp] mov rcx,QWORD PTR[80+rsp] lea rdi,QWORD PTR[56+rdi] call __smulq_384x63 xor rsi,256+8*12 mov edi,62 call __ab_approximation_62 mov QWORD PTR[72+rsp],r12 mov QWORD PTR[80+rsp],r13 mov rdi,256 xor rdi,rsi call __smulq_384_n_shift_by_62 mov QWORD PTR[56+rsp],rdx mov QWORD PTR[64+rsp],rcx mov rdx,QWORD PTR[72+rsp] mov rcx,QWORD PTR[80+rsp] lea rdi,QWORD PTR[48+rdi] call __smulq_384_n_shift_by_62 mov QWORD PTR[72+rsp],rdx mov QWORD PTR[80+rsp],rcx mov rdx,QWORD PTR[56+rsp] mov rcx,QWORD PTR[64+rsp] lea rsi,QWORD PTR[96+rsi] lea rdi,QWORD PTR[48+rdi] call __smulq_384x63 mov rdx,QWORD PTR[72+rsp] mov rcx,QWORD PTR[80+rsp] lea rdi,QWORD PTR[56+rdi] call __smulq_384x63 xor rsi,256+8*12 mov edi,62 call __ab_approximation_62 mov QWORD PTR[72+rsp],r12 mov QWORD PTR[80+rsp],r13 mov rdi,256 xor rdi,rsi call __smulq_384_n_shift_by_62 
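; editorial note, hedged -- a reading of the unrolled body, which the
; generated file does not document: each round of ct_inverse_mod_384 calls
; __ab_approximation_62 to splice 64-bit approximations of a and b from
; their top and bottom limbs, runs 62 branch-free divsteps in
; __inner_loop_62 to obtain signed transition factors (f0,g0,f1,g1), then
; applies them at full width: __smulq_384_n_shift_by_62 replaces a,b with
; (a*f + b*g)/2^62, while __smulq_384x63 / __smulq_768x63 update the Bezout
; accumulators u,v, from which the inverse (up to a fixed power-of-two
; factor that the caller removes) is assembled in the final block.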
mov QWORD PTR[56+rsp],rdx mov QWORD PTR[64+rsp],rcx mov rdx,QWORD PTR[72+rsp] mov rcx,QWORD PTR[80+rsp] lea rdi,QWORD PTR[48+rdi] call __smulq_384_n_shift_by_62 mov QWORD PTR[72+rsp],rdx mov QWORD PTR[80+rsp],rcx mov rdx,QWORD PTR[56+rsp] mov rcx,QWORD PTR[64+rsp] lea rsi,QWORD PTR[96+rsi] lea rdi,QWORD PTR[48+rdi] call __smulq_384x63 mov rdx,QWORD PTR[72+rsp] mov rcx,QWORD PTR[80+rsp] lea rdi,QWORD PTR[56+rdi] call __smulq_384x63 mov QWORD PTR[56+rdi],r14 mov QWORD PTR[64+rdi],r14 mov QWORD PTR[72+rdi],r14 mov QWORD PTR[80+rdi],r14 mov QWORD PTR[88+rdi],r14 xor rsi,256+8*12 mov edi,62 call __ab_approximation_62 mov QWORD PTR[72+rsp],r12 mov QWORD PTR[80+rsp],r13 mov rdi,256 xor rdi,rsi call __smulq_384_n_shift_by_62 mov QWORD PTR[56+rsp],rdx mov QWORD PTR[64+rsp],rcx mov rdx,QWORD PTR[72+rsp] mov rcx,QWORD PTR[80+rsp] lea rdi,QWORD PTR[48+rdi] call __smulq_384_n_shift_by_62 mov QWORD PTR[72+rsp],rdx mov QWORD PTR[80+rsp],rcx mov rdx,QWORD PTR[56+rsp] mov rcx,QWORD PTR[64+rsp] lea rsi,QWORD PTR[96+rsi] lea rdi,QWORD PTR[48+rdi] call __smulq_384x63 mov rdx,QWORD PTR[72+rsp] mov rcx,QWORD PTR[80+rsp] lea rdi,QWORD PTR[56+rdi] call __smulq_768x63 xor rsi,256+8*12 mov edi,62 call __ab_approximation_62 mov QWORD PTR[72+rsp],r12 mov QWORD PTR[80+rsp],r13 mov rdi,256 xor rdi,rsi call __smulq_384_n_shift_by_62 mov QWORD PTR[56+rsp],rdx mov QWORD PTR[64+rsp],rcx mov rdx,QWORD PTR[72+rsp] mov rcx,QWORD PTR[80+rsp] lea rdi,QWORD PTR[48+rdi] call __smulq_384_n_shift_by_62 mov QWORD PTR[72+rsp],rdx mov QWORD PTR[80+rsp],rcx mov rdx,QWORD PTR[56+rsp] mov rcx,QWORD PTR[64+rsp] lea rsi,QWORD PTR[96+rsi] lea rdi,QWORD PTR[48+rdi] call __smulq_384x63 mov rdx,QWORD PTR[72+rsp] mov rcx,QWORD PTR[80+rsp] lea rdi,QWORD PTR[56+rdi] call __smulq_768x63 xor rsi,256+8*12 mov edi,62 call __ab_approximation_62 mov QWORD PTR[72+rsp],r12 mov QWORD PTR[80+rsp],r13 mov rdi,256 xor rdi,rsi call __smulq_384_n_shift_by_62 mov QWORD PTR[56+rsp],rdx mov QWORD PTR[64+rsp],rcx mov rdx,QWORD PTR[72+rsp] mov rcx,QWORD PTR[80+rsp] lea rdi,QWORD PTR[48+rdi] call __smulq_384_n_shift_by_62 mov QWORD PTR[72+rsp],rdx mov QWORD PTR[80+rsp],rcx mov rdx,QWORD PTR[56+rsp] mov rcx,QWORD PTR[64+rsp] lea rsi,QWORD PTR[96+rsi] lea rdi,QWORD PTR[48+rdi] call __smulq_384x63 mov rdx,QWORD PTR[72+rsp] mov rcx,QWORD PTR[80+rsp] lea rdi,QWORD PTR[56+rdi] call __smulq_768x63 xor rsi,256+8*12 mov edi,62 call __ab_approximation_62 mov QWORD PTR[72+rsp],r12 mov QWORD PTR[80+rsp],r13 mov rdi,256 xor rdi,rsi call __smulq_384_n_shift_by_62 mov QWORD PTR[56+rsp],rdx mov QWORD PTR[64+rsp],rcx mov rdx,QWORD PTR[72+rsp] mov rcx,QWORD PTR[80+rsp] lea rdi,QWORD PTR[48+rdi] call __smulq_384_n_shift_by_62 mov QWORD PTR[72+rsp],rdx mov QWORD PTR[80+rsp],rcx mov rdx,QWORD PTR[56+rsp] mov rcx,QWORD PTR[64+rsp] lea rsi,QWORD PTR[96+rsi] lea rdi,QWORD PTR[48+rdi] call __smulq_384x63 mov rdx,QWORD PTR[72+rsp] mov rcx,QWORD PTR[80+rsp] lea rdi,QWORD PTR[56+rdi] call __smulq_768x63 xor rsi,256+8*12 mov edi,62 call __ab_approximation_62 mov QWORD PTR[72+rsp],r12 mov QWORD PTR[80+rsp],r13 mov rdi,256 xor rdi,rsi call __smulq_384_n_shift_by_62 mov QWORD PTR[56+rsp],rdx mov QWORD PTR[64+rsp],rcx mov rdx,QWORD PTR[72+rsp] mov rcx,QWORD PTR[80+rsp] lea rdi,QWORD PTR[48+rdi] call __smulq_384_n_shift_by_62 mov QWORD PTR[72+rsp],rdx mov QWORD PTR[80+rsp],rcx mov rdx,QWORD PTR[56+rsp] mov rcx,QWORD PTR[64+rsp] lea rsi,QWORD PTR[96+rsi] lea rdi,QWORD PTR[48+rdi] call __smulq_384x63 mov rdx,QWORD PTR[72+rsp] mov rcx,QWORD PTR[80+rsp] lea rdi,QWORD PTR[56+rdi] call __smulq_768x63 xor 
rsi,256+8*12 mov edi,62 mov r8,QWORD PTR[rsi] mov r9,QWORD PTR[8+rsi] mov r10,QWORD PTR[48+rsi] mov r11,QWORD PTR[56+rsi] call __inner_loop_62 mov QWORD PTR[72+rsp],r12 mov QWORD PTR[80+rsp],r13 mov rdi,256 xor rdi,rsi mov QWORD PTR[rdi],r8 mov QWORD PTR[48+rdi],r10 lea rsi,QWORD PTR[96+rsi] lea rdi,QWORD PTR[96+rdi] call __smulq_384x63 mov rdx,QWORD PTR[72+rsp] mov rcx,QWORD PTR[80+rsp] lea rdi,QWORD PTR[56+rdi] call __smulq_768x63 xor rsi,256+8*12 mov edi,24 mov r8,QWORD PTR[rsi] xor r9,r9 mov r10,QWORD PTR[48+rsi] xor r11,r11 call __inner_loop_62 lea rsi,QWORD PTR[96+rsi] mov rdx,r12 mov rcx,r13 mov rdi,QWORD PTR[32+rsp] call __smulq_768x63 mov rsi,QWORD PTR[40+rsp] mov r13,rdx sar r13,63 mov r8,r13 mov r9,r13 mov r10,r13 and r8,QWORD PTR[rsi] and r9,QWORD PTR[8+rsi] mov r11,r13 and r10,QWORD PTR[16+rsi] and r11,QWORD PTR[24+rsi] mov r12,r13 and r12,QWORD PTR[32+rsi] and r13,QWORD PTR[40+rsi] add r14,r8 adc r15,r9 adc rbx,r10 adc rbp,r11 adc rcx,r12 adc rax,r13 adc rdx,0 mov r13,rdx neg rdx or r13,rdx sar rdx,63 mov r8,r13 mov r9,r13 mov r10,r13 and r8,QWORD PTR[rsi] and r9,QWORD PTR[8+rsi] mov r11,r13 and r10,QWORD PTR[16+rsi] and r11,QWORD PTR[24+rsi] mov r12,r13 and r12,QWORD PTR[32+rsi] and r13,QWORD PTR[40+rsi] xor r8,rdx xor rsi,rsi xor r9,rdx sub rsi,rdx xor r10,rdx xor r11,rdx xor r12,rdx xor r13,rdx add r8,rsi adc r9,0 adc r10,0 adc r11,0 adc r12,0 adc r13,0 add r14,r8 adc r15,r9 adc rbx,r10 adc rbp,r11 adc rcx,r12 adc rax,r13 mov QWORD PTR[48+rdi],r14 mov QWORD PTR[56+rdi],r15 mov QWORD PTR[64+rdi],rbx mov QWORD PTR[72+rdi],rbp mov QWORD PTR[80+rdi],rcx mov QWORD PTR[88+rdi],rax lea r8,QWORD PTR[1112+rsp] mov r15,QWORD PTR[r8] mov r14,QWORD PTR[8+r8] mov r13,QWORD PTR[16+r8] mov r12,QWORD PTR[24+r8] mov rbx,QWORD PTR[32+r8] mov rbp,QWORD PTR[40+r8] lea rsp,QWORD PTR[48+r8] $L$SEH_epilogue_ct_inverse_mod_384:: mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue mov rsi,QWORD PTR[16+rsp] ifdef __SGX_LVI_HARDENING__ pop rdx lfence jmp rdx ud2 else DB 0F3h,0C3h endif $L$SEH_end_ct_inverse_mod_384:: ct_inverse_mod_384 ENDP ALIGN 32 __smulq_768x63 PROC PRIVATE DB 243,15,30,250 mov r8,QWORD PTR[rsi] mov r9,QWORD PTR[8+rsi] mov r10,QWORD PTR[16+rsi] mov r11,QWORD PTR[24+rsi] mov r12,QWORD PTR[32+rsi] mov r13,QWORD PTR[40+rsi] mov r14,QWORD PTR[48+rsi] mov rbp,rdx sar rdx,63 xor rax,rax sub rax,rdx mov QWORD PTR[8+rsp],rdi mov QWORD PTR[16+rsp],rsi lea rsi,QWORD PTR[56+rsi] xor rbp,rdx add rbp,rax xor r8,rdx xor r9,rdx xor r10,rdx xor r11,rdx xor r12,rdx xor r13,rdx xor r14,rdx add rax,r8 adc r9,0 adc r10,0 adc r11,0 adc r12,0 adc r13,0 adc r14,0 mul rbp mov QWORD PTR[rdi],rax mov rax,r9 and r14,rbp neg r14 mov r9,rdx mul rbp add r9,rax mov rax,r10 adc rdx,0 mov r10,rdx mov QWORD PTR[8+rdi],r9 mul rbp add r10,rax mov rax,r11 adc rdx,0 mov r11,rdx mov QWORD PTR[16+rdi],r10 mul rbp add r11,rax mov rax,r12 adc rdx,0 mov r12,rdx mov QWORD PTR[24+rdi],r11 mul rbp add r12,rax mov rax,r13 adc rdx,0 mov r13,rdx mov QWORD PTR[32+rdi],r12 mul rbp add r13,rax adc r14,rdx mov QWORD PTR[40+rdi],r13 mov QWORD PTR[48+rdi],r14 sar r14,63 mov QWORD PTR[56+rdi],r14 mov rdx,rcx mov r8,QWORD PTR[rsi] mov r9,QWORD PTR[8+rsi] mov r10,QWORD PTR[16+rsi] mov r11,QWORD PTR[24+rsi] mov r12,QWORD PTR[32+rsi] mov r13,QWORD PTR[40+rsi] mov r14,QWORD PTR[48+rsi] mov r15,QWORD PTR[56+rsi] mov rbx,QWORD PTR[64+rsi] mov rbp,QWORD PTR[72+rsi] mov rcx,QWORD PTR[80+rsi] mov rdi,QWORD PTR[88+rsi] mov rsi,rdx sar rdx,63 xor rax,rax sub rax,rdx xor rsi,rdx add rsi,rax xor r8,rdx xor r9,rdx xor r10,rdx xor r11,rdx xor r12,rdx xor 
r13,rdx xor r14,rdx xor r15,rdx xor rbx,rdx xor rbp,rdx xor rcx,rdx xor rdi,rdx add rax,r8 adc r9,0 adc r10,0 adc r11,0 adc r12,0 adc r13,0 adc r14,0 adc r15,0 adc rbx,0 adc rbp,0 adc rcx,0 adc rdi,0 mul rsi mov r8,rax mov rax,r9 mov r9,rdx mul rsi add r9,rax mov rax,r10 adc rdx,0 mov r10,rdx mul rsi add r10,rax mov rax,r11 adc rdx,0 mov r11,rdx mul rsi add r11,rax mov rax,r12 adc rdx,0 mov r12,rdx mul rsi add r12,rax mov rax,r13 adc rdx,0 mov r13,rdx mul rsi add r13,rax mov rax,r14 adc rdx,0 mov r14,rdx mul rsi add r14,rax mov rax,r15 adc rdx,0 mov r15,rdx mul rsi add r15,rax mov rax,rbx adc rdx,0 mov rbx,rdx mul rsi add rbx,rax mov rax,rbp adc rdx,0 mov rbp,rdx mul rsi add rbp,rax mov rax,rcx adc rdx,0 mov rcx,rdx mul rsi add rcx,rax mov rax,rdi adc rdx,0 mov rdi,rdx imul rsi mov rsi,QWORD PTR[8+rsp] add rax,rdi adc rdx,0 add r8,QWORD PTR[rsi] adc r9,QWORD PTR[8+rsi] adc r10,QWORD PTR[16+rsi] adc r11,QWORD PTR[24+rsi] adc r12,QWORD PTR[32+rsi] adc r13,QWORD PTR[40+rsi] adc r14,QWORD PTR[48+rsi] mov rdi,QWORD PTR[56+rsi] adc r15,rdi adc rbx,rdi adc rbp,rdi adc rcx,rdi adc rax,rdi adc rdx,rdi lea rdi,QWORD PTR[rsi] mov rsi,QWORD PTR[16+rsp] mov QWORD PTR[rdi],r8 mov QWORD PTR[8+rdi],r9 mov QWORD PTR[16+rdi],r10 mov QWORD PTR[24+rdi],r11 mov QWORD PTR[32+rdi],r12 mov QWORD PTR[40+rdi],r13 mov QWORD PTR[48+rdi],r14 mov QWORD PTR[56+rdi],r15 mov QWORD PTR[64+rdi],rbx mov QWORD PTR[72+rdi],rbp mov QWORD PTR[80+rdi],rcx mov QWORD PTR[88+rdi],rax ifdef __SGX_LVI_HARDENING__ pop rdx lfence jmp rdx ud2 else DB 0F3h,0C3h endif __smulq_768x63 ENDP ALIGN 32 __smulq_384x63 PROC PRIVATE DB 243,15,30,250 mov r8,QWORD PTR[rsi] mov r9,QWORD PTR[8+rsi] mov r10,QWORD PTR[16+rsi] mov r11,QWORD PTR[24+rsi] mov r12,QWORD PTR[32+rsi] mov r13,QWORD PTR[40+rsi] mov r14,QWORD PTR[48+rsi] mov rbp,rdx sar rdx,63 xor rax,rax sub rax,rdx xor rbp,rdx add rbp,rax xor r8,rdx xor r9,rdx xor r10,rdx xor r11,rdx xor r12,rdx xor r13,rdx xor r14,rdx add rax,r8 adc r9,0 adc r10,0 adc r11,0 adc r12,0 adc r13,0 adc r14,0 mul rbp mov r8,rax mov rax,r9 and r14,rbp neg r14 mov r9,rdx mul rbp add r9,rax mov rax,r10 adc rdx,0 mov r10,rdx mul rbp add r10,rax mov rax,r11 adc rdx,0 mov r11,rdx mul rbp add r11,rax mov rax,r12 adc rdx,0 mov r12,rdx mul rbp add r12,rax mov rax,r13 adc rdx,0 mov r13,rdx mul rbp add r13,rax adc r14,rdx lea rsi,QWORD PTR[56+rsi] mov rdx,rcx mov QWORD PTR[rdi],r8 mov QWORD PTR[8+rdi],r9 mov QWORD PTR[16+rdi],r10 mov QWORD PTR[24+rdi],r11 mov QWORD PTR[32+rdi],r12 mov r15,r13 mov rbx,r14 mov r8,QWORD PTR[rsi] mov r9,QWORD PTR[8+rsi] mov r10,QWORD PTR[16+rsi] mov r11,QWORD PTR[24+rsi] mov r12,QWORD PTR[32+rsi] mov r13,QWORD PTR[40+rsi] mov r14,QWORD PTR[48+rsi] mov rbp,rdx sar rdx,63 xor rax,rax sub rax,rdx xor rbp,rdx add rbp,rax xor r8,rdx xor r9,rdx xor r10,rdx xor r11,rdx xor r12,rdx xor r13,rdx xor r14,rdx add rax,r8 adc r9,0 adc r10,0 adc r11,0 adc r12,0 adc r13,0 adc r14,0 mul rbp mov r8,rax mov rax,r9 and r14,rbp neg r14 mov r9,rdx mul rbp add r9,rax mov rax,r10 adc rdx,0 mov r10,rdx mul rbp add r10,rax mov rax,r11 adc rdx,0 mov r11,rdx mul rbp add r11,rax mov rax,r12 adc rdx,0 mov r12,rdx mul rbp add r12,rax mov rax,r13 adc rdx,0 mov r13,rdx mul rbp add r13,rax adc r14,rdx lea rsi,QWORD PTR[((-56))+rsi] add r8,QWORD PTR[rdi] adc r9,QWORD PTR[8+rdi] adc r10,QWORD PTR[16+rdi] adc r11,QWORD PTR[24+rdi] adc r12,QWORD PTR[32+rdi] adc r13,r15 adc r14,rbx mov QWORD PTR[rdi],r8 mov QWORD PTR[8+rdi],r9 mov QWORD PTR[16+rdi],r10 mov QWORD PTR[24+rdi],r11 mov QWORD PTR[32+rdi],r12 mov QWORD PTR[40+rdi],r13 mov 
QWORD PTR[48+rdi],r14 ifdef __SGX_LVI_HARDENING__ pop rdx lfence jmp rdx ud2 else DB 0F3h,0C3h endif __smulq_384x63 ENDP ALIGN 32 __smulq_384_n_shift_by_62 PROC PRIVATE DB 243,15,30,250 mov rbx,rdx mov r8,QWORD PTR[rsi] mov r9,QWORD PTR[8+rsi] mov r10,QWORD PTR[16+rsi] mov r11,QWORD PTR[24+rsi] mov r12,QWORD PTR[32+rsi] mov r13,QWORD PTR[40+rsi] mov rbp,rdx sar rdx,63 xor rax,rax sub rax,rdx xor rbp,rdx add rbp,rax xor r8,rdx xor r9,rdx xor r10,rdx xor r11,rdx xor r12,rdx xor r13,rdx mov r14,rdx add rax,r8 adc r9,0 adc r10,0 adc r11,0 adc r12,0 adc r13,0 mul rbp mov r8,rax mov rax,r9 and r14,rbp neg r14 mov r9,rdx mul rbp add r9,rax mov rax,r10 adc rdx,0 mov r10,rdx mul rbp add r10,rax mov rax,r11 adc rdx,0 mov r11,rdx mul rbp add r11,rax mov rax,r12 adc rdx,0 mov r12,rdx mul rbp add r12,rax mov rax,r13 adc rdx,0 mov r13,rdx mul rbp add r13,rax adc r14,rdx lea rsi,QWORD PTR[48+rsi] mov rdx,rcx mov QWORD PTR[rdi],r8 mov QWORD PTR[8+rdi],r9 mov QWORD PTR[16+rdi],r10 mov QWORD PTR[24+rdi],r11 mov QWORD PTR[32+rdi],r12 mov QWORD PTR[40+rdi],r13 mov r8,QWORD PTR[rsi] mov r9,QWORD PTR[8+rsi] mov r10,QWORD PTR[16+rsi] mov r11,QWORD PTR[24+rsi] mov r12,QWORD PTR[32+rsi] mov r13,QWORD PTR[40+rsi] mov rbp,rdx sar rdx,63 xor rax,rax sub rax,rdx xor rbp,rdx add rbp,rax xor r8,rdx xor r9,rdx xor r10,rdx xor r11,rdx xor r12,rdx xor r13,rdx mov r15,rdx add rax,r8 adc r9,0 adc r10,0 adc r11,0 adc r12,0 adc r13,0 mul rbp mov r8,rax mov rax,r9 and r15,rbp neg r15 mov r9,rdx mul rbp add r9,rax mov rax,r10 adc rdx,0 mov r10,rdx mul rbp add r10,rax mov rax,r11 adc rdx,0 mov r11,rdx mul rbp add r11,rax mov rax,r12 adc rdx,0 mov r12,rdx mul rbp add r12,rax mov rax,r13 adc rdx,0 mov r13,rdx mul rbp add r13,rax adc r15,rdx lea rsi,QWORD PTR[((-48))+rsi] mov rdx,rbx add r8,QWORD PTR[rdi] adc r9,QWORD PTR[8+rdi] adc r10,QWORD PTR[16+rdi] adc r11,QWORD PTR[24+rdi] adc r12,QWORD PTR[32+rdi] adc r13,QWORD PTR[40+rdi] adc r14,r15 shrd r8,r9,62 shrd r9,r10,62 shrd r10,r11,62 shrd r11,r12,62 shrd r12,r13,62 shrd r13,r14,62 sar r14,63 xor rbp,rbp sub rbp,r14 xor r8,r14 xor r9,r14 xor r10,r14 xor r11,r14 xor r12,r14 xor r13,r14 add r8,rbp adc r9,0 adc r10,0 adc r11,0 adc r12,0 adc r13,0 mov QWORD PTR[rdi],r8 mov QWORD PTR[8+rdi],r9 mov QWORD PTR[16+rdi],r10 mov QWORD PTR[24+rdi],r11 mov QWORD PTR[32+rdi],r12 mov QWORD PTR[40+rdi],r13 xor rdx,r14 xor rcx,r14 add rdx,rbp add rcx,rbp ifdef __SGX_LVI_HARDENING__ pop r8 lfence jmp r8 ud2 else DB 0F3h,0C3h endif __smulq_384_n_shift_by_62 ENDP ALIGN 32 __ab_approximation_62 PROC PRIVATE DB 243,15,30,250 mov r9,QWORD PTR[40+rsi] mov r11,QWORD PTR[88+rsi] mov rbx,QWORD PTR[32+rsi] mov rbp,QWORD PTR[80+rsi] mov r8,QWORD PTR[24+rsi] mov r10,QWORD PTR[72+rsi] mov rax,r9 or rax,r11 cmovz r9,rbx cmovz r11,rbp cmovz rbx,r8 cmovz rbp,r10 mov r8,QWORD PTR[16+rsi] mov r10,QWORD PTR[64+rsi] mov rax,r9 or rax,r11 cmovz r9,rbx cmovz r11,rbp cmovz rbx,r8 cmovz rbp,r10 mov r8,QWORD PTR[8+rsi] mov r10,QWORD PTR[56+rsi] mov rax,r9 or rax,r11 cmovz r9,rbx cmovz r11,rbp cmovz rbx,r8 cmovz rbp,r10 mov r8,QWORD PTR[rsi] mov r10,QWORD PTR[48+rsi] mov rax,r9 or rax,r11 bsr rcx,rax lea rcx,QWORD PTR[1+rcx] cmovz r9,rbx cmovz r11,rbp cmovz rcx,rax neg rcx shld r9,rbx,cl shld r11,rbp,cl jmp __inner_loop_62 ifdef __SGX_LVI_HARDENING__ pop rdx lfence jmp rdx ud2 else DB 0F3h,0C3h endif __ab_approximation_62 ENDP ALIGN 8 DD 0 __inner_loop_62 PROC PRIVATE DB 243,15,30,250 mov rdx,1 xor rcx,rcx xor r12,r12 mov r13,1 mov QWORD PTR[8+rsp],rsi $L$oop_62:: xor rax,rax xor rbx,rbx test r8,1 mov rbp,r10 mov r14,r11 
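; editorial note, hedged: one divstep per iteration, fully branch-free --
; r8:r9 and r10:r11 hold the spliced approximations of a and b. When a is
; odd, the cmov chains either subtract b from a or, on borrow, swap the
; roles (a' = b - a, b' = old a) and exchange the (rdx,rcx)/(r12,r13)
; factor pairs with them; a is then halved and the factors doubled or
; adjusted, with edi counting the remaining steps (62 per main round, 24
; in the final tail).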
cmovnz rax,r10 cmovnz rbx,r11 sub rbp,r8 sbb r14,r9 mov r15,r8 mov rsi,r9 sub r8,rax sbb r9,rbx cmovc r8,rbp cmovc r9,r14 cmovc r10,r15 cmovc r11,rsi mov rax,rdx cmovc rdx,r12 cmovc r12,rax mov rbx,rcx cmovc rcx,r13 cmovc r13,rbx xor rax,rax xor rbx,rbx shrd r8,r9,1 shr r9,1 test r15,1 cmovnz rax,r12 cmovnz rbx,r13 add r12,r12 add r13,r13 sub rdx,rax sub rcx,rbx sub edi,1 jnz $L$oop_62 mov rsi,QWORD PTR[8+rsp] ifdef __SGX_LVI_HARDENING__ pop rax lfence jmp rax ud2 else DB 0F3h,0C3h endif __inner_loop_62 ENDP .text$ ENDS .pdata SEGMENT READONLY ALIGN(4) ALIGN 4 DD imagerel $L$SEH_begin_ct_inverse_mod_384 DD imagerel $L$SEH_body_ct_inverse_mod_384 DD imagerel $L$SEH_info_ct_inverse_mod_384_prologue DD imagerel $L$SEH_body_ct_inverse_mod_384 DD imagerel $L$SEH_epilogue_ct_inverse_mod_384 DD imagerel $L$SEH_info_ct_inverse_mod_384_body DD imagerel $L$SEH_epilogue_ct_inverse_mod_384 DD imagerel $L$SEH_end_ct_inverse_mod_384 DD imagerel $L$SEH_info_ct_inverse_mod_384_epilogue .pdata ENDS .xdata SEGMENT READONLY ALIGN(8) ALIGN 8 $L$SEH_info_ct_inverse_mod_384_prologue:: DB 1,0,5,00bh DB 0,074h,1,0 DB 0,064h,2,0 DB 0,0b3h DB 0,0 DD 0,0 $L$SEH_info_ct_inverse_mod_384_body:: DB 1,0,18,0 DB 000h,0f4h,08bh,000h DB 000h,0e4h,08ch,000h DB 000h,0d4h,08dh,000h DB 000h,0c4h,08eh,000h DB 000h,034h,08fh,000h DB 000h,054h,090h,000h DB 000h,074h,092h,000h DB 000h,064h,093h,000h DB 000h,001h,091h,000h DB 000h,000h,000h,000h DB 000h,000h,000h,000h $L$SEH_info_ct_inverse_mod_384_epilogue:: DB 1,0,4,0 DB 000h,074h,001h,000h DB 000h,064h,002h,000h DB 000h,000h,000h,000h .xdata ENDS END ================================================ FILE: build/win64/ctx_inverse_mod_384-x86_64.asm ================================================ OPTION DOTNAME PUBLIC ct_inverse_mod_384$1 .text$ SEGMENT ALIGN(256) 'CODE' PUBLIC ctx_inverse_mod_384 ALIGN 32 ctx_inverse_mod_384 PROC PUBLIC DB 243,15,30,250 mov QWORD PTR[8+rsp],rdi ;WIN64 prologue mov QWORD PTR[16+rsp],rsi mov r11,rsp $L$SEH_begin_ctx_inverse_mod_384:: mov rdi,rcx mov rsi,rdx mov rdx,r8 mov rcx,r9 ct_inverse_mod_384$1:: push rbp push rbx push r12 push r13 push r14 push r15 sub rsp,1112 $L$SEH_body_ctx_inverse_mod_384:: lea rax,QWORD PTR[((88+511))+rsp] and rax,-512 mov QWORD PTR[32+rsp],rdi mov QWORD PTR[40+rsp],rcx ifdef __SGX_LVI_HARDENING__ lfence endif mov r8,QWORD PTR[rsi] mov r9,QWORD PTR[8+rsi] mov r10,QWORD PTR[16+rsi] mov r11,QWORD PTR[24+rsi] mov r12,QWORD PTR[32+rsi] mov r13,QWORD PTR[40+rsi] mov r14,QWORD PTR[rdx] mov r15,QWORD PTR[8+rdx] mov rbx,QWORD PTR[16+rdx] mov rbp,QWORD PTR[24+rdx] mov rsi,QWORD PTR[32+rdx] mov rdi,QWORD PTR[40+rdx] mov QWORD PTR[rax],r8 mov QWORD PTR[8+rax],r9 mov QWORD PTR[16+rax],r10 mov QWORD PTR[24+rax],r11 mov QWORD PTR[32+rax],r12 mov QWORD PTR[40+rax],r13 mov QWORD PTR[48+rax],r14 mov QWORD PTR[56+rax],r15 mov QWORD PTR[64+rax],rbx mov QWORD PTR[72+rax],rbp mov QWORD PTR[80+rax],rsi mov rsi,rax mov QWORD PTR[88+rax],rdi mov edi,31 call __ab_approximation_31 mov QWORD PTR[72+rsp],r12 mov QWORD PTR[80+rsp],r13 mov rdi,256 xor rdi,rsi call __smulx_384_n_shift_by_31 mov QWORD PTR[96+rdi],rdx mov rdx,QWORD PTR[72+rsp] mov rcx,QWORD PTR[80+rsp] lea rdi,QWORD PTR[48+rdi] call __smulx_384_n_shift_by_31 mov QWORD PTR[104+rdi],rdx xor rsi,256 mov edi,31 call __ab_approximation_31 mov QWORD PTR[72+rsp],r12 mov QWORD PTR[80+rsp],r13 mov rdi,256 xor rdi,rsi call __smulx_384_n_shift_by_31 mov QWORD PTR[56+rsp],rdx mov QWORD PTR[64+rsp],rcx mov rdx,QWORD PTR[72+rsp] mov rcx,QWORD PTR[80+rsp] lea rdi,QWORD PTR[48+rdi] call 
__smulx_384_n_shift_by_31 mov rax,QWORD PTR[96+rsi] mov r11,QWORD PTR[152+rsi] mov rbx,rdx mov r10,rax imul QWORD PTR[56+rsp] mov r8,rax mov rax,r11 mov r9,rdx imul QWORD PTR[64+rsp] add r8,rax adc r9,rdx mov QWORD PTR[48+rdi],r8 mov QWORD PTR[56+rdi],r9 sar r9,63 mov QWORD PTR[64+rdi],r9 mov QWORD PTR[72+rdi],r9 mov QWORD PTR[80+rdi],r9 mov QWORD PTR[88+rdi],r9 mov QWORD PTR[96+rdi],r9 lea rsi,QWORD PTR[96+rsi] mov rax,r10 imul rbx mov r8,rax mov rax,r11 mov r9,rdx imul rcx add r8,rax adc r9,rdx mov QWORD PTR[104+rdi],r8 mov QWORD PTR[112+rdi],r9 sar r9,63 mov QWORD PTR[120+rdi],r9 mov QWORD PTR[128+rdi],r9 mov QWORD PTR[136+rdi],r9 mov QWORD PTR[144+rdi],r9 mov QWORD PTR[152+rdi],r9 xor rsi,256+8*12 mov edi,31 call __ab_approximation_31 mov QWORD PTR[72+rsp],r12 mov QWORD PTR[80+rsp],r13 mov rdi,256 xor rdi,rsi call __smulx_384_n_shift_by_31 mov QWORD PTR[56+rsp],rdx mov QWORD PTR[64+rsp],rcx mov rdx,QWORD PTR[72+rsp] mov rcx,QWORD PTR[80+rsp] lea rdi,QWORD PTR[48+rdi] call __smulx_384_n_shift_by_31 mov QWORD PTR[72+rsp],rdx mov QWORD PTR[80+rsp],rcx mov rdx,QWORD PTR[56+rsp] mov rcx,QWORD PTR[64+rsp] lea rsi,QWORD PTR[96+rsi] lea rdi,QWORD PTR[48+rdi] call __smulx_384x63 mov rdx,QWORD PTR[72+rsp] mov rcx,QWORD PTR[80+rsp] lea rdi,QWORD PTR[56+rdi] call __smulx_384x63 xor rsi,256+8*12 mov edi,31 call __ab_approximation_31 mov QWORD PTR[72+rsp],r12 mov QWORD PTR[80+rsp],r13 mov rdi,256 xor rdi,rsi call __smulx_384_n_shift_by_31 mov QWORD PTR[56+rsp],rdx mov QWORD PTR[64+rsp],rcx mov rdx,QWORD PTR[72+rsp] mov rcx,QWORD PTR[80+rsp] lea rdi,QWORD PTR[48+rdi] call __smulx_384_n_shift_by_31 mov QWORD PTR[72+rsp],rdx mov QWORD PTR[80+rsp],rcx mov rdx,QWORD PTR[56+rsp] mov rcx,QWORD PTR[64+rsp] lea rsi,QWORD PTR[96+rsi] lea rdi,QWORD PTR[48+rdi] call __smulx_384x63 mov rdx,QWORD PTR[72+rsp] mov rcx,QWORD PTR[80+rsp] lea rdi,QWORD PTR[56+rdi] call __smulx_384x63 xor rsi,256+8*12 mov edi,31 call __ab_approximation_31 mov QWORD PTR[72+rsp],r12 mov QWORD PTR[80+rsp],r13 mov rdi,256 xor rdi,rsi call __smulx_384_n_shift_by_31 mov QWORD PTR[56+rsp],rdx mov QWORD PTR[64+rsp],rcx mov rdx,QWORD PTR[72+rsp] mov rcx,QWORD PTR[80+rsp] lea rdi,QWORD PTR[48+rdi] call __smulx_384_n_shift_by_31 mov QWORD PTR[72+rsp],rdx mov QWORD PTR[80+rsp],rcx mov rdx,QWORD PTR[56+rsp] mov rcx,QWORD PTR[64+rsp] lea rsi,QWORD PTR[96+rsi] lea rdi,QWORD PTR[48+rdi] call __smulx_384x63 mov rdx,QWORD PTR[72+rsp] mov rcx,QWORD PTR[80+rsp] lea rdi,QWORD PTR[56+rdi] call __smulx_384x63 xor rsi,256+8*12 mov edi,31 call __ab_approximation_31 mov QWORD PTR[72+rsp],r12 mov QWORD PTR[80+rsp],r13 mov rdi,256 xor rdi,rsi call __smulx_384_n_shift_by_31 mov QWORD PTR[56+rsp],rdx mov QWORD PTR[64+rsp],rcx mov rdx,QWORD PTR[72+rsp] mov rcx,QWORD PTR[80+rsp] lea rdi,QWORD PTR[48+rdi] call __smulx_384_n_shift_by_31 mov QWORD PTR[72+rsp],rdx mov QWORD PTR[80+rsp],rcx mov rdx,QWORD PTR[56+rsp] mov rcx,QWORD PTR[64+rsp] lea rsi,QWORD PTR[96+rsi] lea rdi,QWORD PTR[48+rdi] call __smulx_384x63 mov rdx,QWORD PTR[72+rsp] mov rcx,QWORD PTR[80+rsp] lea rdi,QWORD PTR[56+rdi] call __smulx_384x63 xor rsi,256+8*12 mov edi,31 call __ab_approximation_31 mov QWORD PTR[72+rsp],r12 mov QWORD PTR[80+rsp],r13 mov rdi,256 xor rdi,rsi call __smulx_384_n_shift_by_31 mov QWORD PTR[56+rsp],rdx mov QWORD PTR[64+rsp],rcx mov rdx,QWORD PTR[72+rsp] mov rcx,QWORD PTR[80+rsp] lea rdi,QWORD PTR[48+rdi] call __smulx_384_n_shift_by_31 mov QWORD PTR[72+rsp],rdx mov QWORD PTR[80+rsp],rcx mov rdx,QWORD PTR[56+rsp] mov rcx,QWORD PTR[64+rsp] lea rsi,QWORD PTR[96+rsi] lea rdi,QWORD 
PTR[48+rdi] call __smulx_384x63 mov rdx,QWORD PTR[72+rsp] mov rcx,QWORD PTR[80+rsp] lea rdi,QWORD PTR[56+rdi] call __smulx_384x63 xor rsi,256+8*12 mov edi,31 call __ab_approximation_31 mov QWORD PTR[72+rsp],r12 mov QWORD PTR[80+rsp],r13 mov rdi,256 xor rdi,rsi call __smulx_384_n_shift_by_31 mov QWORD PTR[56+rsp],rdx mov QWORD PTR[64+rsp],rcx mov rdx,QWORD PTR[72+rsp] mov rcx,QWORD PTR[80+rsp] lea rdi,QWORD PTR[48+rdi] call __smulx_384_n_shift_by_31 mov QWORD PTR[72+rsp],rdx mov QWORD PTR[80+rsp],rcx mov rdx,QWORD PTR[56+rsp] mov rcx,QWORD PTR[64+rsp] lea rsi,QWORD PTR[96+rsi] lea rdi,QWORD PTR[48+rdi] call __smulx_384x63 mov rdx,QWORD PTR[72+rsp] mov rcx,QWORD PTR[80+rsp] lea rdi,QWORD PTR[56+rdi] call __smulx_384x63 xor rsi,256+8*12 mov edi,31 call __ab_approximation_31 mov QWORD PTR[72+rsp],r12 mov QWORD PTR[80+rsp],r13 mov rdi,256 xor rdi,rsi call __smulx_384_n_shift_by_31 mov QWORD PTR[56+rsp],rdx mov QWORD PTR[64+rsp],rcx mov rdx,QWORD PTR[72+rsp] mov rcx,QWORD PTR[80+rsp] lea rdi,QWORD PTR[48+rdi] call __smulx_384_n_shift_by_31 mov QWORD PTR[72+rsp],rdx mov QWORD PTR[80+rsp],rcx mov rdx,QWORD PTR[56+rsp] mov rcx,QWORD PTR[64+rsp] lea rsi,QWORD PTR[96+rsi] lea rdi,QWORD PTR[48+rdi] call __smulx_384x63 mov rdx,QWORD PTR[72+rsp] mov rcx,QWORD PTR[80+rsp] lea rdi,QWORD PTR[56+rdi] call __smulx_384x63 xor rsi,256+8*12 mov edi,31 call __ab_approximation_31 mov QWORD PTR[72+rsp],r12 mov QWORD PTR[80+rsp],r13 mov rdi,256 xor rdi,rsi call __smulx_384_n_shift_by_31 mov QWORD PTR[56+rsp],rdx mov QWORD PTR[64+rsp],rcx mov rdx,QWORD PTR[72+rsp] mov rcx,QWORD PTR[80+rsp] lea rdi,QWORD PTR[48+rdi] call __smulx_384_n_shift_by_31 mov QWORD PTR[72+rsp],rdx mov QWORD PTR[80+rsp],rcx mov rdx,QWORD PTR[56+rsp] mov rcx,QWORD PTR[64+rsp] lea rsi,QWORD PTR[96+rsi] lea rdi,QWORD PTR[48+rdi] call __smulx_384x63 mov rdx,QWORD PTR[72+rsp] mov rcx,QWORD PTR[80+rsp] lea rdi,QWORD PTR[56+rdi] call __smulx_384x63 xor rsi,256+8*12 mov edi,31 call __ab_approximation_31 mov QWORD PTR[72+rsp],r12 mov QWORD PTR[80+rsp],r13 mov rdi,256 xor rdi,rsi call __smulx_384_n_shift_by_31 mov QWORD PTR[56+rsp],rdx mov QWORD PTR[64+rsp],rcx mov rdx,QWORD PTR[72+rsp] mov rcx,QWORD PTR[80+rsp] lea rdi,QWORD PTR[48+rdi] call __smulx_384_n_shift_by_31 mov QWORD PTR[72+rsp],rdx mov QWORD PTR[80+rsp],rcx mov rdx,QWORD PTR[56+rsp] mov rcx,QWORD PTR[64+rsp] lea rsi,QWORD PTR[96+rsi] lea rdi,QWORD PTR[48+rdi] call __smulx_384x63 mov rdx,QWORD PTR[72+rsp] mov rcx,QWORD PTR[80+rsp] lea rdi,QWORD PTR[56+rdi] call __smulx_384x63 xor rsi,256+8*12 mov edi,31 call __ab_approximation_31 mov QWORD PTR[72+rsp],r12 mov QWORD PTR[80+rsp],r13 mov rdi,256 xor rdi,rsi call __smulx_384_n_shift_by_31 mov QWORD PTR[56+rsp],rdx mov QWORD PTR[64+rsp],rcx mov rdx,QWORD PTR[72+rsp] mov rcx,QWORD PTR[80+rsp] lea rdi,QWORD PTR[48+rdi] call __smulx_384_n_shift_by_31 mov QWORD PTR[72+rsp],rdx mov QWORD PTR[80+rsp],rcx mov rdx,QWORD PTR[56+rsp] mov rcx,QWORD PTR[64+rsp] lea rsi,QWORD PTR[96+rsi] lea rdi,QWORD PTR[48+rdi] call __smulx_384x63 mov rdx,QWORD PTR[72+rsp] mov rcx,QWORD PTR[80+rsp] lea rdi,QWORD PTR[56+rdi] call __smulx_384x63 mov QWORD PTR[56+rdi],r14 mov QWORD PTR[64+rdi],r14 mov QWORD PTR[72+rdi],r14 mov QWORD PTR[80+rdi],r14 mov QWORD PTR[88+rdi],r14 xor rsi,256+8*12 mov edi,31 call __ab_approximation_31 mov QWORD PTR[72+rsp],r12 mov QWORD PTR[80+rsp],r13 mov rdi,256 xor rdi,rsi call __smulx_384_n_shift_by_31 mov QWORD PTR[56+rsp],rdx mov QWORD PTR[64+rsp],rcx mov rdx,QWORD PTR[72+rsp] mov rcx,QWORD PTR[80+rsp] lea rdi,QWORD PTR[48+rdi] call 
__smulx_384_n_shift_by_31 mov QWORD PTR[72+rsp],rdx mov QWORD PTR[80+rsp],rcx mov rdx,QWORD PTR[56+rsp] mov rcx,QWORD PTR[64+rsp] lea rsi,QWORD PTR[96+rsi] lea rdi,QWORD PTR[48+rdi] call __smulx_384x63 mov rdx,QWORD PTR[72+rsp] mov rcx,QWORD PTR[80+rsp] lea rdi,QWORD PTR[56+rdi] call __smulx_768x63 xor rsi,256+8*12 mov edi,31 call __ab_approximation_31 mov QWORD PTR[72+rsp],r12 mov QWORD PTR[80+rsp],r13 mov rdi,256 xor rdi,rsi call __smulx_384_n_shift_by_31 mov QWORD PTR[56+rsp],rdx mov QWORD PTR[64+rsp],rcx mov rdx,QWORD PTR[72+rsp] mov rcx,QWORD PTR[80+rsp] lea rdi,QWORD PTR[48+rdi] call __smulx_384_n_shift_by_31 mov QWORD PTR[72+rsp],rdx mov QWORD PTR[80+rsp],rcx mov rdx,QWORD PTR[56+rsp] mov rcx,QWORD PTR[64+rsp] lea rsi,QWORD PTR[96+rsi] lea rdi,QWORD PTR[48+rdi] call __smulx_384x63 mov rdx,QWORD PTR[72+rsp] mov rcx,QWORD PTR[80+rsp] lea rdi,QWORD PTR[56+rdi] call __smulx_768x63 xor rsi,256+8*12 mov edi,31 call __ab_approximation_31 mov QWORD PTR[72+rsp],r12 mov QWORD PTR[80+rsp],r13 mov rdi,256 xor rdi,rsi call __smulx_384_n_shift_by_31 mov QWORD PTR[56+rsp],rdx mov QWORD PTR[64+rsp],rcx mov rdx,QWORD PTR[72+rsp] mov rcx,QWORD PTR[80+rsp] lea rdi,QWORD PTR[48+rdi] call __smulx_384_n_shift_by_31 mov QWORD PTR[72+rsp],rdx mov QWORD PTR[80+rsp],rcx mov rdx,QWORD PTR[56+rsp] mov rcx,QWORD PTR[64+rsp] lea rsi,QWORD PTR[96+rsi] lea rdi,QWORD PTR[48+rdi] call __smulx_384x63 mov rdx,QWORD PTR[72+rsp] mov rcx,QWORD PTR[80+rsp] lea rdi,QWORD PTR[56+rdi] call __smulx_768x63 xor rsi,256+8*12 mov edi,31 call __ab_approximation_31 mov QWORD PTR[72+rsp],r12 mov QWORD PTR[80+rsp],r13 mov rdi,256 xor rdi,rsi call __smulx_384_n_shift_by_31 mov QWORD PTR[56+rsp],rdx mov QWORD PTR[64+rsp],rcx mov rdx,QWORD PTR[72+rsp] mov rcx,QWORD PTR[80+rsp] lea rdi,QWORD PTR[48+rdi] call __smulx_384_n_shift_by_31 mov QWORD PTR[72+rsp],rdx mov QWORD PTR[80+rsp],rcx mov rdx,QWORD PTR[56+rsp] mov rcx,QWORD PTR[64+rsp] lea rsi,QWORD PTR[96+rsi] lea rdi,QWORD PTR[48+rdi] call __smulx_384x63 mov rdx,QWORD PTR[72+rsp] mov rcx,QWORD PTR[80+rsp] lea rdi,QWORD PTR[56+rdi] call __smulx_768x63 xor rsi,256+8*12 mov edi,31 call __ab_approximation_31 mov QWORD PTR[72+rsp],r12 mov QWORD PTR[80+rsp],r13 mov rdi,256 xor rdi,rsi call __smulx_384_n_shift_by_31 mov QWORD PTR[56+rsp],rdx mov QWORD PTR[64+rsp],rcx mov rdx,QWORD PTR[72+rsp] mov rcx,QWORD PTR[80+rsp] lea rdi,QWORD PTR[48+rdi] call __smulx_384_n_shift_by_31 mov QWORD PTR[72+rsp],rdx mov QWORD PTR[80+rsp],rcx mov rdx,QWORD PTR[56+rsp] mov rcx,QWORD PTR[64+rsp] lea rsi,QWORD PTR[96+rsi] lea rdi,QWORD PTR[48+rdi] call __smulx_384x63 mov rdx,QWORD PTR[72+rsp] mov rcx,QWORD PTR[80+rsp] lea rdi,QWORD PTR[56+rdi] call __smulx_768x63 xor rsi,256+8*12 mov edi,31 call __ab_approximation_31 mov QWORD PTR[72+rsp],r12 mov QWORD PTR[80+rsp],r13 mov rdi,256 xor rdi,rsi call __smulx_384_n_shift_by_31 mov QWORD PTR[56+rsp],rdx mov QWORD PTR[64+rsp],rcx mov rdx,QWORD PTR[72+rsp] mov rcx,QWORD PTR[80+rsp] lea rdi,QWORD PTR[48+rdi] call __smulx_384_n_shift_by_31 mov QWORD PTR[72+rsp],rdx mov QWORD PTR[80+rsp],rcx mov rdx,QWORD PTR[56+rsp] mov rcx,QWORD PTR[64+rsp] lea rsi,QWORD PTR[96+rsi] lea rdi,QWORD PTR[48+rdi] call __smulx_384x63 mov rdx,QWORD PTR[72+rsp] mov rcx,QWORD PTR[80+rsp] lea rdi,QWORD PTR[56+rdi] call __smulx_768x63 xor rsi,256+8*12 mov edi,31 call __ab_approximation_31 mov QWORD PTR[72+rsp],r12 mov QWORD PTR[80+rsp],r13 mov rdi,256 xor rdi,rsi call __smulx_384_n_shift_by_31 mov QWORD PTR[56+rsp],rdx mov QWORD PTR[64+rsp],rcx mov rdx,QWORD PTR[72+rsp] mov rcx,QWORD PTR[80+rsp] lea 
rdi,QWORD PTR[48+rdi] call __smulx_384_n_shift_by_31 mov QWORD PTR[72+rsp],rdx mov QWORD PTR[80+rsp],rcx mov rdx,QWORD PTR[56+rsp] mov rcx,QWORD PTR[64+rsp] lea rsi,QWORD PTR[96+rsi] lea rdi,QWORD PTR[48+rdi] call __smulx_384x63 mov rdx,QWORD PTR[72+rsp] mov rcx,QWORD PTR[80+rsp] lea rdi,QWORD PTR[56+rdi] call __smulx_768x63 xor rsi,256+8*12 mov edi,31 call __ab_approximation_31 mov QWORD PTR[72+rsp],r12 mov QWORD PTR[80+rsp],r13 mov rdi,256 xor rdi,rsi call __smulx_191_n_shift_by_31 mov QWORD PTR[56+rsp],rdx mov QWORD PTR[64+rsp],rcx mov rdx,QWORD PTR[72+rsp] mov rcx,QWORD PTR[80+rsp] lea rdi,QWORD PTR[48+rdi] call __smulx_191_n_shift_by_31 mov QWORD PTR[72+rsp],rdx mov QWORD PTR[80+rsp],rcx mov rdx,QWORD PTR[56+rsp] mov rcx,QWORD PTR[64+rsp] lea rsi,QWORD PTR[96+rsi] lea rdi,QWORD PTR[48+rdi] call __smulx_384x63 mov rdx,QWORD PTR[72+rsp] mov rcx,QWORD PTR[80+rsp] lea rdi,QWORD PTR[56+rdi] call __smulx_768x63 xor rsi,256+8*12 mov edi,31 call __ab_approximation_31 mov QWORD PTR[72+rsp],r12 mov QWORD PTR[80+rsp],r13 mov rdi,256 xor rdi,rsi call __smulx_191_n_shift_by_31 mov QWORD PTR[56+rsp],rdx mov QWORD PTR[64+rsp],rcx mov rdx,QWORD PTR[72+rsp] mov rcx,QWORD PTR[80+rsp] lea rdi,QWORD PTR[48+rdi] call __smulx_191_n_shift_by_31 mov QWORD PTR[72+rsp],rdx mov QWORD PTR[80+rsp],rcx mov rdx,QWORD PTR[56+rsp] mov rcx,QWORD PTR[64+rsp] lea rsi,QWORD PTR[96+rsi] lea rdi,QWORD PTR[48+rdi] call __smulx_384x63 mov rdx,QWORD PTR[72+rsp] mov rcx,QWORD PTR[80+rsp] lea rdi,QWORD PTR[56+rdi] call __smulx_768x63 xor rsi,256+8*12 mov edi,31 call __ab_approximation_31 mov QWORD PTR[72+rsp],r12 mov QWORD PTR[80+rsp],r13 mov rdi,256 xor rdi,rsi call __smulx_191_n_shift_by_31 mov QWORD PTR[56+rsp],rdx mov QWORD PTR[64+rsp],rcx mov rdx,QWORD PTR[72+rsp] mov rcx,QWORD PTR[80+rsp] lea rdi,QWORD PTR[48+rdi] call __smulx_191_n_shift_by_31 mov QWORD PTR[72+rsp],rdx mov QWORD PTR[80+rsp],rcx mov rdx,QWORD PTR[56+rsp] mov rcx,QWORD PTR[64+rsp] lea rsi,QWORD PTR[96+rsi] lea rdi,QWORD PTR[48+rdi] call __smulx_384x63 mov rdx,QWORD PTR[72+rsp] mov rcx,QWORD PTR[80+rsp] lea rdi,QWORD PTR[56+rdi] call __smulx_768x63 xor rsi,256+8*12 mov edi,31 call __ab_approximation_31 mov QWORD PTR[72+rsp],r12 mov QWORD PTR[80+rsp],r13 mov rdi,256 xor rdi,rsi call __smulx_191_n_shift_by_31 mov QWORD PTR[56+rsp],rdx mov QWORD PTR[64+rsp],rcx mov rdx,QWORD PTR[72+rsp] mov rcx,QWORD PTR[80+rsp] lea rdi,QWORD PTR[48+rdi] call __smulx_191_n_shift_by_31 mov QWORD PTR[72+rsp],rdx mov QWORD PTR[80+rsp],rcx mov rdx,QWORD PTR[56+rsp] mov rcx,QWORD PTR[64+rsp] lea rsi,QWORD PTR[96+rsi] lea rdi,QWORD PTR[48+rdi] call __smulx_384x63 mov rdx,QWORD PTR[72+rsp] mov rcx,QWORD PTR[80+rsp] lea rdi,QWORD PTR[56+rdi] call __smulx_768x63 xor rsi,256+8*12 mov edi,55 mov r8,QWORD PTR[rsi] mov r10,QWORD PTR[48+rsi] call __tail_loop_55 lea rsi,QWORD PTR[96+rsi] mov rdx,r12 mov rcx,r13 mov rdi,QWORD PTR[32+rsp] call __smulx_768x63 mov rsi,QWORD PTR[40+rsp] mov r13,rdx sar r13,63 mov r8,r13 mov r9,r13 mov r10,r13 ifdef __SGX_LVI_HARDENING__ lfence endif and r8,QWORD PTR[rsi] and r9,QWORD PTR[8+rsi] mov r11,r13 and r10,QWORD PTR[16+rsi] and r11,QWORD PTR[24+rsi] mov r12,r13 and r12,QWORD PTR[32+rsi] and r13,QWORD PTR[40+rsi] add r14,r8 adc r15,r9 adc rbx,r10 adc rbp,r11 adc rcx,r12 adc rax,r13 adc rdx,0 mov r13,rdx neg rdx or r13,rdx sar rdx,63 mov r8,r13 mov r9,r13 mov r10,r13 and r8,QWORD PTR[rsi] and r9,QWORD PTR[8+rsi] mov r11,r13 and r10,QWORD PTR[16+rsi] and r11,QWORD PTR[24+rsi] mov r12,r13 and r12,QWORD PTR[32+rsi] and r13,QWORD PTR[40+rsi] xor r8,rdx xor 
rsi,rsi xor r9,rdx sub rsi,rdx xor r10,rdx xor r11,rdx xor r12,rdx xor r13,rdx add r8,rsi adc r9,0 adc r10,0 adc r11,0 adc r12,0 adc r13,0 add r14,r8 adc r15,r9 adc rbx,r10 adc rbp,r11 adc rcx,r12 adc rax,r13 mov QWORD PTR[48+rdi],r14 mov QWORD PTR[56+rdi],r15 mov QWORD PTR[64+rdi],rbx mov QWORD PTR[72+rdi],rbp mov QWORD PTR[80+rdi],rcx mov QWORD PTR[88+rdi],rax lea r8,QWORD PTR[1112+rsp] mov r15,QWORD PTR[r8] mov r14,QWORD PTR[8+r8] mov r13,QWORD PTR[16+r8] mov r12,QWORD PTR[24+r8] mov rbx,QWORD PTR[32+r8] mov rbp,QWORD PTR[40+r8] lea rsp,QWORD PTR[48+r8] $L$SEH_epilogue_ctx_inverse_mod_384:: mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue mov rsi,QWORD PTR[16+rsp] ifdef __SGX_LVI_HARDENING__ pop rdx lfence jmp rdx ud2 else DB 0F3h,0C3h endif $L$SEH_end_ctx_inverse_mod_384:: ctx_inverse_mod_384 ENDP ALIGN 32 __smulx_768x63 PROC PRIVATE DB 243,15,30,250 mov r8,QWORD PTR[rsi] mov r9,QWORD PTR[8+rsi] mov r10,QWORD PTR[16+rsi] mov r11,QWORD PTR[24+rsi] mov r12,QWORD PTR[32+rsi] mov r13,QWORD PTR[40+rsi] mov r14,QWORD PTR[48+rsi] mov rax,rdx sar rax,63 xor rbp,rbp sub rbp,rax mov QWORD PTR[8+rsp],rdi mov QWORD PTR[16+rsp],rsi lea rsi,QWORD PTR[56+rsi] xor rdx,rax add rdx,rbp xor r8,rax xor r9,rax xor r10,rax xor r11,rax xor r12,rax xor r13,rax xor r14,rax add r8,rbp adc r9,0 adc r10,0 adc r11,0 adc r12,0 adc r13,0 adc r14,0 and r14,rdx neg r14 mulx rbp,r8,r8 mulx rax,r9,r9 add r9,rbp mulx rbp,r10,r10 adc r10,rax mulx rax,r11,r11 adc r11,rbp mulx rbp,r12,r12 adc r12,rax mulx rax,r13,r13 adc r13,rbp adc r14,rax mov QWORD PTR[rdi],r8 mov QWORD PTR[8+rdi],r9 mov QWORD PTR[16+rdi],r10 mov QWORD PTR[24+rdi],r11 mov QWORD PTR[32+rdi],r12 mov QWORD PTR[40+rdi],r13 mov QWORD PTR[48+rdi],r14 sar r14,63 mov QWORD PTR[56+rdi],r14 mov rdx,rcx mov rax,rcx mov r8,QWORD PTR[rsi] mov r9,QWORD PTR[8+rsi] mov r10,QWORD PTR[16+rsi] mov r11,QWORD PTR[24+rsi] mov r12,QWORD PTR[32+rsi] mov r13,QWORD PTR[40+rsi] mov r14,QWORD PTR[48+rsi] mov r15,QWORD PTR[56+rsi] mov rbx,QWORD PTR[64+rsi] mov rbp,QWORD PTR[72+rsi] mov rcx,QWORD PTR[80+rsi] mov rdi,QWORD PTR[88+rsi] sar rax,63 xor rsi,rsi sub rsi,rax xor rdx,rax add rdx,rsi xor r8,rax xor r9,rax xor r10,rax xor r11,rax xor r12,rax xor r13,rax xor r14,rax xor r15,rax xor rbx,rax xor rbp,rax xor rcx,rax xor rax,rdi add r8,rsi adc r9,0 adc r10,0 adc r11,0 adc r12,0 adc r13,0 adc r14,0 adc r15,0 adc rbx,0 adc rbp,0 adc rcx,0 adc rax,0 mulx rsi,r8,r8 mulx rdi,r9,r9 add r9,rsi mulx rsi,r10,r10 adc r10,rdi mulx rdi,r11,r11 adc r11,rsi mulx rsi,r12,r12 adc r12,rdi mulx rdi,r13,r13 adc r13,rsi mulx rsi,r14,r14 adc r14,rdi mulx rdi,r15,r15 adc r15,rsi mulx rsi,rbx,rbx adc rbx,rdi mulx rdi,rbp,rbp adc rbp,rsi mulx rsi,rcx,rcx adc rcx,rdi mov rdi,QWORD PTR[8+rsp] adc rsi,0 imul rdx add rax,rsi adc rdx,0 add r8,QWORD PTR[rdi] adc r9,QWORD PTR[8+rdi] adc r10,QWORD PTR[16+rdi] adc r11,QWORD PTR[24+rdi] adc r12,QWORD PTR[32+rdi] adc r13,QWORD PTR[40+rdi] adc r14,QWORD PTR[48+rdi] mov rsi,QWORD PTR[56+rdi] adc r15,rsi adc rbx,rsi adc rbp,rsi adc rcx,rsi adc rax,rsi adc rdx,rsi mov rsi,QWORD PTR[16+rsp] mov QWORD PTR[rdi],r8 mov QWORD PTR[8+rdi],r9 mov QWORD PTR[16+rdi],r10 mov QWORD PTR[24+rdi],r11 mov QWORD PTR[32+rdi],r12 mov QWORD PTR[40+rdi],r13 mov QWORD PTR[48+rdi],r14 mov QWORD PTR[56+rdi],r15 mov QWORD PTR[64+rdi],rbx mov QWORD PTR[72+rdi],rbp mov QWORD PTR[80+rdi],rcx mov QWORD PTR[88+rdi],rax ifdef __SGX_LVI_HARDENING__ pop r8 lfence jmp r8 ud2 else DB 0F3h,0C3h endif __smulx_768x63 ENDP ALIGN 32 __smulx_384x63 PROC PRIVATE DB 243,15,30,250 mov r8,QWORD PTR[((0+0))+rsi] mov 
r9,QWORD PTR[((0+8))+rsi] mov r10,QWORD PTR[((0+16))+rsi] mov r11,QWORD PTR[((0+24))+rsi] mov r12,QWORD PTR[((0+32))+rsi] mov r13,QWORD PTR[((0+40))+rsi] mov r14,QWORD PTR[((0+48))+rsi] mov rbp,rdx sar rbp,63 xor rax,rax sub rax,rbp xor rdx,rbp add rdx,rax xor r8,rbp xor r9,rbp xor r10,rbp xor r11,rbp xor r12,rbp xor r13,rbp xor r14,rbp add r8,rax adc r9,0 adc r10,0 adc r11,0 adc r12,0 adc r13,0 adc r14,0 and r14,rdx neg r14 mulx rbp,r8,r8 mulx rax,r9,r9 add r9,rbp mulx rbp,r10,r10 adc r10,rax mulx rax,r11,r11 adc r11,rbp mulx rbp,r12,r12 adc r12,rax mulx rax,r13,r13 mov rdx,rcx adc r13,rbp adc r14,rax mov QWORD PTR[rdi],r8 mov QWORD PTR[8+rdi],r9 mov QWORD PTR[16+rdi],r10 mov QWORD PTR[24+rdi],r11 mov QWORD PTR[32+rdi],r12 mov r15,r13 mov rbx,r14 mov r8,QWORD PTR[((56+0))+rsi] mov r9,QWORD PTR[((56+8))+rsi] mov r10,QWORD PTR[((56+16))+rsi] mov r11,QWORD PTR[((56+24))+rsi] mov r12,QWORD PTR[((56+32))+rsi] mov r13,QWORD PTR[((56+40))+rsi] mov r14,QWORD PTR[((56+48))+rsi] mov rbp,rdx sar rbp,63 xor rax,rax sub rax,rbp xor rdx,rbp add rdx,rax xor r8,rbp xor r9,rbp xor r10,rbp xor r11,rbp xor r12,rbp xor r13,rbp xor r14,rbp add r8,rax adc r9,0 adc r10,0 adc r11,0 adc r12,0 adc r13,0 adc r14,0 and r14,rdx neg r14 mulx rbp,r8,r8 mulx rax,r9,r9 add r9,rbp mulx rbp,r10,r10 adc r10,rax mulx rax,r11,r11 adc r11,rbp mulx rbp,r12,r12 adc r12,rax mulx rax,r13,r13 adc r13,rbp adc r14,rax add r8,QWORD PTR[rdi] adc r9,QWORD PTR[8+rdi] adc r10,QWORD PTR[16+rdi] adc r11,QWORD PTR[24+rdi] adc r12,QWORD PTR[32+rdi] adc r13,r15 adc r14,rbx mov QWORD PTR[rdi],r8 mov QWORD PTR[8+rdi],r9 mov QWORD PTR[16+rdi],r10 mov QWORD PTR[24+rdi],r11 mov QWORD PTR[32+rdi],r12 mov QWORD PTR[40+rdi],r13 mov QWORD PTR[48+rdi],r14 ifdef __SGX_LVI_HARDENING__ pop r8 lfence jmp r8 ud2 else DB 0F3h,0C3h endif __smulx_384x63 ENDP ALIGN 32 __smulx_384_n_shift_by_31 PROC PRIVATE DB 243,15,30,250 mov rbx,rdx mov r8,QWORD PTR[((0+0))+rsi] mov r9,QWORD PTR[((0+8))+rsi] mov r10,QWORD PTR[((0+16))+rsi] mov r11,QWORD PTR[((0+24))+rsi] mov r12,QWORD PTR[((0+32))+rsi] mov r13,QWORD PTR[((0+40))+rsi] mov rax,rdx sar rax,63 xor rbp,rbp sub rbp,rax xor rdx,rax add rdx,rbp xor r8,rax xor r9,rax xor r10,rax xor r11,rax xor r12,rax xor r13,rax add r8,rbp adc r9,0 adc r10,0 adc r11,0 adc r12,0 adc r13,0 and rax,rdx neg rax mulx rbp,r8,r8 mulx r14,r9,r9 add r9,rbp mulx rbp,r10,r10 adc r10,r14 mulx r14,r11,r11 adc r11,rbp mulx rbp,r12,r12 adc r12,r14 mulx r14,r13,r13 adc r13,rbp adc r14,rax mov rdx,rcx mov QWORD PTR[rdi],r8 mov QWORD PTR[8+rdi],r9 mov QWORD PTR[16+rdi],r10 mov QWORD PTR[24+rdi],r11 mov QWORD PTR[32+rdi],r12 mov QWORD PTR[40+rdi],r13 mov r15,r14 mov r8,QWORD PTR[((48+0))+rsi] mov r9,QWORD PTR[((48+8))+rsi] mov r10,QWORD PTR[((48+16))+rsi] mov r11,QWORD PTR[((48+24))+rsi] mov r12,QWORD PTR[((48+32))+rsi] mov r13,QWORD PTR[((48+40))+rsi] mov rax,rdx sar rax,63 xor rbp,rbp sub rbp,rax xor rdx,rax add rdx,rbp xor r8,rax xor r9,rax xor r10,rax xor r11,rax xor r12,rax xor r13,rax add r8,rbp adc r9,0 adc r10,0 adc r11,0 adc r12,0 adc r13,0 and rax,rdx neg rax mulx rbp,r8,r8 mulx r14,r9,r9 add r9,rbp mulx rbp,r10,r10 adc r10,r14 mulx r14,r11,r11 adc r11,rbp mulx rbp,r12,r12 adc r12,r14 mulx r14,r13,r13 adc r13,rbp adc r14,rax add r8,QWORD PTR[rdi] adc r9,QWORD PTR[8+rdi] adc r10,QWORD PTR[16+rdi] adc r11,QWORD PTR[24+rdi] adc r12,QWORD PTR[32+rdi] adc r13,QWORD PTR[40+rdi] adc r14,r15 mov rdx,rbx shrd r8,r9,31 shrd r9,r10,31 shrd r10,r11,31 shrd r11,r12,31 shrd r12,r13,31 shrd r13,r14,31 sar r14,63 xor rbp,rbp sub rbp,r14 xor r8,r14 xor r9,r14 
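
The sar-by-63 / xor / add runs above, repeated throughout these __smulx helpers, are the standard branchless conditional-negate idiom: broadcast a sign bit into an all-zeros or all-ones mask, XOR every limb with it, and add the mask's low bit back in as a carry. A minimal C sketch of the same technique (illustrative names, not the library's interface):

#include <stdint.h>
#include <stddef.h>

/* Conditionally negate an n-limb value in constant time.  mask is 0
 * (keep) or ~0 (negate), e.g. (uint64_t)((int64_t)sign_word >> 63),
 * which is what the 'sar reg,63' above computes. */
static void cneg_n(uint64_t r[], const uint64_t a[], size_t n, uint64_t mask)
{
    uint64_t carry = mask & 1;          /* the +1 of two's complement */
    for (size_t i = 0; i < n; i++) {
        uint64_t t = a[i] ^ mask;       /* bitwise NOT when negating */
        r[i] = t + carry;
        carry = r[i] < carry;           /* propagate the +1 */
    }
}
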
xor r10,r14 xor r11,r14 xor r12,r14 xor r13,r14 add r8,rbp adc r9,0 adc r10,0 adc r11,0 adc r12,0 adc r13,0 mov QWORD PTR[rdi],r8 mov QWORD PTR[8+rdi],r9 mov QWORD PTR[16+rdi],r10 mov QWORD PTR[24+rdi],r11 mov QWORD PTR[32+rdi],r12 mov QWORD PTR[40+rdi],r13 xor rdx,r14 xor rcx,r14 add rdx,rbp add rcx,rbp ifdef __SGX_LVI_HARDENING__ pop r8 lfence jmp r8 ud2 else DB 0F3h,0C3h endif __smulx_384_n_shift_by_31 ENDP ALIGN 32 __smulx_191_n_shift_by_31 PROC PRIVATE DB 243,15,30,250 mov rbx,rdx mov r8,QWORD PTR[((0+0))+rsi] mov r9,QWORD PTR[((0+8))+rsi] mov r10,QWORD PTR[((0+16))+rsi] mov rax,rdx sar rax,63 xor rbp,rbp sub rbp,rax xor rdx,rax add rdx,rbp xor r8,rax xor r9,rax xor rax,r10 add r8,rbp adc r9,0 adc rax,0 mulx rbp,r8,r8 mulx r10,r9,r9 add r9,rbp adc r10,0 imul rdx add r10,rax adc rdx,0 mov r14,rdx mov rdx,rcx mov r11,QWORD PTR[((48+0))+rsi] mov r12,QWORD PTR[((48+8))+rsi] mov r13,QWORD PTR[((48+16))+rsi] mov rax,rdx sar rax,63 xor rbp,rbp sub rbp,rax xor rdx,rax add rdx,rbp xor r11,rax xor r12,rax xor rax,r13 add r11,rbp adc r12,0 adc rax,0 mulx rbp,r11,r11 mulx r13,r12,r12 add r12,rbp adc r13,0 imul rdx add r13,rax adc rdx,0 add r11,r8 adc r12,r9 adc r13,r10 adc r14,rdx mov rdx,rbx shrd r11,r12,31 shrd r12,r13,31 shrd r13,r14,31 sar r14,63 xor rbp,rbp sub rbp,r14 xor r11,r14 xor r12,r14 xor r13,r14 add r11,rbp adc r12,0 adc r13,0 mov QWORD PTR[rdi],r11 mov QWORD PTR[8+rdi],r12 mov QWORD PTR[16+rdi],r13 xor rdx,r14 xor rcx,r14 add rdx,rbp add rcx,rbp ifdef __SGX_LVI_HARDENING__ pop r8 lfence jmp r8 ud2 else DB 0F3h,0C3h endif __smulx_191_n_shift_by_31 ENDP ALIGN 32 __ab_approximation_31 PROC PRIVATE DB 243,15,30,250 mov r9,QWORD PTR[40+rsi] mov r11,QWORD PTR[88+rsi] mov rbx,QWORD PTR[32+rsi] mov rbp,QWORD PTR[80+rsi] mov r8,QWORD PTR[24+rsi] mov r10,QWORD PTR[72+rsi] mov rax,r9 or rax,r11 cmovz r9,rbx cmovz r11,rbp cmovz rbx,r8 mov r8,QWORD PTR[16+rsi] cmovz rbp,r10 mov r10,QWORD PTR[64+rsi] mov rax,r9 or rax,r11 cmovz r9,rbx cmovz r11,rbp cmovz rbx,r8 mov r8,QWORD PTR[8+rsi] cmovz rbp,r10 mov r10,QWORD PTR[56+rsi] mov rax,r9 or rax,r11 cmovz r9,rbx cmovz r11,rbp cmovz rbx,r8 mov r8,QWORD PTR[rsi] cmovz rbp,r10 mov r10,QWORD PTR[48+rsi] mov rax,r9 or rax,r11 cmovz r9,rbx cmovz r11,rbp cmovz rbx,r8 cmovz rbp,r10 mov rax,r9 or rax,r11 bsr rcx,rax lea rcx,QWORD PTR[1+rcx] cmovz r9,r8 cmovz r11,r10 cmovz rcx,rax neg rcx shld r9,rbx,cl shld r11,rbp,cl mov eax,07FFFFFFFh and r8,rax and r10,rax andn r9,rax,r9 andn r11,rax,r11 or r8,r9 or r10,r11 jmp __inner_loop_31 ifdef __SGX_LVI_HARDENING__ pop rdx lfence jmp rdx ud2 else DB 0F3h,0C3h endif __ab_approximation_31 ENDP ALIGN 32 __inner_loop_31 PROC PRIVATE DB 243,15,30,250 mov rcx,07FFFFFFF80000000h mov r13,0800000007FFFFFFFh mov r15,07FFFFFFF7FFFFFFFh $L$oop_31:: cmp r8,r10 mov rax,r8 mov rbx,r10 mov rbp,rcx mov r14,r13 cmovb r8,r10 cmovb r10,rax cmovb rcx,r13 cmovb r13,rbp sub r8,r10 sub rcx,r13 add rcx,r15 test rax,1 cmovz r8,rax cmovz r10,rbx cmovz rcx,rbp cmovz r13,r14 shr r8,1 add r13,r13 sub r13,r15 sub edi,1 jnz $L$oop_31 shr r15,32 mov edx,ecx mov r12d,r13d shr rcx,32 shr r13,32 sub rdx,r15 sub rcx,r15 sub r12,r15 sub r13,r15 ifdef __SGX_LVI_HARDENING__ pop r8 lfence jmp r8 ud2 else DB 0F3h,0C3h endif __inner_loop_31 ENDP ALIGN 32 __tail_loop_55 PROC PRIVATE DB 243,15,30,250 mov rdx,1 xor rcx,rcx xor r12,r12 mov r13,1 $L$oop_55:: xor rax,rax test r8,1 mov rbx,r10 cmovnz rax,r10 sub rbx,r8 mov rbp,r8 sub r8,rax cmovc r8,rbx cmovc r10,rbp mov rax,rdx cmovc rdx,r12 cmovc r12,rax mov rbx,rcx cmovc rcx,r13 cmovc r13,rbx xor rax,rax xor 
rbx,rbx shr r8,1 test rbp,1 cmovnz rax,r12 cmovnz rbx,r13 add r12,r12 add r13,r13 sub rdx,rax sub rcx,rbx sub edi,1 jnz $L$oop_55 ifdef __SGX_LVI_HARDENING__ pop r8 lfence jmp r8 ud2 else DB 0F3h,0C3h endif __tail_loop_55 ENDP .text$ ENDS .pdata SEGMENT READONLY ALIGN(4) ALIGN 4 DD imagerel $L$SEH_begin_ctx_inverse_mod_384 DD imagerel $L$SEH_body_ctx_inverse_mod_384 DD imagerel $L$SEH_info_ctx_inverse_mod_384_prologue DD imagerel $L$SEH_body_ctx_inverse_mod_384 DD imagerel $L$SEH_epilogue_ctx_inverse_mod_384 DD imagerel $L$SEH_info_ctx_inverse_mod_384_body DD imagerel $L$SEH_epilogue_ctx_inverse_mod_384 DD imagerel $L$SEH_end_ctx_inverse_mod_384 DD imagerel $L$SEH_info_ctx_inverse_mod_384_epilogue .pdata ENDS .xdata SEGMENT READONLY ALIGN(8) ALIGN 8 $L$SEH_info_ctx_inverse_mod_384_prologue:: DB 1,0,5,00bh DB 0,074h,1,0 DB 0,064h,2,0 DB 0,0b3h DB 0,0 DD 0,0 $L$SEH_info_ctx_inverse_mod_384_body:: DB 1,0,18,0 DB 000h,0f4h,08bh,000h DB 000h,0e4h,08ch,000h DB 000h,0d4h,08dh,000h DB 000h,0c4h,08eh,000h DB 000h,034h,08fh,000h DB 000h,054h,090h,000h DB 000h,074h,092h,000h DB 000h,064h,093h,000h DB 000h,001h,091h,000h DB 000h,000h,000h,000h DB 000h,000h,000h,000h $L$SEH_info_ctx_inverse_mod_384_epilogue:: DB 1,0,4,0 DB 000h,074h,001h,000h DB 000h,064h,002h,000h DB 000h,000h,000h,000h .xdata ENDS END ================================================ FILE: build/win64/div3w-armv8.asm ================================================ GBLA __SIZEOF_POINTER__ __SIZEOF_POINTER__ SETA 64/8 AREA |.text|,CODE,ALIGN=8,ARM64 EXPORT |div_3_limbs|[FUNC] ALIGN 32 |div_3_limbs| PROC hint #34 ldp x4,x5,[x0] eor x0,x0,x0 mov x3,#64 nop |$Loop| subs x6,x4,x1 add x0,x0,x0 sbcs x7,x5,x2 add x0,x0,#1 csello x4,x4,x6 extr x1,x2,x1,#1 csello x5,x5,x7 lsr x2,x2,#1 sbc x0,x0,xzr sub x3,x3,#1 cbnz x3,|$Loop| asr x3,x0,#63 add x0,x0,x0 subs x6,x4,x1 add x0,x0,#1 sbcs x7,x5,x2 sbc x0,x0,xzr orr x0,x0,x3 ret ENDP EXPORT |quot_rem_128|[FUNC] ALIGN 32 |quot_rem_128| PROC hint #34 ldp x3,x4,[x1] mul x5,x3,x2 umulh x6,x3,x2 mul x11, x4,x2 umulh x7,x4,x2 ldp x8,x9,[x0] ldr x10,[x0,#16] adds x6,x6,x11 adc x7,x7,xzr subs x8,x8,x5 sbcs x9,x9,x6 sbcs x10,x10,x7 sbc x5,xzr,xzr add x2,x2,x5 and x3,x3,x5 and x4,x4,x5 adds x8,x8,x3 adc x9,x9,x4 stp x8,x9,[x0] str x2,[x0,#16] mov x0,x2 ret ENDP EXPORT |quot_rem_64|[FUNC] ALIGN 32 |quot_rem_64| PROC hint #34 ldr x3,[x1] ldr x8,[x0] mul x5,x3,x2 sub x8,x8,x5 stp x8,x2,[x0] mov x0,x2 ret ENDP END ================================================ FILE: build/win64/div3w-x86_64.asm ================================================ OPTION DOTNAME .text$ SEGMENT ALIGN(256) 'CODE' PUBLIC div_3_limbs ALIGN 32 div_3_limbs PROC PUBLIC DB 243,15,30,250 mov QWORD PTR[8+rsp],rdi ;WIN64 prologue mov QWORD PTR[16+rsp],rsi mov r11,rsp $L$SEH_begin_div_3_limbs:: mov rdi,rcx mov rsi,rdx mov rdx,r8 $L$SEH_body_div_3_limbs:: ifdef __SGX_LVI_HARDENING__ lfence endif mov r8,QWORD PTR[rdi] mov r9,QWORD PTR[8+rdi] xor rax,rax mov ecx,64 $L$oop:: mov r10,r8 sub r8,rsi mov r11,r9 sbb r9,rdx lea rax,QWORD PTR[1+rax*1+rax] mov rdi,rdx cmovc r8,r10 cmovc r9,r11 sbb rax,0 shl rdi,63 shr rsi,1 shr rdx,1 or rsi,rdi sub ecx,1 jnz $L$oop lea rcx,QWORD PTR[1+rax*1+rax] sar rax,63 sub r8,rsi sbb r9,rdx sbb rcx,0 or rax,rcx $L$SEH_epilogue_div_3_limbs:: mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue mov rsi,QWORD PTR[16+rsp] ifdef __SGX_LVI_HARDENING__ pop rdx lfence jmp rdx ud2 else DB 0F3h,0C3h endif $L$SEH_end_div_3_limbs:: div_3_limbs ENDP PUBLIC quot_rem_128 ALIGN 32 quot_rem_128 PROC PUBLIC DB 243,15,30,250 mov QWORD PTR[8+rsp],rdi 
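
div_3_limbs above develops a 64-bit quotient one bit per iteration: trial-subtract the two-limb divisor, keep the difference (via cmov) only when it does not borrow, shift the divisor right, and repeat. The same restoring-division idea in miniature, for a single 64-bit dividend (a C sketch with a hypothetical helper name, not part of the library; assumes unsigned __int128 support):

#include <stdint.h>

typedef unsigned __int128 u128;   /* GCC/Clang extension */

/* floor(n / d) for d != 0, one quotient bit per step, with a
 * branchless select in place of the cmovs. */
static uint64_t udiv_restoring(uint64_t n, uint64_t d)
{
    uint64_t q = 0;
    u128 rem = 0;
    for (int i = 63; i >= 0; i--) {
        rem = (rem << 1) | ((n >> i) & 1);
        uint64_t ge = (uint64_t)0 - (uint64_t)(rem >= d);
        rem -= (u128)(d & ge);          /* keep the subtraction iff rem >= d */
        q |= (uint64_t)(ge & 1) << i;
    }
    return q;
}
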
;WIN64 prologue mov QWORD PTR[16+rsp],rsi mov r11,rsp $L$SEH_begin_quot_rem_128:: mov rdi,rcx mov rsi,rdx mov rdx,r8 $L$SEH_body_quot_rem_128:: ifdef __SGX_LVI_HARDENING__ lfence endif mov rax,rdx mov rcx,rdx mul QWORD PTR[rsi] mov r8,rax mov rax,rcx mov r9,rdx mul QWORD PTR[8+rsi] add r9,rax adc rdx,0 mov r10,QWORD PTR[rdi] mov r11,QWORD PTR[8+rdi] mov rax,QWORD PTR[16+rdi] sub r10,r8 sbb r11,r9 sbb rax,rdx sbb r8,r8 add rcx,r8 mov r9,r8 and r8,QWORD PTR[rsi] and r9,QWORD PTR[8+rsi] add r10,r8 adc r11,r9 mov QWORD PTR[rdi],r10 mov QWORD PTR[8+rdi],r11 mov QWORD PTR[16+rdi],rcx mov rax,rcx $L$SEH_epilogue_quot_rem_128:: mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue mov rsi,QWORD PTR[16+rsp] ifdef __SGX_LVI_HARDENING__ pop rdx lfence jmp rdx ud2 else DB 0F3h,0C3h endif $L$SEH_end_quot_rem_128:: quot_rem_128 ENDP PUBLIC quot_rem_64 ALIGN 32 quot_rem_64 PROC PUBLIC DB 243,15,30,250 mov QWORD PTR[8+rsp],rdi ;WIN64 prologue mov QWORD PTR[16+rsp],rsi mov r11,rsp $L$SEH_begin_quot_rem_64:: mov rdi,rcx mov rsi,rdx mov rdx,r8 $L$SEH_body_quot_rem_64:: ifdef __SGX_LVI_HARDENING__ lfence endif mov rax,rdx imul rdx,QWORD PTR[rsi] mov r10,QWORD PTR[rdi] sub r10,rdx mov QWORD PTR[rdi],r10 mov QWORD PTR[8+rdi],rax $L$SEH_epilogue_quot_rem_64:: mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue mov rsi,QWORD PTR[16+rsp] ifdef __SGX_LVI_HARDENING__ pop rdx lfence jmp rdx ud2 else DB 0F3h,0C3h endif $L$SEH_end_quot_rem_64:: quot_rem_64 ENDP .text$ ENDS .pdata SEGMENT READONLY ALIGN(4) ALIGN 4 DD imagerel $L$SEH_begin_div_3_limbs DD imagerel $L$SEH_body_div_3_limbs DD imagerel $L$SEH_info_div_3_limbs_prologue DD imagerel $L$SEH_body_div_3_limbs DD imagerel $L$SEH_epilogue_div_3_limbs DD imagerel $L$SEH_info_div_3_limbs_body DD imagerel $L$SEH_epilogue_div_3_limbs DD imagerel $L$SEH_end_div_3_limbs DD imagerel $L$SEH_info_div_3_limbs_epilogue DD imagerel $L$SEH_begin_quot_rem_128 DD imagerel $L$SEH_body_quot_rem_128 DD imagerel $L$SEH_info_quot_rem_128_prologue DD imagerel $L$SEH_body_quot_rem_128 DD imagerel $L$SEH_epilogue_quot_rem_128 DD imagerel $L$SEH_info_quot_rem_128_body DD imagerel $L$SEH_epilogue_quot_rem_128 DD imagerel $L$SEH_end_quot_rem_128 DD imagerel $L$SEH_info_quot_rem_128_epilogue DD imagerel $L$SEH_begin_quot_rem_64 DD imagerel $L$SEH_body_quot_rem_64 DD imagerel $L$SEH_info_quot_rem_64_prologue DD imagerel $L$SEH_body_quot_rem_64 DD imagerel $L$SEH_epilogue_quot_rem_64 DD imagerel $L$SEH_info_quot_rem_64_body DD imagerel $L$SEH_epilogue_quot_rem_64 DD imagerel $L$SEH_end_quot_rem_64 DD imagerel $L$SEH_info_quot_rem_64_epilogue .pdata ENDS .xdata SEGMENT READONLY ALIGN(8) ALIGN 8 $L$SEH_info_div_3_limbs_prologue:: DB 1,0,5,00bh DB 0,074h,1,0 DB 0,064h,2,0 DB 0,0b3h DB 0,0 DD 0,0 $L$SEH_info_div_3_limbs_body:: DB 1,0,4,0 DB 000h,074h,001h,000h DB 000h,064h,002h,000h DB 000h,000h,000h,000h $L$SEH_info_div_3_limbs_epilogue:: DB 1,0,4,0 DB 000h,074h,001h,000h DB 000h,064h,002h,000h DB 000h,000h,000h,000h $L$SEH_info_quot_rem_128_prologue:: DB 1,0,5,00bh DB 0,074h,1,0 DB 0,064h,2,0 DB 0,0b3h DB 0,0 DD 0,0 $L$SEH_info_quot_rem_128_body:: DB 1,0,4,0 DB 000h,074h,001h,000h DB 000h,064h,002h,000h DB 000h,000h,000h,000h $L$SEH_info_quot_rem_128_epilogue:: DB 1,0,4,0 DB 000h,074h,001h,000h DB 000h,064h,002h,000h DB 000h,000h,000h,000h $L$SEH_info_quot_rem_64_prologue:: DB 1,0,5,00bh DB 0,074h,1,0 DB 0,064h,2,0 DB 0,0b3h DB 0,0 DD 0,0 $L$SEH_info_quot_rem_64_body:: DB 1,0,4,0 DB 000h,074h,001h,000h DB 000h,064h,002h,000h DB 000h,000h,000h,000h $L$SEH_info_quot_rem_64_epilogue:: DB 1,0,4,0 DB 000h,074h,001h,000h DB 
000h,064h,002h,000h DB 000h,000h,000h,000h .xdata ENDS END
================================================ FILE: build/win64/dll.c ================================================
#include <windows.h>

#if defined(_MSC_VER)
/*
 * Even though we don't have memcpy/memset anywhere, MSVC compiler
 * generates calls to them as it recognizes corresponding patterns.
 */
void *memcpy(unsigned char *dst, const unsigned char *src, size_t n)
{
    void *ret = dst;
    while (n--)
        *dst++ = *src++;
    return ret;
}

void *memset(unsigned char *dst, int c, size_t n)
{
    void *ret = dst;
    while (n--)
        *dst++ = (unsigned char)c;
    return ret;
}
#elif defined(__GNUC__)
# pragma GCC diagnostic ignored "-Wunused-parameter"
#endif

BOOL WINAPI DllMain(HINSTANCE hinstDLL, DWORD fdwReason, LPVOID lpvReserved)
{
    return TRUE;
}
================================================ FILE: build/win64/mul_mont_256-armv8.asm ================================================
GBLA __SIZEOF_POINTER__ __SIZEOF_POINTER__ SETA 64/8 AREA |.text|,CODE,ALIGN=8,ARM64 EXPORT |mul_mont_sparse_256|[FUNC] ALIGN 32 |mul_mont_sparse_256| PROC hint #34 stp x29,x30,[sp,#-8*__SIZEOF_POINTER__]! add x29,sp,#0 stp x19,x20,[sp,#2*__SIZEOF_POINTER__] stp x21,x22,[sp,#4*__SIZEOF_POINTER__] stp x23,x24,[sp,#6*__SIZEOF_POINTER__] ldp x10,x11,[x1] ldr x9, [x2] ldp x12,x13,[x1,#16] mul x19,x10,x9 ldp x5,x6,[x3] mul x20,x11,x9 ldp x7,x8,[x3,#16] mul x21,x12,x9 mul x22,x13,x9 umulh x14,x10,x9 umulh x15,x11,x9 mul x3,x4,x19 umulh x16,x12,x9 umulh x17,x13,x9 adds x20,x20,x14 adcs x21,x21,x15 mul x15,x6,x3 adcs x22,x22,x16 mul x16,x7,x3 adc x23,xzr, x17 mul x17,x8,x3 ldr x9,[x2,8*1] subs xzr,x19,#1 umulh x14,x5,x3 adcs x20,x20,x15 umulh x15,x6,x3 adcs x21,x21,x16 umulh x16,x7,x3 adcs x22,x22,x17 umulh x17,x8,x3 adc x23,x23,xzr adds x19,x20,x14 mul x14,x10,x9 adcs x20,x21,x15 mul x15,x11,x9 adcs x21,x22,x16 mul x16,x12,x9 adcs x22,x23,x17 mul x17,x13,x9 adc x23,xzr,xzr adds x19,x19,x14 umulh x14,x10,x9 adcs x20,x20,x15 umulh x15,x11,x9 adcs x21,x21,x16 mul x3,x4,x19 umulh x16,x12,x9 adcs x22,x22,x17 umulh x17,x13,x9 adc x23,x23,xzr adds x20,x20,x14 adcs x21,x21,x15 mul x15,x6,x3 adcs x22,x22,x16 mul x16,x7,x3 adc x23,x23,x17 mul x17,x8,x3 ldr x9,[x2,8*2] subs xzr,x19,#1 umulh x14,x5,x3 adcs x20,x20,x15 umulh x15,x6,x3 adcs x21,x21,x16 umulh x16,x7,x3 adcs x22,x22,x17 umulh x17,x8,x3 adc x23,x23,xzr adds x19,x20,x14 mul x14,x10,x9 adcs x20,x21,x15 mul x15,x11,x9 adcs x21,x22,x16 mul x16,x12,x9 adcs x22,x23,x17 mul x17,x13,x9 adc x23,xzr,xzr adds x19,x19,x14 umulh x14,x10,x9 adcs x20,x20,x15 umulh x15,x11,x9 adcs x21,x21,x16 mul x3,x4,x19 umulh x16,x12,x9 adcs x22,x22,x17 umulh x17,x13,x9 adc x23,x23,xzr adds x20,x20,x14 adcs x21,x21,x15 mul x15,x6,x3 adcs x22,x22,x16 mul x16,x7,x3 adc x23,x23,x17 mul x17,x8,x3 ldr x9,[x2,8*3] subs xzr,x19,#1 umulh x14,x5,x3 adcs x20,x20,x15 umulh x15,x6,x3 adcs x21,x21,x16 umulh x16,x7,x3 adcs x22,x22,x17 umulh x17,x8,x3 adc x23,x23,xzr adds x19,x20,x14 mul x14,x10,x9 adcs x20,x21,x15 mul x15,x11,x9 adcs x21,x22,x16 mul x16,x12,x9 adcs x22,x23,x17 mul x17,x13,x9 adc x23,xzr,xzr adds x19,x19,x14 umulh x14,x10,x9 adcs x20,x20,x15 umulh x15,x11,x9 adcs x21,x21,x16 mul x3,x4,x19 umulh x16,x12,x9 adcs x22,x22,x17 umulh x17,x13,x9 adc x23,x23,xzr adds x20,x20,x14 adcs x21,x21,x15 mul x15,x6,x3 adcs x22,x22,x16 mul x16,x7,x3 adc x23,x23,x17 mul x17,x8,x3 subs xzr,x19,#1 umulh x14,x5,x3 adcs x20,x20,x15 umulh x15,x6,x3 adcs x21,x21,x16 umulh x16,x7,x3 adcs x22,x22,x17 umulh x17,x8,x3 adc x23,x23,xzr adds x19,x20,x14 adcs x20,x21,x15 adcs x21,x22,x16 adcs x22,x23,x17
adc x23,xzr,xzr subs x14,x19,x5 sbcs x15,x20,x6 sbcs x16,x21,x7 sbcs x17,x22,x8 sbcs xzr, x23,xzr csello x19,x19,x14 csello x20,x20,x15 csello x21,x21,x16 csello x22,x22,x17 stp x19,x20,[x0] stp x21,x22,[x0,#16] ldp x19,x20,[x29,#2*__SIZEOF_POINTER__] ldp x21,x22,[x29,#4*__SIZEOF_POINTER__] ldp x23,x24,[x29,#6*__SIZEOF_POINTER__] ldr x29,[sp],#8*__SIZEOF_POINTER__ ret ENDP EXPORT |sqr_mont_sparse_256|[FUNC] ALIGN 32 |sqr_mont_sparse_256| PROC hint #25 stp x29,x30,[sp,#-6*__SIZEOF_POINTER__]! add x29,sp,#0 stp x19,x20,[sp,#2*__SIZEOF_POINTER__] stp x21,x22,[sp,#4*__SIZEOF_POINTER__] ldp x5,x6,[x1] ldp x7,x8,[x1,#16] mov x4,x3 mul x11,x6,x5 umulh x15,x6,x5 mul x12,x7,x5 umulh x16,x7,x5 mul x13,x8,x5 umulh x19,x8,x5 adds x12,x12,x15 mul x14,x7,x6 umulh x15,x7,x6 adcs x13,x13,x16 mul x16,x8,x6 umulh x17,x8,x6 adc x19,x19,xzr mul x20,x8,x7 umulh x21,x8,x7 adds x15,x15,x16 mul x10,x5,x5 adc x16,x17,xzr adds x13,x13,x14 umulh x5,x5,x5 adcs x19,x19,x15 mul x15,x6,x6 adcs x20,x20,x16 umulh x6,x6,x6 adc x21,x21,xzr adds x11,x11,x11 mul x16,x7,x7 adcs x12,x12,x12 umulh x7,x7,x7 adcs x13,x13,x13 mul x17,x8,x8 adcs x19,x19,x19 umulh x8,x8,x8 adcs x20,x20,x20 adcs x21,x21,x21 adc x22,xzr,xzr adds x11,x11,x5 adcs x12,x12,x15 adcs x13,x13,x6 adcs x19,x19,x16 adcs x20,x20,x7 adcs x21,x21,x17 adc x22,x22,x8 bl __mul_by_1_mont_256 ldr x30,[x29,#__SIZEOF_POINTER__] adds x10,x10,x19 adcs x11,x11,x20 adcs x12,x12,x21 adcs x13,x13,x22 adc x19,xzr,xzr subs x14,x10,x5 sbcs x15,x11,x6 sbcs x16,x12,x7 sbcs x17,x13,x8 sbcs xzr, x19,xzr csello x10,x10,x14 csello x11,x11,x15 csello x12,x12,x16 csello x13,x13,x17 stp x10,x11,[x0] stp x12,x13,[x0,#16] ldp x19,x20,[x29,#2*__SIZEOF_POINTER__] ldp x21,x22,[x29,#4*__SIZEOF_POINTER__] ldr x29,[sp],#6*__SIZEOF_POINTER__ hint #29 ret ENDP EXPORT |from_mont_256|[FUNC] ALIGN 32 |from_mont_256| PROC hint #25 stp x29,x30,[sp,#-2*__SIZEOF_POINTER__]! add x29,sp,#0 mov x4,x3 ldp x10,x11,[x1] ldp x12,x13,[x1,#16] bl __mul_by_1_mont_256 ldr x30,[x29,#__SIZEOF_POINTER__] subs x14,x10,x5 sbcs x15,x11,x6 sbcs x16,x12,x7 sbcs x17,x13,x8 csello x10,x10,x14 csello x11,x11,x15 csello x12,x12,x16 csello x13,x13,x17 stp x10,x11,[x0] stp x12,x13,[x0,#16] ldr x29,[sp],#2*__SIZEOF_POINTER__ hint #29 ret ENDP EXPORT |redc_mont_256|[FUNC] ALIGN 32 |redc_mont_256| PROC hint #25 stp x29,x30,[sp,#-2*__SIZEOF_POINTER__]! 
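
redc_mont_256 and from_mont_256 above both delegate to __mul_by_1_mont_256 (defined at the end of this file), which is a plain word-wise Montgomery reduction: each of the four rounds picks m = a[0] * n0 mod 2^64 so that adding m*p clears the low limb, then shifts everything down one limb. A rough C equivalent (a sketch only, assuming unsigned __int128 support; names are illustrative):

#include <stdint.h>

typedef unsigned __int128 u128;   /* GCC/Clang extension */

/* In-place Montgomery reduction of a 4-limb value: a <- a * 2^-256
 * mod p, up to one final conditional subtraction, which the callers
 * perform with the subs/sbcs + csel pattern.  n0 = -p^-1 mod 2^64. */
static void mul_by_1_mont_256(uint64_t a[4], const uint64_t p[4], uint64_t n0)
{
    for (int i = 0; i < 4; i++) {
        uint64_t m = a[0] * n0;          /* makes a[0] + m*p[0] == 0 mod 2^64 */
        u128 acc = (u128)m * p[0] + a[0];/* low 64 bits are zero by choice of m */
        uint64_t carry = (uint64_t)(acc >> 64);
        for (int j = 1; j < 4; j++) {
            acc = (u128)m * p[j] + a[j] + carry;
            a[j - 1] = (uint64_t)acc;    /* shift down one limb */
            carry = (uint64_t)(acc >> 64);
        }
        a[3] = carry;
    }
}
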
add x29,sp,#0 mov x4,x3 ldp x10,x11,[x1] ldp x12,x13,[x1,#16] bl __mul_by_1_mont_256 ldr x30,[x29,#__SIZEOF_POINTER__] ldp x14,x15,[x1,#32] ldp x16,x17,[x1,#48] adds x10,x10,x14 adcs x11,x11,x15 adcs x12,x12,x16 adcs x13,x13,x17 adc x9,xzr,xzr subs x14,x10,x5 sbcs x15,x11,x6 sbcs x16,x12,x7 sbcs x17,x13,x8 sbcs xzr, x9,xzr csello x10,x10,x14 csello x11,x11,x15 csello x12,x12,x16 csello x13,x13,x17 stp x10,x11,[x0] stp x12,x13,[x0,#16] ldr x29,[sp],#2*__SIZEOF_POINTER__ hint #29 ret ENDP ALIGN 32 |__mul_by_1_mont_256| PROC mul x3,x4,x10 ldp x5,x6,[x2] ldp x7,x8,[x2,#16] mul x15,x6,x3 mul x16,x7,x3 mul x17,x8,x3 subs xzr,x10,#1 umulh x14,x5,x3 adcs x11,x11,x15 umulh x15,x6,x3 adcs x12,x12,x16 umulh x16,x7,x3 adcs x13,x13,x17 umulh x17,x8,x3 adc x9,xzr,xzr adds x10,x11,x14 adcs x11,x12,x15 adcs x12,x13,x16 mul x3,x4,x10 adc x13,x9,x17 mul x15,x6,x3 mul x16,x7,x3 mul x17,x8,x3 subs xzr,x10,#1 umulh x14,x5,x3 adcs x11,x11,x15 umulh x15,x6,x3 adcs x12,x12,x16 umulh x16,x7,x3 adcs x13,x13,x17 umulh x17,x8,x3 adc x9,xzr,xzr adds x10,x11,x14 adcs x11,x12,x15 adcs x12,x13,x16 mul x3,x4,x10 adc x13,x9,x17 mul x15,x6,x3 mul x16,x7,x3 mul x17,x8,x3 subs xzr,x10,#1 umulh x14,x5,x3 adcs x11,x11,x15 umulh x15,x6,x3 adcs x12,x12,x16 umulh x16,x7,x3 adcs x13,x13,x17 umulh x17,x8,x3 adc x9,xzr,xzr adds x10,x11,x14 adcs x11,x12,x15 adcs x12,x13,x16 mul x3,x4,x10 adc x13,x9,x17 mul x15,x6,x3 mul x16,x7,x3 mul x17,x8,x3 subs xzr,x10,#1 umulh x14,x5,x3 adcs x11,x11,x15 umulh x15,x6,x3 adcs x12,x12,x16 umulh x16,x7,x3 adcs x13,x13,x17 umulh x17,x8,x3 adc x9,xzr,xzr adds x10,x11,x14 adcs x11,x12,x15 adcs x12,x13,x16 adc x13,x9,x17 ret ENDP END ================================================ FILE: build/win64/mul_mont_384-armv8.asm ================================================ GBLA __SIZEOF_POINTER__ __SIZEOF_POINTER__ SETA 64/8 AREA |.text|,CODE,ALIGN=8,ARM64 EXPORT |add_mod_384x384|[FUNC] ALIGN 32 |add_mod_384x384| PROC hint #25 stp x29,x30,[sp,#-8*__SIZEOF_POINTER__]! add x29,sp,#0 stp x19,x20,[sp,#2*__SIZEOF_POINTER__] stp x21,x22,[sp,#4*__SIZEOF_POINTER__] stp x23,x24,[sp,#6*__SIZEOF_POINTER__] ldp x5,x6,[x3] ldp x7,x8,[x3,#16] ldp x9,x10,[x3,#32] bl __add_mod_384x384 ldr x30,[x29,#__SIZEOF_POINTER__] ldp x19,x20,[x29,#2*__SIZEOF_POINTER__] ldp x21,x22,[x29,#4*__SIZEOF_POINTER__] ldp x23,x24,[x29,#6*__SIZEOF_POINTER__] ldr x29,[sp],#8*__SIZEOF_POINTER__ hint #29 ret ENDP ALIGN 32 |__add_mod_384x384| PROC ldp x11, x12, [x1] ldp x19,x20,[x2] ldp x13, x14, [x1,#16] adds x11,x11,x19 ldp x21,x22,[x2,#16] adcs x12,x12,x20 ldp x15, x16, [x1,#32] adcs x13,x13,x21 ldp x23,x24,[x2,#32] adcs x14,x14,x22 stp x11, x12, [x0] adcs x15,x15,x23 ldp x11, x12, [x1,#48] adcs x16,x16,x24 ldp x19,x20,[x2,#48] stp x13, x14, [x0,#16] ldp x13, x14, [x1,#64] ldp x21,x22,[x2,#64] adcs x11,x11,x19 stp x15, x16, [x0,#32] adcs x12,x12,x20 ldp x15, x16, [x1,#80] adcs x13,x13,x21 ldp x23,x24,[x2,#80] adcs x14,x14,x22 adcs x15,x15,x23 adcs x16,x16,x24 adc x17,xzr,xzr subs x19,x11,x5 sbcs x20,x12,x6 sbcs x21,x13,x7 sbcs x22,x14,x8 sbcs x23,x15,x9 sbcs x24,x16,x10 sbcs xzr,x17,xzr csello x11,x11,x19 csello x12,x12,x20 csello x13,x13,x21 csello x14,x14,x22 stp x11,x12,[x0,#48] csello x15,x15,x23 stp x13,x14,[x0,#64] csello x16,x16,x24 stp x15,x16,[x0,#80] ret ENDP EXPORT |sub_mod_384x384|[FUNC] ALIGN 32 |sub_mod_384x384| PROC hint #25 stp x29,x30,[sp,#-8*__SIZEOF_POINTER__]! 
add x29,sp,#0 stp x19,x20,[sp,#2*__SIZEOF_POINTER__] stp x21,x22,[sp,#4*__SIZEOF_POINTER__] stp x23,x24,[sp,#6*__SIZEOF_POINTER__] ldp x5,x6,[x3] ldp x7,x8,[x3,#16] ldp x9,x10,[x3,#32] bl __sub_mod_384x384 ldr x30,[x29,#__SIZEOF_POINTER__] ldp x19,x20,[x29,#2*__SIZEOF_POINTER__] ldp x21,x22,[x29,#4*__SIZEOF_POINTER__] ldp x23,x24,[x29,#6*__SIZEOF_POINTER__] ldr x29,[sp],#8*__SIZEOF_POINTER__ hint #29 ret ENDP ALIGN 32 |__sub_mod_384x384| PROC ldp x11, x12, [x1] ldp x19,x20,[x2] ldp x13, x14, [x1,#16] subs x11,x11,x19 ldp x21,x22,[x2,#16] sbcs x12,x12,x20 ldp x15, x16, [x1,#32] sbcs x13,x13,x21 ldp x23,x24,[x2,#32] sbcs x14,x14,x22 stp x11, x12, [x0] sbcs x15,x15,x23 ldp x11, x12, [x1,#48] sbcs x16,x16,x24 ldp x19,x20,[x2,#48] stp x13, x14, [x0,#16] ldp x13, x14, [x1,#64] ldp x21,x22,[x2,#64] sbcs x11,x11,x19 stp x15, x16, [x0,#32] sbcs x12,x12,x20 ldp x15, x16, [x1,#80] sbcs x13,x13,x21 ldp x23,x24,[x2,#80] sbcs x14,x14,x22 sbcs x15,x15,x23 sbcs x16,x16,x24 sbc x17,xzr,xzr and x19,x5,x17 and x20,x6,x17 adds x11,x11,x19 and x21,x7,x17 adcs x12,x12,x20 and x22,x8,x17 adcs x13,x13,x21 and x23,x9,x17 adcs x14,x14,x22 and x24,x10,x17 adcs x15,x15,x23 stp x11,x12,[x0,#48] adc x16,x16,x24 stp x13,x14,[x0,#64] stp x15,x16,[x0,#80] ret ENDP ALIGN 32 |__add_mod_384| PROC ldp x11, x12, [x1] ldp x19,x20,[x2] ldp x13, x14, [x1,#16] adds x11,x11,x19 ldp x21,x22,[x2,#16] adcs x12,x12,x20 ldp x15, x16, [x1,#32] adcs x13,x13,x21 ldp x23,x24,[x2,#32] adcs x14,x14,x22 adcs x15,x15,x23 adcs x16,x16,x24 adc x17,xzr,xzr subs x19,x11,x5 sbcs x20,x12,x6 sbcs x21,x13,x7 sbcs x22,x14,x8 sbcs x23,x15,x9 sbcs x24,x16,x10 sbcs xzr,x17,xzr csello x11,x11,x19 csello x12,x12,x20 csello x13,x13,x21 csello x14,x14,x22 csello x15,x15,x23 stp x11,x12,[x0] csello x16,x16,x24 stp x13,x14,[x0,#16] stp x15,x16,[x0,#32] ret ENDP ALIGN 32 |__sub_mod_384| PROC ldp x11, x12, [x1] ldp x19,x20,[x2] ldp x13, x14, [x1,#16] subs x11,x11,x19 ldp x21,x22,[x2,#16] sbcs x12,x12,x20 ldp x15, x16, [x1,#32] sbcs x13,x13,x21 ldp x23,x24,[x2,#32] sbcs x14,x14,x22 sbcs x15,x15,x23 sbcs x16,x16,x24 sbc x17,xzr,xzr and x19,x5,x17 and x20,x6,x17 adds x11,x11,x19 and x21,x7,x17 adcs x12,x12,x20 and x22,x8,x17 adcs x13,x13,x21 and x23,x9,x17 adcs x14,x14,x22 and x24,x10,x17 adcs x15,x15,x23 stp x11,x12,[x0] adc x16,x16,x24 stp x13,x14,[x0,#16] stp x15,x16,[x0,#32] ret ENDP EXPORT |mul_mont_384x|[FUNC] ALIGN 32 |mul_mont_384x| PROC hint #25 stp x29,x30,[sp,#-16*__SIZEOF_POINTER__]! 
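
__add_mod_384 and __sub_mod_384 above follow the usual constant-time pattern: full-width limb addition (or subtraction), a trial subtraction of the modulus, then a branchless select on the final borrow, which is what the csel sequences implement. Roughly, in C (a sketch only, assuming unsigned __int128; names are illustrative):

#include <stdint.h>
#include <stddef.h>

typedef unsigned __int128 u128;   /* GCC/Clang extension */

/* r = (a + b) mod p for n-limb operands, n <= 8, branchless select. */
static void add_mod_n(uint64_t r[], const uint64_t a[], const uint64_t b[],
                      const uint64_t p[], size_t n)
{
    uint64_t tmp[8], carry = 0, borrow = 0;
    for (size_t i = 0; i < n; i++) {     /* r = a + b, keep carry out */
        u128 t = (u128)a[i] + b[i] + carry;
        r[i]  = (uint64_t)t;
        carry = (uint64_t)(t >> 64);
    }
    for (size_t i = 0; i < n; i++) {     /* tmp = r - p, keep borrow out */
        u128 t = (u128)r[i] - p[i] - borrow;
        tmp[i] = (uint64_t)t;
        borrow = (uint64_t)(t >> 64) & 1;
    }
    /* keep the unreduced sum only if the trial subtraction underflowed */
    uint64_t mask = (uint64_t)0 - (uint64_t)(carry < borrow);
    for (size_t i = 0; i < n; i++)
        r[i] = (r[i] & mask) | (tmp[i] & ~mask);
}
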
add x29,sp,#0 stp x19,x20,[sp,#2*__SIZEOF_POINTER__] stp x21,x22,[sp,#4*__SIZEOF_POINTER__] stp x23,x24,[sp,#6*__SIZEOF_POINTER__] stp x25,x26,[sp,#8*__SIZEOF_POINTER__] stp x27,x28,[sp,#10*__SIZEOF_POINTER__] sub sp,sp,#288 mov x26,x0 mov x27,x1 mov x28,x2 add x0,sp,#0 bl __mul_384 add x1,x1,#48 add x2,x2,#48 add x0,sp,#96 bl __mul_384 ldp x5,x6,[x3] ldp x7,x8,[x3,#16] ldp x9,x10,[x3,#32] sub x2,x1,#48 add x0,sp,#240 bl __add_mod_384 add x1,x28,#0 add x2,x28,#48 add x0,sp,#192 bl __add_mod_384 add x1,x0,#0 add x2,x0,#48 bl __mul_384 ldp x5,x6,[x3] ldp x7,x8,[x3,#16] ldp x9,x10,[x3,#32] mov x1,x0 add x2,sp,#0 bl __sub_mod_384x384 add x2,sp,#96 bl __sub_mod_384x384 add x1,sp,#0 add x2,sp,#96 add x0,sp,#0 bl __sub_mod_384x384 add x1,sp,#0 add x0,x26,#0 bl __mul_by_1_mont_384 bl __redc_tail_mont_384 add x1,sp,#192 add x0,x0,#48 bl __mul_by_1_mont_384 bl __redc_tail_mont_384 ldr x30,[x29,#__SIZEOF_POINTER__] add sp,sp,#288 ldp x19,x20,[x29,#2*__SIZEOF_POINTER__] ldp x21,x22,[x29,#4*__SIZEOF_POINTER__] ldp x23,x24,[x29,#6*__SIZEOF_POINTER__] ldp x25,x26,[x29,#8*__SIZEOF_POINTER__] ldp x27,x28,[x29,#10*__SIZEOF_POINTER__] ldr x29,[sp],#16*__SIZEOF_POINTER__ hint #29 ret ENDP EXPORT |sqr_mont_384x|[FUNC] ALIGN 32 |sqr_mont_384x| PROC hint #25 stp x29,x30,[sp,#-16*__SIZEOF_POINTER__]! add x29,sp,#0 stp x19,x20,[sp,#2*__SIZEOF_POINTER__] stp x21,x22,[sp,#4*__SIZEOF_POINTER__] stp x23,x24,[sp,#6*__SIZEOF_POINTER__] stp x25,x26,[sp,#8*__SIZEOF_POINTER__] stp x27,x28,[sp,#10*__SIZEOF_POINTER__] stp x3,x0,[sp,#12*__SIZEOF_POINTER__] sub sp,sp,#96 mov x4,x3 ldp x5,x6,[x2] ldp x7,x8,[x2,#16] ldp x9,x10,[x2,#32] add x2,x1,#48 add x0,sp,#0 bl __add_mod_384 add x0,sp,#48 bl __sub_mod_384 ldp x11,x12,[x1] ldr x17, [x2] ldp x13,x14,[x1,#16] ldp x15,x16,[x1,#32] bl __mul_mont_384 adds x11,x11,x11 adcs x12,x12,x12 adcs x13,x13,x13 adcs x14,x14,x14 adcs x15,x15,x15 adcs x16,x16,x16 adc x25,xzr,xzr subs x19,x11,x5 sbcs x20,x12,x6 sbcs x21,x13,x7 sbcs x22,x14,x8 sbcs x23,x15,x9 sbcs x24,x16,x10 sbcs xzr,x25,xzr csello x19,x11,x19 csello x20,x12,x20 csello x21,x13,x21 ldp x11,x12,[sp] csello x22,x14,x22 ldr x17, [sp,#48] csello x23,x15,x23 ldp x13,x14,[sp,#16] csello x24,x16,x24 ldp x15,x16,[sp,#32] stp x19,x20,[x2,#48] stp x21,x22,[x2,#64] stp x23,x24,[x2,#80] add x2,sp,#48 bl __mul_mont_384 ldr x30,[x29,#__SIZEOF_POINTER__] stp x11,x12,[x2] stp x13,x14,[x2,#16] stp x15,x16,[x2,#32] add sp,sp,#96 ldp x19,x20,[x29,#2*__SIZEOF_POINTER__] ldp x21,x22,[x29,#4*__SIZEOF_POINTER__] ldp x23,x24,[x29,#6*__SIZEOF_POINTER__] ldp x25,x26,[x29,#8*__SIZEOF_POINTER__] ldp x27,x28,[x29,#10*__SIZEOF_POINTER__] ldr x29,[sp],#16*__SIZEOF_POINTER__ hint #29 ret ENDP EXPORT |mul_mont_384|[FUNC] ALIGN 32 |mul_mont_384| PROC hint #25 stp x29,x30,[sp,#-16*__SIZEOF_POINTER__]! 
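
mul_mont_384 above wraps __mul_mont_384 (below), a word-serial Montgomery multiplication: each round folds one limb of b into the accumulator via mul/umulh, then applies the same clear-the-low-limb reduction step sketched earlier. The classic CIOS formulation in C (a sketch under the same unsigned __int128 assumption; not the library's interface):

#include <stdint.h>

typedef unsigned __int128 u128;   /* GCC/Clang extension */

/* r = a * b * 2^(-64*n) mod p, n <= 8, up to one final conditional
 * subtraction of p by the caller.  n0 = -p^-1 mod 2^64. */
static void mont_mul_n(uint64_t r[], const uint64_t a[], const uint64_t b[],
                       const uint64_t p[], uint64_t n0, int n)
{
    uint64_t t[10] = {0};               /* n + 2 working limbs */
    for (int i = 0; i < n; i++) {
        u128 acc = 0;
        for (int j = 0; j < n; j++) {   /* t += a * b[i] */
            acc += (u128)a[j] * b[i] + t[j];
            t[j] = (uint64_t)acc;
            acc >>= 64;
        }
        acc += t[n];
        t[n] = (uint64_t)acc;
        t[n + 1] = (uint64_t)(acc >> 64);

        uint64_t m = t[0] * n0;         /* reduction: zero the low limb */
        acc = ((u128)m * p[0] + t[0]) >> 64;
        for (int j = 1; j < n; j++) {   /* t = (t + m*p) >> 64 */
            acc += (u128)m * p[j] + t[j];
            t[j - 1] = (uint64_t)acc;
            acc >>= 64;
        }
        acc += t[n];
        t[n - 1] = (uint64_t)acc;
        t[n] = t[n + 1] + (uint64_t)(acc >> 64);
    }
    for (int j = 0; j < n; j++)         /* caller subtracts p once if needed */
        r[j] = t[j];
}
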
add x29,sp,#0 stp x19,x20,[sp,#2*__SIZEOF_POINTER__] stp x21,x22,[sp,#4*__SIZEOF_POINTER__] stp x23,x24,[sp,#6*__SIZEOF_POINTER__] stp x25,x26,[sp,#8*__SIZEOF_POINTER__] stp x27,x28,[sp,#10*__SIZEOF_POINTER__] stp x4,x0,[sp,#12*__SIZEOF_POINTER__] ldp x11,x12,[x1] ldr x17, [x2] ldp x13,x14,[x1,#16] ldp x15,x16,[x1,#32] ldp x5,x6,[x3] ldp x7,x8,[x3,#16] ldp x9,x10,[x3,#32] bl __mul_mont_384 ldr x30,[x29,#__SIZEOF_POINTER__] stp x11,x12,[x2] stp x13,x14,[x2,#16] stp x15,x16,[x2,#32] ldp x19,x20,[x29,#2*__SIZEOF_POINTER__] ldp x21,x22,[x29,#4*__SIZEOF_POINTER__] ldp x23,x24,[x29,#6*__SIZEOF_POINTER__] ldp x25,x26,[x29,#8*__SIZEOF_POINTER__] ldp x27,x28,[x29,#10*__SIZEOF_POINTER__] ldr x29,[sp],#16*__SIZEOF_POINTER__ hint #29 ret ENDP ALIGN 32 |__mul_mont_384| PROC mul x19,x11,x17 mul x20,x12,x17 mul x21,x13,x17 mul x22,x14,x17 mul x23,x15,x17 mul x24,x16,x17 mul x4,x4,x19 umulh x26,x11,x17 umulh x27,x12,x17 umulh x28,x13,x17 umulh x0,x14,x17 umulh x1,x15,x17 umulh x3,x16,x17 adds x20,x20,x26 adcs x21,x21,x27 mul x27,x6,x4 adcs x22,x22,x28 mul x28,x7,x4 adcs x23,x23,x0 mul x0,x8,x4 adcs x24,x24,x1 mul x1,x9,x4 adc x25,xzr, x3 mul x3,x10,x4 mov x17,xzr subs xzr,x19,#1 umulh x26,x5,x4 adcs x20,x20,x27 umulh x27,x6,x4 adcs x21,x21,x28 umulh x28,x7,x4 adcs x22,x22,x0 umulh x0,x8,x4 adcs x23,x23,x1 umulh x1,x9,x4 adcs x24,x24,x3 umulh x3,x10,x4 adcs x25,x25,xzr adc x4,x17,xzr ldr x17,[x2,8*1] adds x19,x20,x26 mul x26,x11,x17 adcs x20,x21,x27 mul x27,x12,x17 adcs x21,x22,x28 mul x28,x13,x17 adcs x22,x23,x0 mul x0,x14,x17 adcs x23,x24,x1 mul x1,x15,x17 adcs x24,x25,x3 mul x3,x16,x17 adc x25,x4,xzr ldr x4,[x29,#12*__SIZEOF_POINTER__] adds x19,x19,x26 umulh x26,x11,x17 adcs x20,x20,x27 umulh x27,x12,x17 adcs x21,x21,x28 mul x4,x4,x19 umulh x28,x13,x17 adcs x22,x22,x0 umulh x0,x14,x17 adcs x23,x23,x1 umulh x1,x15,x17 adcs x24,x24,x3 umulh x3,x16,x17 adcs x25,x25,xzr adc x17,xzr,xzr adds x20,x20,x26 adcs x21,x21,x27 mul x27,x6,x4 adcs x22,x22,x28 mul x28,x7,x4 adcs x23,x23,x0 mul x0,x8,x4 adcs x24,x24,x1 mul x1,x9,x4 adcs x25,x25,x3 mul x3,x10,x4 adc x17,x17,xzr subs xzr,x19,#1 umulh x26,x5,x4 adcs x20,x20,x27 umulh x27,x6,x4 adcs x21,x21,x28 umulh x28,x7,x4 adcs x22,x22,x0 umulh x0,x8,x4 adcs x23,x23,x1 umulh x1,x9,x4 adcs x24,x24,x3 umulh x3,x10,x4 adcs x25,x25,xzr adc x4,x17,xzr ldr x17,[x2,8*2] adds x19,x20,x26 mul x26,x11,x17 adcs x20,x21,x27 mul x27,x12,x17 adcs x21,x22,x28 mul x28,x13,x17 adcs x22,x23,x0 mul x0,x14,x17 adcs x23,x24,x1 mul x1,x15,x17 adcs x24,x25,x3 mul x3,x16,x17 adc x25,x4,xzr ldr x4,[x29,#12*__SIZEOF_POINTER__] adds x19,x19,x26 umulh x26,x11,x17 adcs x20,x20,x27 umulh x27,x12,x17 adcs x21,x21,x28 mul x4,x4,x19 umulh x28,x13,x17 adcs x22,x22,x0 umulh x0,x14,x17 adcs x23,x23,x1 umulh x1,x15,x17 adcs x24,x24,x3 umulh x3,x16,x17 adcs x25,x25,xzr adc x17,xzr,xzr adds x20,x20,x26 adcs x21,x21,x27 mul x27,x6,x4 adcs x22,x22,x28 mul x28,x7,x4 adcs x23,x23,x0 mul x0,x8,x4 adcs x24,x24,x1 mul x1,x9,x4 adcs x25,x25,x3 mul x3,x10,x4 adc x17,x17,xzr subs xzr,x19,#1 umulh x26,x5,x4 adcs x20,x20,x27 umulh x27,x6,x4 adcs x21,x21,x28 umulh x28,x7,x4 adcs x22,x22,x0 umulh x0,x8,x4 adcs x23,x23,x1 umulh x1,x9,x4 adcs x24,x24,x3 umulh x3,x10,x4 adcs x25,x25,xzr adc x4,x17,xzr ldr x17,[x2,8*3] adds x19,x20,x26 mul x26,x11,x17 adcs x20,x21,x27 mul x27,x12,x17 adcs x21,x22,x28 mul x28,x13,x17 adcs x22,x23,x0 mul x0,x14,x17 adcs x23,x24,x1 mul x1,x15,x17 adcs x24,x25,x3 mul x3,x16,x17 adc x25,x4,xzr ldr x4,[x29,#12*__SIZEOF_POINTER__] adds x19,x19,x26 umulh x26,x11,x17 adcs x20,x20,x27 umulh 
x27,x12,x17 adcs x21,x21,x28 mul x4,x4,x19 umulh x28,x13,x17 adcs x22,x22,x0 umulh x0,x14,x17 adcs x23,x23,x1 umulh x1,x15,x17 adcs x24,x24,x3 umulh x3,x16,x17 adcs x25,x25,xzr adc x17,xzr,xzr adds x20,x20,x26 adcs x21,x21,x27 mul x27,x6,x4 adcs x22,x22,x28 mul x28,x7,x4 adcs x23,x23,x0 mul x0,x8,x4 adcs x24,x24,x1 mul x1,x9,x4 adcs x25,x25,x3 mul x3,x10,x4 adc x17,x17,xzr subs xzr,x19,#1 umulh x26,x5,x4 adcs x20,x20,x27 umulh x27,x6,x4 adcs x21,x21,x28 umulh x28,x7,x4 adcs x22,x22,x0 umulh x0,x8,x4 adcs x23,x23,x1 umulh x1,x9,x4 adcs x24,x24,x3 umulh x3,x10,x4 adcs x25,x25,xzr adc x4,x17,xzr ldr x17,[x2,8*4] adds x19,x20,x26 mul x26,x11,x17 adcs x20,x21,x27 mul x27,x12,x17 adcs x21,x22,x28 mul x28,x13,x17 adcs x22,x23,x0 mul x0,x14,x17 adcs x23,x24,x1 mul x1,x15,x17 adcs x24,x25,x3 mul x3,x16,x17 adc x25,x4,xzr ldr x4,[x29,#12*__SIZEOF_POINTER__] adds x19,x19,x26 umulh x26,x11,x17 adcs x20,x20,x27 umulh x27,x12,x17 adcs x21,x21,x28 mul x4,x4,x19 umulh x28,x13,x17 adcs x22,x22,x0 umulh x0,x14,x17 adcs x23,x23,x1 umulh x1,x15,x17 adcs x24,x24,x3 umulh x3,x16,x17 adcs x25,x25,xzr adc x17,xzr,xzr adds x20,x20,x26 adcs x21,x21,x27 mul x27,x6,x4 adcs x22,x22,x28 mul x28,x7,x4 adcs x23,x23,x0 mul x0,x8,x4 adcs x24,x24,x1 mul x1,x9,x4 adcs x25,x25,x3 mul x3,x10,x4 adc x17,x17,xzr subs xzr,x19,#1 umulh x26,x5,x4 adcs x20,x20,x27 umulh x27,x6,x4 adcs x21,x21,x28 umulh x28,x7,x4 adcs x22,x22,x0 umulh x0,x8,x4 adcs x23,x23,x1 umulh x1,x9,x4 adcs x24,x24,x3 umulh x3,x10,x4 adcs x25,x25,xzr adc x4,x17,xzr ldr x17,[x2,8*5] adds x19,x20,x26 mul x26,x11,x17 adcs x20,x21,x27 mul x27,x12,x17 adcs x21,x22,x28 mul x28,x13,x17 adcs x22,x23,x0 mul x0,x14,x17 adcs x23,x24,x1 mul x1,x15,x17 adcs x24,x25,x3 mul x3,x16,x17 adc x25,x4,xzr ldr x4,[x29,#12*__SIZEOF_POINTER__] adds x19,x19,x26 umulh x26,x11,x17 adcs x20,x20,x27 umulh x27,x12,x17 adcs x21,x21,x28 mul x4,x4,x19 umulh x28,x13,x17 adcs x22,x22,x0 umulh x0,x14,x17 adcs x23,x23,x1 umulh x1,x15,x17 adcs x24,x24,x3 umulh x3,x16,x17 adcs x25,x25,xzr adc x17,xzr,xzr adds x20,x20,x26 adcs x21,x21,x27 mul x27,x6,x4 adcs x22,x22,x28 mul x28,x7,x4 adcs x23,x23,x0 mul x0,x8,x4 adcs x24,x24,x1 mul x1,x9,x4 adcs x25,x25,x3 mul x3,x10,x4 adc x17,x17,xzr subs xzr,x19,#1 umulh x26,x5,x4 adcs x20,x20,x27 umulh x27,x6,x4 adcs x21,x21,x28 umulh x28,x7,x4 adcs x22,x22,x0 umulh x0,x8,x4 adcs x23,x23,x1 umulh x1,x9,x4 adcs x24,x24,x3 umulh x3,x10,x4 adcs x25,x25,xzr ldp x4,x2,[x29,#12*__SIZEOF_POINTER__] adc x17,x17,xzr adds x19,x20,x26 adcs x20,x21,x27 adcs x21,x22,x28 adcs x22,x23,x0 adcs x23,x24,x1 adcs x24,x25,x3 adc x25,x17,xzr subs x26,x19,x5 sbcs x27,x20,x6 sbcs x28,x21,x7 sbcs x0,x22,x8 sbcs x1,x23,x9 sbcs x3,x24,x10 sbcs xzr, x25,xzr csello x11,x19,x26 csello x12,x20,x27 csello x13,x21,x28 csello x14,x22,x0 csello x15,x23,x1 csello x16,x24,x3 ret ENDP EXPORT |sqr_mont_384|[FUNC] ALIGN 32 |sqr_mont_384| PROC hint #25 stp x29,x30,[sp,#-16*__SIZEOF_POINTER__]! 
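
sqr_mont_384 squares via __sqr_384 (below), which exploits symmetry: each cross product a[i]*a[j], i < j, is computed once, the whole off-diagonal triangle is doubled, and the diagonal squares are added last. The same schedule in C (a sketch only, assuming unsigned __int128):

#include <stdint.h>

typedef unsigned __int128 u128;   /* GCC/Clang extension */

/* r[0..11] = a[0..5]^2: triangle once, double, add the diagonal. */
static void sqr_384(uint64_t r[12], const uint64_t a[6])
{
    u128 acc;
    uint64_t carry;

    for (int i = 0; i < 12; i++) r[i] = 0;

    for (int i = 0; i < 6; i++) {        /* off-diagonal products, once */
        carry = 0;
        for (int j = i + 1; j < 6; j++) {
            acc = (u128)a[i] * a[j] + r[i + j] + carry;
            r[i + j] = (uint64_t)acc;
            carry = (uint64_t)(acc >> 64);
        }
        r[i + 6] = carry;
    }

    carry = 0;                           /* double the triangle */
    for (int i = 1; i < 12; i++) {
        uint64_t t = r[i];
        r[i] = (t << 1) | carry;
        carry = t >> 63;
    }

    carry = 0;                           /* add the squares a[i]^2 */
    for (int i = 0; i < 6; i++) {
        acc = (u128)a[i] * a[i] + r[2 * i] + carry;
        r[2 * i] = (uint64_t)acc;
        acc = (u128)r[2 * i + 1] + (uint64_t)(acc >> 64);
        r[2 * i + 1] = (uint64_t)acc;
        carry = (uint64_t)(acc >> 64);
    }
}
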
add x29,sp,#0 stp x19,x20,[sp,#2*__SIZEOF_POINTER__] stp x21,x22,[sp,#4*__SIZEOF_POINTER__] stp x23,x24,[sp,#6*__SIZEOF_POINTER__] stp x25,x26,[sp,#8*__SIZEOF_POINTER__] stp x27,x28,[sp,#10*__SIZEOF_POINTER__] sub sp,sp,#96 mov x4,x3 mov x3,x0 mov x0,sp ldp x11,x12,[x1] ldp x13,x14,[x1,#16] ldp x15,x16,[x1,#32] bl __sqr_384 ldp x5,x6,[x2] ldp x7,x8,[x2,#16] ldp x9,x10,[x2,#32] mov x1,sp mov x0,x3 bl __mul_by_1_mont_384 bl __redc_tail_mont_384 ldr x30,[x29,#__SIZEOF_POINTER__] add sp,sp,#96 ldp x19,x20,[x29,#2*__SIZEOF_POINTER__] ldp x21,x22,[x29,#4*__SIZEOF_POINTER__] ldp x23,x24,[x29,#6*__SIZEOF_POINTER__] ldp x25,x26,[x29,#8*__SIZEOF_POINTER__] ldp x27,x28,[x29,#10*__SIZEOF_POINTER__] ldr x29,[sp],#16*__SIZEOF_POINTER__ hint #29 ret ENDP EXPORT |sqr_n_mul_mont_383|[FUNC] ALIGN 32 |sqr_n_mul_mont_383| PROC hint #25 stp x29,x30,[sp,#-16*__SIZEOF_POINTER__]! add x29,sp,#0 stp x19,x20,[sp,#2*__SIZEOF_POINTER__] stp x21,x22,[sp,#4*__SIZEOF_POINTER__] stp x23,x24,[sp,#6*__SIZEOF_POINTER__] stp x25,x26,[sp,#8*__SIZEOF_POINTER__] stp x27,x28,[sp,#10*__SIZEOF_POINTER__] stp x4,x0,[sp,#12*__SIZEOF_POINTER__] sub sp,sp,#96 mov x17,x5 ldp x11,x12,[x1] ldp x13,x14,[x1,#16] ldp x15,x16,[x1,#32] mov x0,sp |$Loop_sqr_383| bl __sqr_384 sub x2,x2,#1 ldp x5,x6,[x3] ldp x7,x8,[x3,#16] ldp x9,x10,[x3,#32] mov x1,sp bl __mul_by_1_mont_384 ldp x19,x20,[x1,#48] ldp x21,x22,[x1,#64] ldp x23,x24,[x1,#80] adds x11,x11,x19 adcs x12,x12,x20 adcs x13,x13,x21 adcs x14,x14,x22 adcs x15,x15,x23 adc x16,x16,x24 cbnz x2,|$Loop_sqr_383| mov x2,x17 ldr x17,[x17] bl __mul_mont_384 ldr x30,[x29,#__SIZEOF_POINTER__] stp x11,x12,[x2] stp x13,x14,[x2,#16] stp x15,x16,[x2,#32] add sp,sp,#96 ldp x19,x20,[x29,#2*__SIZEOF_POINTER__] ldp x21,x22,[x29,#4*__SIZEOF_POINTER__] ldp x23,x24,[x29,#6*__SIZEOF_POINTER__] ldp x25,x26,[x29,#8*__SIZEOF_POINTER__] ldp x27,x28,[x29,#10*__SIZEOF_POINTER__] ldr x29,[sp],#16*__SIZEOF_POINTER__ hint #29 ret ENDP ALIGN 32 |__sqr_384| PROC mul x19,x12,x11 mul x20,x13,x11 mul x21,x14,x11 mul x22,x15,x11 mul x23,x16,x11 umulh x6,x12,x11 umulh x7,x13,x11 umulh x8,x14,x11 umulh x9,x15,x11 adds x20,x20,x6 umulh x10,x16,x11 adcs x21,x21,x7 mul x7,x13,x12 adcs x22,x22,x8 mul x8,x14,x12 adcs x23,x23,x9 mul x9,x15,x12 adc x24,xzr, x10 mul x10,x16,x12 adds x21,x21,x7 umulh x7,x13,x12 adcs x22,x22,x8 umulh x8,x14,x12 adcs x23,x23,x9 umulh x9,x15,x12 adcs x24,x24,x10 umulh x10,x16,x12 adc x25,xzr,xzr mul x5,x11,x11 adds x22,x22,x7 umulh x11, x11,x11 adcs x23,x23,x8 mul x8,x14,x13 adcs x24,x24,x9 mul x9,x15,x13 adc x25,x25,x10 mul x10,x16,x13 adds x23,x23,x8 umulh x8,x14,x13 adcs x24,x24,x9 umulh x9,x15,x13 adcs x25,x25,x10 umulh x10,x16,x13 adc x26,xzr,xzr mul x6,x12,x12 adds x24,x24,x8 umulh x12, x12,x12 adcs x25,x25,x9 mul x9,x15,x14 adc x26,x26,x10 mul x10,x16,x14 adds x25,x25,x9 umulh x9,x15,x14 adcs x26,x26,x10 umulh x10,x16,x14 adc x27,xzr,xzr mul x7,x13,x13 adds x26,x26,x9 umulh x13, x13,x13 adc x27,x27,x10 mul x8,x14,x14 mul x10,x16,x15 umulh x14, x14,x14 adds x27,x27,x10 umulh x10,x16,x15 mul x9,x15,x15 adc x28,x10,xzr adds x19,x19,x19 adcs x20,x20,x20 adcs x21,x21,x21 adcs x22,x22,x22 adcs x23,x23,x23 adcs x24,x24,x24 adcs x25,x25,x25 adcs x26,x26,x26 umulh x15, x15,x15 adcs x27,x27,x27 mul x10,x16,x16 adcs x28,x28,x28 umulh x16, x16,x16 adc x1,xzr,xzr adds x19,x19,x11 adcs x20,x20,x6 adcs x21,x21,x12 adcs x22,x22,x7 adcs x23,x23,x13 adcs x24,x24,x8 adcs x25,x25,x14 stp x5,x19,[x0] adcs x26,x26,x9 stp x20,x21,[x0,#16] adcs x27,x27,x15 stp x22,x23,[x0,#32] adcs x28,x28,x10 stp x24,x25,[x0,#48] adc 
x16,x16,x1 stp x26,x27,[x0,#64] stp x28,x16,[x0,#80] ret ENDP EXPORT |sqr_384|[FUNC] ALIGN 32 |sqr_384| PROC hint #25 stp x29,x30,[sp,#-16*__SIZEOF_POINTER__]! add x29,sp,#0 stp x19,x20,[sp,#2*__SIZEOF_POINTER__] stp x21,x22,[sp,#4*__SIZEOF_POINTER__] stp x23,x24,[sp,#6*__SIZEOF_POINTER__] stp x25,x26,[sp,#8*__SIZEOF_POINTER__] stp x27,x28,[sp,#10*__SIZEOF_POINTER__] ldp x11,x12,[x1] ldp x13,x14,[x1,#16] ldp x15,x16,[x1,#32] bl __sqr_384 ldr x30,[x29,#__SIZEOF_POINTER__] ldp x19,x20,[x29,#2*__SIZEOF_POINTER__] ldp x21,x22,[x29,#4*__SIZEOF_POINTER__] ldp x23,x24,[x29,#6*__SIZEOF_POINTER__] ldp x25,x26,[x29,#8*__SIZEOF_POINTER__] ldp x27,x28,[x29,#10*__SIZEOF_POINTER__] ldr x29,[sp],#16*__SIZEOF_POINTER__ hint #29 ret ENDP EXPORT |redc_mont_384|[FUNC] ALIGN 32 |redc_mont_384| PROC hint #25 stp x29,x30,[sp,#-16*__SIZEOF_POINTER__]! add x29,sp,#0 stp x19,x20,[sp,#2*__SIZEOF_POINTER__] stp x21,x22,[sp,#4*__SIZEOF_POINTER__] stp x23,x24,[sp,#6*__SIZEOF_POINTER__] stp x25,x26,[sp,#8*__SIZEOF_POINTER__] stp x27,x28,[sp,#10*__SIZEOF_POINTER__] mov x4,x3 ldp x5,x6,[x2] ldp x7,x8,[x2,#16] ldp x9,x10,[x2,#32] bl __mul_by_1_mont_384 bl __redc_tail_mont_384 ldr x30,[x29,#__SIZEOF_POINTER__] ldp x19,x20,[x29,#2*__SIZEOF_POINTER__] ldp x21,x22,[x29,#4*__SIZEOF_POINTER__] ldp x23,x24,[x29,#6*__SIZEOF_POINTER__] ldp x25,x26,[x29,#8*__SIZEOF_POINTER__] ldp x27,x28,[x29,#10*__SIZEOF_POINTER__] ldr x29,[sp],#16*__SIZEOF_POINTER__ hint #29 ret ENDP EXPORT |from_mont_384|[FUNC] ALIGN 32 |from_mont_384| PROC hint #25 stp x29,x30,[sp,#-16*__SIZEOF_POINTER__]! add x29,sp,#0 stp x19,x20,[sp,#2*__SIZEOF_POINTER__] stp x21,x22,[sp,#4*__SIZEOF_POINTER__] stp x23,x24,[sp,#6*__SIZEOF_POINTER__] stp x25,x26,[sp,#8*__SIZEOF_POINTER__] stp x27,x28,[sp,#10*__SIZEOF_POINTER__] mov x4,x3 ldp x5,x6,[x2] ldp x7,x8,[x2,#16] ldp x9,x10,[x2,#32] bl __mul_by_1_mont_384 ldr x30,[x29,#__SIZEOF_POINTER__] subs x19,x11,x5 sbcs x20,x12,x6 sbcs x21,x13,x7 sbcs x22,x14,x8 sbcs x23,x15,x9 sbcs x24,x16,x10 csello x11,x11,x19 csello x12,x12,x20 csello x13,x13,x21 csello x14,x14,x22 csello x15,x15,x23 csello x16,x16,x24 stp x11,x12,[x0] stp x13,x14,[x0,#16] stp x15,x16,[x0,#32] ldp x19,x20,[x29,#2*__SIZEOF_POINTER__] ldp x21,x22,[x29,#4*__SIZEOF_POINTER__] ldp x23,x24,[x29,#6*__SIZEOF_POINTER__] ldp x25,x26,[x29,#8*__SIZEOF_POINTER__] ldp x27,x28,[x29,#10*__SIZEOF_POINTER__] ldr x29,[sp],#16*__SIZEOF_POINTER__ hint #29 ret ENDP ALIGN 32 |__mul_by_1_mont_384| PROC ldp x11,x12,[x1] ldp x13,x14,[x1,#16] mul x26,x4,x11 ldp x15,x16,[x1,#32] mul x20,x6,x26 mul x21,x7,x26 mul x22,x8,x26 mul x23,x9,x26 mul x24,x10,x26 subs xzr,x11,#1 umulh x11,x5,x26 adcs x20,x20,x12 umulh x12,x6,x26 adcs x21,x21,x13 umulh x13,x7,x26 adcs x22,x22,x14 umulh x14,x8,x26 adcs x23,x23,x15 umulh x15,x9,x26 adcs x24,x24,x16 umulh x16,x10,x26 adc x25,xzr,xzr adds x11,x11,x20 adcs x12,x12,x21 adcs x13,x13,x22 mul x26,x4,x11 adcs x14,x14,x23 adcs x15,x15,x24 adc x16,x16,x25 mul x20,x6,x26 mul x21,x7,x26 mul x22,x8,x26 mul x23,x9,x26 mul x24,x10,x26 subs xzr,x11,#1 umulh x11,x5,x26 adcs x20,x20,x12 umulh x12,x6,x26 adcs x21,x21,x13 umulh x13,x7,x26 adcs x22,x22,x14 umulh x14,x8,x26 adcs x23,x23,x15 umulh x15,x9,x26 adcs x24,x24,x16 umulh x16,x10,x26 adc x25,xzr,xzr adds x11,x11,x20 adcs x12,x12,x21 adcs x13,x13,x22 mul x26,x4,x11 adcs x14,x14,x23 adcs x15,x15,x24 adc x16,x16,x25 mul x20,x6,x26 mul x21,x7,x26 mul x22,x8,x26 mul x23,x9,x26 mul x24,x10,x26 subs xzr,x11,#1 umulh x11,x5,x26 adcs x20,x20,x12 umulh x12,x6,x26 adcs x21,x21,x13 umulh x13,x7,x26 adcs x22,x22,x14 umulh 
x14,x8,x26 adcs x23,x23,x15 umulh x15,x9,x26 adcs x24,x24,x16 umulh x16,x10,x26 adc x25,xzr,xzr adds x11,x11,x20 adcs x12,x12,x21 adcs x13,x13,x22 mul x26,x4,x11 adcs x14,x14,x23 adcs x15,x15,x24 adc x16,x16,x25 mul x20,x6,x26 mul x21,x7,x26 mul x22,x8,x26 mul x23,x9,x26 mul x24,x10,x26 subs xzr,x11,#1 umulh x11,x5,x26 adcs x20,x20,x12 umulh x12,x6,x26 adcs x21,x21,x13 umulh x13,x7,x26 adcs x22,x22,x14 umulh x14,x8,x26 adcs x23,x23,x15 umulh x15,x9,x26 adcs x24,x24,x16 umulh x16,x10,x26 adc x25,xzr,xzr adds x11,x11,x20 adcs x12,x12,x21 adcs x13,x13,x22 mul x26,x4,x11 adcs x14,x14,x23 adcs x15,x15,x24 adc x16,x16,x25 mul x20,x6,x26 mul x21,x7,x26 mul x22,x8,x26 mul x23,x9,x26 mul x24,x10,x26 subs xzr,x11,#1 umulh x11,x5,x26 adcs x20,x20,x12 umulh x12,x6,x26 adcs x21,x21,x13 umulh x13,x7,x26 adcs x22,x22,x14 umulh x14,x8,x26 adcs x23,x23,x15 umulh x15,x9,x26 adcs x24,x24,x16 umulh x16,x10,x26 adc x25,xzr,xzr adds x11,x11,x20 adcs x12,x12,x21 adcs x13,x13,x22 mul x26,x4,x11 adcs x14,x14,x23 adcs x15,x15,x24 adc x16,x16,x25 mul x20,x6,x26 mul x21,x7,x26 mul x22,x8,x26 mul x23,x9,x26 mul x24,x10,x26 subs xzr,x11,#1 umulh x11,x5,x26 adcs x20,x20,x12 umulh x12,x6,x26 adcs x21,x21,x13 umulh x13,x7,x26 adcs x22,x22,x14 umulh x14,x8,x26 adcs x23,x23,x15 umulh x15,x9,x26 adcs x24,x24,x16 umulh x16,x10,x26 adc x25,xzr,xzr adds x11,x11,x20 adcs x12,x12,x21 adcs x13,x13,x22 adcs x14,x14,x23 adcs x15,x15,x24 adc x16,x16,x25 ret ENDP ALIGN 32 |__redc_tail_mont_384| PROC ldp x19,x20,[x1,#48] ldp x21,x22,[x1,#64] ldp x23,x24,[x1,#80] adds x11,x11,x19 adcs x12,x12,x20 adcs x13,x13,x21 adcs x14,x14,x22 adcs x15,x15,x23 adcs x16,x16,x24 adc x25,xzr,xzr subs x19,x11,x5 sbcs x20,x12,x6 sbcs x21,x13,x7 sbcs x22,x14,x8 sbcs x23,x15,x9 sbcs x24,x16,x10 sbcs xzr,x25,xzr csello x11,x11,x19 csello x12,x12,x20 csello x13,x13,x21 csello x14,x14,x22 csello x15,x15,x23 csello x16,x16,x24 stp x11,x12,[x0] stp x13,x14,[x0,#16] stp x15,x16,[x0,#32] ret ENDP EXPORT |mul_384|[FUNC] ALIGN 32 |mul_384| PROC hint #25 stp x29,x30,[sp,#-16*__SIZEOF_POINTER__]! 
add x29,sp,#0 stp x19,x20,[sp,#2*__SIZEOF_POINTER__] stp x21,x22,[sp,#4*__SIZEOF_POINTER__] stp x23,x24,[sp,#6*__SIZEOF_POINTER__] stp x25,x26,[sp,#8*__SIZEOF_POINTER__] stp x27,x28,[sp,#10*__SIZEOF_POINTER__] bl __mul_384 ldr x30,[x29,#__SIZEOF_POINTER__] ldp x19,x20,[x29,#2*__SIZEOF_POINTER__] ldp x21,x22,[x29,#4*__SIZEOF_POINTER__] ldp x23,x24,[x29,#6*__SIZEOF_POINTER__] ldp x25,x26,[x29,#8*__SIZEOF_POINTER__] ldp x27,x28,[x29,#10*__SIZEOF_POINTER__] ldr x29,[sp],#16*__SIZEOF_POINTER__ hint #29 ret ENDP ALIGN 32 |__mul_384| PROC ldp x11,x12,[x1] ldr x17, [x2] ldp x13,x14,[x1,#16] ldp x15,x16,[x1,#32] mul x19,x11,x17 mul x20,x12,x17 mul x21,x13,x17 mul x22,x14,x17 mul x23,x15,x17 mul x24,x16,x17 umulh x5,x11,x17 umulh x6,x12,x17 umulh x7,x13,x17 umulh x8,x14,x17 umulh x9,x15,x17 umulh x10,x16,x17 ldr x17,[x2,8*1] str x19,[x0] adds x19,x20,x5 mul x5,x11,x17 adcs x20,x21,x6 mul x6,x12,x17 adcs x21,x22,x7 mul x7,x13,x17 adcs x22,x23,x8 mul x8,x14,x17 adcs x23,x24,x9 mul x9,x15,x17 adc x24,xzr, x10 mul x10,x16,x17 adds x19,x19,x5 umulh x5,x11,x17 adcs x20,x20,x6 umulh x6,x12,x17 adcs x21,x21,x7 umulh x7,x13,x17 adcs x22,x22,x8 umulh x8,x14,x17 adcs x23,x23,x9 umulh x9,x15,x17 adcs x24,x24,x10 umulh x10,x16,x17 ldr x17,[x2,#8*(1+1)] adc x25,xzr,xzr str x19,[x0,8*1] adds x19,x20,x5 mul x5,x11,x17 adcs x20,x21,x6 mul x6,x12,x17 adcs x21,x22,x7 mul x7,x13,x17 adcs x22,x23,x8 mul x8,x14,x17 adcs x23,x24,x9 mul x9,x15,x17 adc x24,x25,x10 mul x10,x16,x17 adds x19,x19,x5 umulh x5,x11,x17 adcs x20,x20,x6 umulh x6,x12,x17 adcs x21,x21,x7 umulh x7,x13,x17 adcs x22,x22,x8 umulh x8,x14,x17 adcs x23,x23,x9 umulh x9,x15,x17 adcs x24,x24,x10 umulh x10,x16,x17 ldr x17,[x2,#8*(2+1)] adc x25,xzr,xzr str x19,[x0,8*2] adds x19,x20,x5 mul x5,x11,x17 adcs x20,x21,x6 mul x6,x12,x17 adcs x21,x22,x7 mul x7,x13,x17 adcs x22,x23,x8 mul x8,x14,x17 adcs x23,x24,x9 mul x9,x15,x17 adc x24,x25,x10 mul x10,x16,x17 adds x19,x19,x5 umulh x5,x11,x17 adcs x20,x20,x6 umulh x6,x12,x17 adcs x21,x21,x7 umulh x7,x13,x17 adcs x22,x22,x8 umulh x8,x14,x17 adcs x23,x23,x9 umulh x9,x15,x17 adcs x24,x24,x10 umulh x10,x16,x17 ldr x17,[x2,#8*(3+1)] adc x25,xzr,xzr str x19,[x0,8*3] adds x19,x20,x5 mul x5,x11,x17 adcs x20,x21,x6 mul x6,x12,x17 adcs x21,x22,x7 mul x7,x13,x17 adcs x22,x23,x8 mul x8,x14,x17 adcs x23,x24,x9 mul x9,x15,x17 adc x24,x25,x10 mul x10,x16,x17 adds x19,x19,x5 umulh x5,x11,x17 adcs x20,x20,x6 umulh x6,x12,x17 adcs x21,x21,x7 umulh x7,x13,x17 adcs x22,x22,x8 umulh x8,x14,x17 adcs x23,x23,x9 umulh x9,x15,x17 adcs x24,x24,x10 umulh x10,x16,x17 ldr x17,[x2,#8*(4+1)] adc x25,xzr,xzr str x19,[x0,8*4] adds x19,x20,x5 mul x5,x11,x17 adcs x20,x21,x6 mul x6,x12,x17 adcs x21,x22,x7 mul x7,x13,x17 adcs x22,x23,x8 mul x8,x14,x17 adcs x23,x24,x9 mul x9,x15,x17 adc x24,x25,x10 mul x10,x16,x17 adds x19,x19,x5 umulh x5,x11,x17 adcs x20,x20,x6 umulh x6,x12,x17 adcs x21,x21,x7 umulh x7,x13,x17 adcs x22,x22,x8 umulh x8,x14,x17 adcs x23,x23,x9 umulh x9,x15,x17 adcs x24,x24,x10 umulh x10,x16,x17 adc x25,xzr,xzr str x19,[x0,8*5] adds x19,x20,x5 adcs x20,x21,x6 adcs x21,x22,x7 adcs x22,x23,x8 adcs x23,x24,x9 adc x24,x25,x10 stp x19,x20,[x0,#48] stp x21,x22,[x0,#64] stp x23,x24,[x0,#80] ret ENDP EXPORT |mul_382x|[FUNC] ALIGN 32 |mul_382x| PROC hint #25 stp x29,x30,[sp,#-16*__SIZEOF_POINTER__]! 
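
__mul_384 above is a plain operand-scanning (schoolbook) product: for each limb of b, one row of mul/umulh partial products is accumulated into the result window. The same computation in C (a sketch only, assuming unsigned __int128):

#include <stdint.h>

typedef unsigned __int128 u128;   /* GCC/Clang extension */

/* r[0..11] = a[0..5] * b[0..5], operand scanning. */
static void mul_384(uint64_t r[12], const uint64_t a[6], const uint64_t b[6])
{
    for (int i = 0; i < 12; i++) r[i] = 0;
    for (int j = 0; j < 6; j++) {
        uint64_t carry = 0;
        for (int i = 0; i < 6; i++) {    /* one row: r += (a * b[j]) << 64j */
            u128 t = (u128)a[i] * b[j] + r[i + j] + carry;
            r[i + j] = (uint64_t)t;
            carry = (uint64_t)(t >> 64);
        }
        r[j + 6] = carry;
    }
}
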
add x29,sp,#0 stp x19,x20,[sp,#2*__SIZEOF_POINTER__] stp x21,x22,[sp,#4*__SIZEOF_POINTER__] stp x23,x24,[sp,#6*__SIZEOF_POINTER__] stp x25,x26,[sp,#8*__SIZEOF_POINTER__] stp x27,x28,[sp,#10*__SIZEOF_POINTER__] sub sp,sp,#96 ldp x11,x12,[x1] mov x26,x0 ldp x19,x20,[x1,#48] mov x27,x1 ldp x13,x14,[x1,#16] mov x28,x2 ldp x21,x22,[x1,#64] ldp x15,x16,[x1,#32] adds x5,x11,x19 ldp x23,x24,[x1,#80] adcs x6,x12,x20 ldp x11,x12,[x2] adcs x7,x13,x21 ldp x19,x20,[x2,#48] adcs x8,x14,x22 ldp x13,x14,[x2,#16] adcs x9,x15,x23 ldp x21,x22,[x2,#64] adc x10,x16,x24 ldp x15,x16,[x2,#32] stp x5,x6,[sp] adds x5,x11,x19 ldp x23,x24,[x2,#80] adcs x6,x12,x20 stp x7,x8,[sp,#16] adcs x7,x13,x21 adcs x8,x14,x22 stp x9,x10,[sp,#32] adcs x9,x15,x23 stp x5,x6,[sp,#48] adc x10,x16,x24 stp x7,x8,[sp,#64] stp x9,x10,[sp,#80] bl __mul_384 add x1,sp,#0 add x2,sp,#48 add x0,x26,#96 bl __mul_384 add x1,x27,#48 add x2,x28,#48 add x0,sp,#0 bl __mul_384 ldp x5,x6,[x3] ldp x7,x8,[x3,#16] ldp x9,x10,[x3,#32] add x1,x26,#96 add x2,sp,#0 add x0,x26,#96 bl __sub_mod_384x384 add x2,x26,#0 bl __sub_mod_384x384 add x1,x26,#0 add x2,sp,#0 add x0,x26,#0 bl __sub_mod_384x384 ldr x30,[x29,#__SIZEOF_POINTER__] add sp,sp,#96 ldp x19,x20,[x29,#2*__SIZEOF_POINTER__] ldp x21,x22,[x29,#4*__SIZEOF_POINTER__] ldp x23,x24,[x29,#6*__SIZEOF_POINTER__] ldp x25,x26,[x29,#8*__SIZEOF_POINTER__] ldp x27,x28,[x29,#10*__SIZEOF_POINTER__] ldr x29,[sp],#16*__SIZEOF_POINTER__ hint #29 ret ENDP EXPORT |sqr_382x|[FUNC] ALIGN 32 |sqr_382x| PROC hint #25 stp x29,x30,[sp,#-16*__SIZEOF_POINTER__]! add x29,sp,#0 stp x19,x20,[sp,#2*__SIZEOF_POINTER__] stp x21,x22,[sp,#4*__SIZEOF_POINTER__] stp x23,x24,[sp,#6*__SIZEOF_POINTER__] stp x25,x26,[sp,#8*__SIZEOF_POINTER__] stp x27,x28,[sp,#10*__SIZEOF_POINTER__] ldp x11,x12,[x1] ldp x19,x20,[x1,#48] ldp x13,x14,[x1,#16] adds x5,x11,x19 ldp x21,x22,[x1,#64] adcs x6,x12,x20 ldp x15,x16,[x1,#32] adcs x7,x13,x21 ldp x23,x24,[x1,#80] adcs x8,x14,x22 stp x5,x6,[x0] adcs x9,x15,x23 ldp x5,x6,[x2] adc x10,x16,x24 stp x7,x8,[x0,#16] subs x11,x11,x19 ldp x7,x8,[x2,#16] sbcs x12,x12,x20 stp x9,x10,[x0,#32] sbcs x13,x13,x21 ldp x9,x10,[x2,#32] sbcs x14,x14,x22 sbcs x15,x15,x23 sbcs x16,x16,x24 sbc x25,xzr,xzr and x19,x5,x25 and x20,x6,x25 adds x11,x11,x19 and x21,x7,x25 adcs x12,x12,x20 and x22,x8,x25 adcs x13,x13,x21 and x23,x9,x25 adcs x14,x14,x22 and x24,x10,x25 adcs x15,x15,x23 stp x11,x12,[x0,#48] adc x16,x16,x24 stp x13,x14,[x0,#64] stp x15,x16,[x0,#80] mov x4,x1 add x1,x0,#0 add x2,x0,#48 bl __mul_384 add x1,x4,#0 add x2,x4,#48 add x0,x0,#96 bl __mul_384 ldr x30,[x29,#__SIZEOF_POINTER__] ldp x11,x12,[x0] ldp x13,x14,[x0,#16] adds x11,x11,x11 ldp x15,x16,[x0,#32] adcs x12,x12,x12 adcs x13,x13,x13 adcs x14,x14,x14 adcs x15,x15,x15 adcs x16,x16,x16 adcs x19,x19,x19 adcs x20,x20,x20 stp x11,x12,[x0] adcs x21,x21,x21 stp x13,x14,[x0,#16] adcs x22,x22,x22 stp x15,x16,[x0,#32] adcs x23,x23,x23 stp x19,x20,[x0,#48] adc x24,x24,x24 stp x21,x22,[x0,#64] stp x23,x24,[x0,#80] ldp x19,x20,[x29,#2*__SIZEOF_POINTER__] ldp x21,x22,[x29,#4*__SIZEOF_POINTER__] ldp x23,x24,[x29,#6*__SIZEOF_POINTER__] ldp x25,x26,[x29,#8*__SIZEOF_POINTER__] ldp x27,x28,[x29,#10*__SIZEOF_POINTER__] ldr x29,[sp],#16*__SIZEOF_POINTER__ hint #29 ret ENDP EXPORT |sqr_mont_382x|[FUNC] ALIGN 32 |sqr_mont_382x| PROC hint #25 stp x29,x30,[sp,#-16*__SIZEOF_POINTER__]! 
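
mul_382x above multiplies in Fp2 = Fp[i]/(i^2 + 1) with the Karatsuba-style identity: three 384-bit products, a0*b0, a1*b1 and (a0+a1)*(b0+b1), instead of four, with the real and imaginary parts recovered by subtractions. The identity itself, demonstrated over plain integers (a toy example, no modular reduction):

#include <stdio.h>
#include <stdint.h>
#include <assert.h>

int main(void)
{
    int64_t a0 = 7, a1 = -3, b0 = 5, b1 = 11;

    int64_t t0 = a0 * b0;                /* three multiplications */
    int64_t t1 = a1 * b1;
    int64_t t2 = (a0 + a1) * (b0 + b1);

    int64_t re = t0 - t1;                /* i^2 = -1 */
    int64_t im = t2 - t0 - t1;

    assert(re == a0 * b0 - a1 * b1);     /* matches the 4-mult formula */
    assert(im == a0 * b1 + a1 * b0);
    printf("re=%lld im=%lld\n", (long long)re, (long long)im);
    return 0;
}
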
add x29,sp,#0 stp x19,x20,[sp,#2*__SIZEOF_POINTER__] stp x21,x22,[sp,#4*__SIZEOF_POINTER__] stp x23,x24,[sp,#6*__SIZEOF_POINTER__] stp x25,x26,[sp,#8*__SIZEOF_POINTER__] stp x27,x28,[sp,#10*__SIZEOF_POINTER__] stp x3,x0,[sp,#12*__SIZEOF_POINTER__] sub sp,sp,#112 mov x4,x3 ldp x11,x12,[x1] ldp x13,x14,[x1,#16] ldp x15,x16,[x1,#32] ldp x17,x20,[x1,#48] ldp x21,x22,[x1,#64] ldp x23,x24,[x1,#80] adds x5,x11,x17 adcs x6,x12,x20 adcs x7,x13,x21 adcs x8,x14,x22 adcs x9,x15,x23 adc x10,x16,x24 subs x19,x11,x17 sbcs x20,x12,x20 sbcs x21,x13,x21 sbcs x22,x14,x22 sbcs x23,x15,x23 sbcs x24,x16,x24 sbc x25,xzr,xzr stp x5,x6,[sp] stp x7,x8,[sp,#16] stp x9,x10,[sp,#32] stp x19,x20,[sp,#48] stp x21,x22,[sp,#64] stp x23,x24,[sp,#80] str x25,[sp,#96] ldp x5,x6,[x2] ldp x7,x8,[x2,#16] ldp x9,x10,[x2,#32] add x2,x1,#48 bl __mul_mont_383_nonred adds x19,x11,x11 adcs x20,x12,x12 adcs x21,x13,x13 adcs x22,x14,x14 adcs x23,x15,x15 adc x24,x16,x16 stp x19,x20,[x2,#48] stp x21,x22,[x2,#64] stp x23,x24,[x2,#80] ldp x11,x12,[sp] ldr x17,[sp,#48] ldp x13,x14,[sp,#16] ldp x15,x16,[sp,#32] add x2,sp,#48 bl __mul_mont_383_nonred ldr x30,[x29,#__SIZEOF_POINTER__] ldr x25,[sp,#96] ldp x19,x20,[sp] ldp x21,x22,[sp,#16] ldp x23,x24,[sp,#32] and x19,x19,x25 and x20,x20,x25 and x21,x21,x25 and x22,x22,x25 and x23,x23,x25 and x24,x24,x25 subs x11,x11,x19 sbcs x12,x12,x20 sbcs x13,x13,x21 sbcs x14,x14,x22 sbcs x15,x15,x23 sbcs x16,x16,x24 sbc x25,xzr,xzr and x19,x5,x25 and x20,x6,x25 and x21,x7,x25 and x22,x8,x25 and x23,x9,x25 and x24,x10,x25 adds x11,x11,x19 adcs x12,x12,x20 adcs x13,x13,x21 adcs x14,x14,x22 adcs x15,x15,x23 adc x16,x16,x24 stp x11,x12,[x2] stp x13,x14,[x2,#16] stp x15,x16,[x2,#32] add sp,sp,#112 ldp x19,x20,[x29,#2*__SIZEOF_POINTER__] ldp x21,x22,[x29,#4*__SIZEOF_POINTER__] ldp x23,x24,[x29,#6*__SIZEOF_POINTER__] ldp x25,x26,[x29,#8*__SIZEOF_POINTER__] ldp x27,x28,[x29,#10*__SIZEOF_POINTER__] ldr x29,[sp],#16*__SIZEOF_POINTER__ hint #29 ret ENDP ALIGN 32 |__mul_mont_383_nonred| PROC mul x19,x11,x17 mul x20,x12,x17 mul x21,x13,x17 mul x22,x14,x17 mul x23,x15,x17 mul x24,x16,x17 mul x4,x4,x19 umulh x26,x11,x17 umulh x27,x12,x17 umulh x28,x13,x17 umulh x0,x14,x17 umulh x1,x15,x17 umulh x3,x16,x17 adds x20,x20,x26 mul x26,x5,x4 adcs x21,x21,x27 mul x27,x6,x4 adcs x22,x22,x28 mul x28,x7,x4 adcs x23,x23,x0 mul x0,x8,x4 adcs x24,x24,x1 mul x1,x9,x4 adc x25,xzr, x3 mul x3,x10,x4 ldr x17,[x2,8*1] adds x19,x19,x26 umulh x26,x5,x4 adcs x20,x20,x27 umulh x27,x6,x4 adcs x21,x21,x28 umulh x28,x7,x4 adcs x22,x22,x0 umulh x0,x8,x4 adcs x23,x23,x1 umulh x1,x9,x4 adcs x24,x24,x3 umulh x3,x10,x4 adc x25,x25,xzr ldr x4,[x29,#12*__SIZEOF_POINTER__] adds x19,x20,x26 mul x26,x11,x17 adcs x20,x21,x27 mul x27,x12,x17 adcs x21,x22,x28 mul x28,x13,x17 adcs x22,x23,x0 mul x0,x14,x17 adcs x23,x24,x1 mul x1,x15,x17 adcs x24,x25,x3 mul x3,x16,x17 adc x25,xzr,xzr adds x19,x19,x26 umulh x26,x11,x17 adcs x20,x20,x27 umulh x27,x12,x17 adcs x21,x21,x28 mul x4,x4,x19 umulh x28,x13,x17 adcs x22,x22,x0 umulh x0,x14,x17 adcs x23,x23,x1 umulh x1,x15,x17 adcs x24,x24,x3 umulh x3,x16,x17 adc x25,x25,xzr adds x20,x20,x26 mul x26,x5,x4 adcs x21,x21,x27 mul x27,x6,x4 adcs x22,x22,x28 mul x28,x7,x4 adcs x23,x23,x0 mul x0,x8,x4 adcs x24,x24,x1 mul x1,x9,x4 adc x25,x25,x3 mul x3,x10,x4 ldr x17,[x2,8*2] adds x19,x19,x26 umulh x26,x5,x4 adcs x20,x20,x27 umulh x27,x6,x4 adcs x21,x21,x28 umulh x28,x7,x4 adcs x22,x22,x0 umulh x0,x8,x4 adcs x23,x23,x1 umulh x1,x9,x4 adcs x24,x24,x3 umulh x3,x10,x4 adc x25,x25,xzr ldr x4,[x29,#12*__SIZEOF_POINTER__] adds 
x19,x20,x26 mul x26,x11,x17 adcs x20,x21,x27 mul x27,x12,x17 adcs x21,x22,x28 mul x28,x13,x17 adcs x22,x23,x0 mul x0,x14,x17 adcs x23,x24,x1 mul x1,x15,x17 adcs x24,x25,x3 mul x3,x16,x17 adc x25,xzr,xzr adds x19,x19,x26 umulh x26,x11,x17 adcs x20,x20,x27 umulh x27,x12,x17 adcs x21,x21,x28 mul x4,x4,x19 umulh x28,x13,x17 adcs x22,x22,x0 umulh x0,x14,x17 adcs x23,x23,x1 umulh x1,x15,x17 adcs x24,x24,x3 umulh x3,x16,x17 adc x25,x25,xzr adds x20,x20,x26 mul x26,x5,x4 adcs x21,x21,x27 mul x27,x6,x4 adcs x22,x22,x28 mul x28,x7,x4 adcs x23,x23,x0 mul x0,x8,x4 adcs x24,x24,x1 mul x1,x9,x4 adc x25,x25,x3 mul x3,x10,x4 ldr x17,[x2,8*3] adds x19,x19,x26 umulh x26,x5,x4 adcs x20,x20,x27 umulh x27,x6,x4 adcs x21,x21,x28 umulh x28,x7,x4 adcs x22,x22,x0 umulh x0,x8,x4 adcs x23,x23,x1 umulh x1,x9,x4 adcs x24,x24,x3 umulh x3,x10,x4 adc x25,x25,xzr ldr x4,[x29,#12*__SIZEOF_POINTER__] adds x19,x20,x26 mul x26,x11,x17 adcs x20,x21,x27 mul x27,x12,x17 adcs x21,x22,x28 mul x28,x13,x17 adcs x22,x23,x0 mul x0,x14,x17 adcs x23,x24,x1 mul x1,x15,x17 adcs x24,x25,x3 mul x3,x16,x17 adc x25,xzr,xzr adds x19,x19,x26 umulh x26,x11,x17 adcs x20,x20,x27 umulh x27,x12,x17 adcs x21,x21,x28 mul x4,x4,x19 umulh x28,x13,x17 adcs x22,x22,x0 umulh x0,x14,x17 adcs x23,x23,x1 umulh x1,x15,x17 adcs x24,x24,x3 umulh x3,x16,x17 adc x25,x25,xzr adds x20,x20,x26 mul x26,x5,x4 adcs x21,x21,x27 mul x27,x6,x4 adcs x22,x22,x28 mul x28,x7,x4 adcs x23,x23,x0 mul x0,x8,x4 adcs x24,x24,x1 mul x1,x9,x4 adc x25,x25,x3 mul x3,x10,x4 ldr x17,[x2,8*4] adds x19,x19,x26 umulh x26,x5,x4 adcs x20,x20,x27 umulh x27,x6,x4 adcs x21,x21,x28 umulh x28,x7,x4 adcs x22,x22,x0 umulh x0,x8,x4 adcs x23,x23,x1 umulh x1,x9,x4 adcs x24,x24,x3 umulh x3,x10,x4 adc x25,x25,xzr ldr x4,[x29,#12*__SIZEOF_POINTER__] adds x19,x20,x26 mul x26,x11,x17 adcs x20,x21,x27 mul x27,x12,x17 adcs x21,x22,x28 mul x28,x13,x17 adcs x22,x23,x0 mul x0,x14,x17 adcs x23,x24,x1 mul x1,x15,x17 adcs x24,x25,x3 mul x3,x16,x17 adc x25,xzr,xzr adds x19,x19,x26 umulh x26,x11,x17 adcs x20,x20,x27 umulh x27,x12,x17 adcs x21,x21,x28 mul x4,x4,x19 umulh x28,x13,x17 adcs x22,x22,x0 umulh x0,x14,x17 adcs x23,x23,x1 umulh x1,x15,x17 adcs x24,x24,x3 umulh x3,x16,x17 adc x25,x25,xzr adds x20,x20,x26 mul x26,x5,x4 adcs x21,x21,x27 mul x27,x6,x4 adcs x22,x22,x28 mul x28,x7,x4 adcs x23,x23,x0 mul x0,x8,x4 adcs x24,x24,x1 mul x1,x9,x4 adc x25,x25,x3 mul x3,x10,x4 ldr x17,[x2,8*5] adds x19,x19,x26 umulh x26,x5,x4 adcs x20,x20,x27 umulh x27,x6,x4 adcs x21,x21,x28 umulh x28,x7,x4 adcs x22,x22,x0 umulh x0,x8,x4 adcs x23,x23,x1 umulh x1,x9,x4 adcs x24,x24,x3 umulh x3,x10,x4 adc x25,x25,xzr ldr x4,[x29,#12*__SIZEOF_POINTER__] adds x19,x20,x26 mul x26,x11,x17 adcs x20,x21,x27 mul x27,x12,x17 adcs x21,x22,x28 mul x28,x13,x17 adcs x22,x23,x0 mul x0,x14,x17 adcs x23,x24,x1 mul x1,x15,x17 adcs x24,x25,x3 mul x3,x16,x17 adc x25,xzr,xzr adds x19,x19,x26 umulh x26,x11,x17 adcs x20,x20,x27 umulh x27,x12,x17 adcs x21,x21,x28 mul x4,x4,x19 umulh x28,x13,x17 adcs x22,x22,x0 umulh x0,x14,x17 adcs x23,x23,x1 umulh x1,x15,x17 adcs x24,x24,x3 umulh x3,x16,x17 adc x25,x25,xzr adds x20,x20,x26 mul x26,x5,x4 adcs x21,x21,x27 mul x27,x6,x4 adcs x22,x22,x28 mul x28,x7,x4 adcs x23,x23,x0 mul x0,x8,x4 adcs x24,x24,x1 mul x1,x9,x4 adc x25,x25,x3 mul x3,x10,x4 adds x19,x19,x26 umulh x26,x5,x4 adcs x20,x20,x27 umulh x27,x6,x4 adcs x21,x21,x28 umulh x28,x7,x4 adcs x22,x22,x0 umulh x0,x8,x4 adcs x23,x23,x1 umulh x1,x9,x4 adcs x24,x24,x3 umulh x3,x10,x4 adc x25,x25,xzr ldp x4,x2,[x29,#12*__SIZEOF_POINTER__] adds x11,x20,x26 adcs x12,x21,x27 
adcs x13,x22,x28 adcs x14,x23,x0 adcs x15,x24,x1 adcs x16,x25,x3 ret ENDP EXPORT |sgn0_pty_mont_384|[FUNC] ALIGN 32 |sgn0_pty_mont_384| PROC hint #25 stp x29,x30,[sp,#-16*__SIZEOF_POINTER__]! add x29,sp,#0 stp x19,x20,[sp,#2*__SIZEOF_POINTER__] stp x21,x22,[sp,#4*__SIZEOF_POINTER__] stp x23,x24,[sp,#6*__SIZEOF_POINTER__] stp x25,x26,[sp,#8*__SIZEOF_POINTER__] stp x27,x28,[sp,#10*__SIZEOF_POINTER__] mov x4,x2 ldp x5,x6,[x1] ldp x7,x8,[x1,#16] ldp x9,x10,[x1,#32] mov x1,x0 bl __mul_by_1_mont_384 ldr x30,[x29,#__SIZEOF_POINTER__] and x0,x11,#1 adds x11,x11,x11 adcs x12,x12,x12 adcs x13,x13,x13 adcs x14,x14,x14 adcs x15,x15,x15 adcs x16,x16,x16 adc x17,xzr,xzr subs x11,x11,x5 sbcs x12,x12,x6 sbcs x13,x13,x7 sbcs x14,x14,x8 sbcs x15,x15,x9 sbcs x16,x16,x10 sbc x17,x17,xzr mvn x17,x17 and x17,x17,#2 orr x0,x0,x17 ldp x19,x20,[x29,#2*__SIZEOF_POINTER__] ldp x21,x22,[x29,#4*__SIZEOF_POINTER__] ldp x23,x24,[x29,#6*__SIZEOF_POINTER__] ldp x25,x26,[x29,#8*__SIZEOF_POINTER__] ldp x27,x28,[x29,#10*__SIZEOF_POINTER__] ldr x29,[sp],#16*__SIZEOF_POINTER__ hint #29 ret ENDP EXPORT |sgn0_pty_mont_384x|[FUNC] ALIGN 32 |sgn0_pty_mont_384x| PROC hint #25 stp x29,x30,[sp,#-16*__SIZEOF_POINTER__]! add x29,sp,#0 stp x19,x20,[sp,#2*__SIZEOF_POINTER__] stp x21,x22,[sp,#4*__SIZEOF_POINTER__] stp x23,x24,[sp,#6*__SIZEOF_POINTER__] stp x25,x26,[sp,#8*__SIZEOF_POINTER__] stp x27,x28,[sp,#10*__SIZEOF_POINTER__] mov x4,x2 ldp x5,x6,[x1] ldp x7,x8,[x1,#16] ldp x9,x10,[x1,#32] mov x1,x0 bl __mul_by_1_mont_384 add x1,x1,#48 and x2,x11,#1 orr x3,x11,x12 adds x11,x11,x11 orr x3,x3,x13 adcs x12,x12,x12 orr x3,x3,x14 adcs x13,x13,x13 orr x3,x3,x15 adcs x14,x14,x14 orr x3,x3,x16 adcs x15,x15,x15 adcs x16,x16,x16 adc x17,xzr,xzr subs x11,x11,x5 sbcs x12,x12,x6 sbcs x13,x13,x7 sbcs x14,x14,x8 sbcs x15,x15,x9 sbcs x16,x16,x10 sbc x17,x17,xzr mvn x17,x17 and x17,x17,#2 orr x2,x2,x17 bl __mul_by_1_mont_384 ldr x30,[x29,#__SIZEOF_POINTER__] and x0,x11,#1 orr x1,x11,x12 adds x11,x11,x11 orr x1,x1,x13 adcs x12,x12,x12 orr x1,x1,x14 adcs x13,x13,x13 orr x1,x1,x15 adcs x14,x14,x14 orr x1,x1,x16 adcs x15,x15,x15 adcs x16,x16,x16 adc x17,xzr,xzr subs x11,x11,x5 sbcs x12,x12,x6 sbcs x13,x13,x7 sbcs x14,x14,x8 sbcs x15,x15,x9 sbcs x16,x16,x10 sbc x17,x17,xzr mvn x17,x17 and x17,x17,#2 orr x0,x0,x17 cmp x3,#0 cseleq x3,x0,x2 cmp x1,#0 cselne x1,x0,x2 and x3,x3,#1 and x1,x1,#2 orr x0,x1,x3 ldp x19,x20,[x29,#2*__SIZEOF_POINTER__] ldp x21,x22,[x29,#4*__SIZEOF_POINTER__] ldp x23,x24,[x29,#6*__SIZEOF_POINTER__] ldp x25,x26,[x29,#8*__SIZEOF_POINTER__] ldp x27,x28,[x29,#10*__SIZEOF_POINTER__] ldr x29,[sp],#16*__SIZEOF_POINTER__ hint #29 ret ENDP END ================================================ FILE: build/win64/mulq_mont_256-x86_64.asm ================================================ OPTION DOTNAME EXTERN mul_mont_sparse_256$1:NEAR EXTERN sqr_mont_sparse_256$1:NEAR EXTERN from_mont_256$1:NEAR EXTERN redc_mont_256$1:NEAR _DATA SEGMENT COMM __blst_platform_cap:DWORD:1 _DATA ENDS .text$ SEGMENT ALIGN(256) 'CODE' PUBLIC mul_mont_sparse_256 ALIGN 32 mul_mont_sparse_256 PROC PUBLIC DB 243,15,30,250 mov QWORD PTR[8+rsp],rdi ;WIN64 prologue mov QWORD PTR[16+rsp],rsi mov r11,rsp $L$SEH_begin_mul_mont_sparse_256:: mov rdi,rcx mov rsi,rdx mov rdx,r8 mov rcx,r9 mov r8,QWORD PTR[40+rsp] ifdef __BLST_PORTABLE__ test DWORD PTR[__blst_platform_cap],1 jnz mul_mont_sparse_256$1 endif push rbp push rbx push r12 push r13 push r14 push r15 push rdi $L$SEH_body_mul_mont_sparse_256:: mov rax,QWORD PTR[rdx] mov r13,QWORD PTR[rsi] mov r14,QWORD PTR[8+rsi] mov r12,QWORD 
PTR[16+rsi] mov rbp,QWORD PTR[24+rsi] mov rbx,rdx mov r15,rax mul r13 mov r9,rax mov rax,r15 mov r10,rdx call __mulq_mont_sparse_256 mov r15,QWORD PTR[8+rsp] mov r14,QWORD PTR[16+rsp] mov r13,QWORD PTR[24+rsp] mov r12,QWORD PTR[32+rsp] mov rbx,QWORD PTR[40+rsp] mov rbp,QWORD PTR[48+rsp] lea rsp,QWORD PTR[56+rsp] $L$SEH_epilogue_mul_mont_sparse_256:: mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue mov rsi,QWORD PTR[16+rsp] ifdef __SGX_LVI_HARDENING__ pop rdx lfence jmp rdx ud2 else DB 0F3h,0C3h endif $L$SEH_end_mul_mont_sparse_256:: mul_mont_sparse_256 ENDP PUBLIC sqr_mont_sparse_256 ALIGN 32 sqr_mont_sparse_256 PROC PUBLIC DB 243,15,30,250 mov QWORD PTR[8+rsp],rdi ;WIN64 prologue mov QWORD PTR[16+rsp],rsi mov r11,rsp $L$SEH_begin_sqr_mont_sparse_256:: mov rdi,rcx mov rsi,rdx mov rdx,r8 mov rcx,r9 ifdef __BLST_PORTABLE__ test DWORD PTR[__blst_platform_cap],1 jnz sqr_mont_sparse_256$1 endif push rbp push rbx push r12 push r13 push r14 push r15 push rdi $L$SEH_body_sqr_mont_sparse_256:: mov rax,QWORD PTR[rsi] mov r8,rcx mov r14,QWORD PTR[8+rsi] mov rcx,rdx mov r12,QWORD PTR[16+rsi] lea rbx,QWORD PTR[rsi] mov rbp,QWORD PTR[24+rsi] mov r15,rax mul rax mov r9,rax mov rax,r15 mov r10,rdx call __mulq_mont_sparse_256 mov r15,QWORD PTR[8+rsp] mov r14,QWORD PTR[16+rsp] mov r13,QWORD PTR[24+rsp] mov r12,QWORD PTR[32+rsp] mov rbx,QWORD PTR[40+rsp] mov rbp,QWORD PTR[48+rsp] lea rsp,QWORD PTR[56+rsp] $L$SEH_epilogue_sqr_mont_sparse_256:: mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue mov rsi,QWORD PTR[16+rsp] ifdef __SGX_LVI_HARDENING__ pop rdx lfence jmp rdx ud2 else DB 0F3h,0C3h endif $L$SEH_end_sqr_mont_sparse_256:: sqr_mont_sparse_256 ENDP ALIGN 32 __mulq_mont_sparse_256 PROC PRIVATE DB 243,15,30,250 mul r14 add r10,rax mov rax,r15 adc rdx,0 mov r11,rdx mul r12 add r11,rax mov rax,r15 adc rdx,0 mov r12,rdx mul rbp add r12,rax mov rax,QWORD PTR[8+rbx] adc rdx,0 xor r14,r14 mov r13,rdx mov rdi,r9 imul r9,r8 mov r15,rax mul QWORD PTR[rsi] add r10,rax mov rax,r15 adc rdx,0 mov rbp,rdx mul QWORD PTR[8+rsi] add r11,rax mov rax,r15 adc rdx,0 add r11,rbp adc rdx,0 mov rbp,rdx mul QWORD PTR[16+rsi] add r12,rax mov rax,r15 adc rdx,0 add r12,rbp adc rdx,0 mov rbp,rdx mul QWORD PTR[24+rsi] add r13,rax mov rax,r9 adc rdx,0 add r13,rbp adc r14,rdx xor r15,r15 mul QWORD PTR[rcx] add rdi,rax mov rax,r9 adc rdi,rdx mul QWORD PTR[8+rcx] add r10,rax mov rax,r9 adc rdx,0 add r10,rdi adc rdx,0 mov rbp,rdx mul QWORD PTR[16+rcx] add r11,rax mov rax,r9 adc rdx,0 add r11,rbp adc rdx,0 mov rbp,rdx mul QWORD PTR[24+rcx] add r12,rax mov rax,QWORD PTR[16+rbx] adc rdx,0 add r12,rbp adc rdx,0 add r13,rdx adc r14,0 adc r15,0 mov rdi,r10 imul r10,r8 mov r9,rax mul QWORD PTR[rsi] add r11,rax mov rax,r9 adc rdx,0 mov rbp,rdx mul QWORD PTR[8+rsi] add r12,rax mov rax,r9 adc rdx,0 add r12,rbp adc rdx,0 mov rbp,rdx mul QWORD PTR[16+rsi] add r13,rax mov rax,r9 adc rdx,0 add r13,rbp adc rdx,0 mov rbp,rdx mul QWORD PTR[24+rsi] add r14,rax mov rax,r10 adc rdx,0 add r14,rbp adc r15,rdx xor r9,r9 mul QWORD PTR[rcx] add rdi,rax mov rax,r10 adc rdi,rdx mul QWORD PTR[8+rcx] add r11,rax mov rax,r10 adc rdx,0 add r11,rdi adc rdx,0 mov rbp,rdx mul QWORD PTR[16+rcx] add r12,rax mov rax,r10 adc rdx,0 add r12,rbp adc rdx,0 mov rbp,rdx mul QWORD PTR[24+rcx] add r13,rax mov rax,QWORD PTR[24+rbx] adc rdx,0 add r13,rbp adc rdx,0 add r14,rdx adc r15,0 adc r9,0 mov rdi,r11 imul r11,r8 mov r10,rax mul QWORD PTR[rsi] add r12,rax mov rax,r10 adc rdx,0 mov rbp,rdx mul QWORD PTR[8+rsi] add r13,rax mov rax,r10 adc rdx,0 add r13,rbp adc rdx,0 mov rbp,rdx mul QWORD PTR[16+rsi] 
add r14,rax mov rax,r10 adc rdx,0 add r14,rbp adc rdx,0 mov rbp,rdx mul QWORD PTR[24+rsi] add r15,rax mov rax,r11 adc rdx,0 add r15,rbp adc r9,rdx xor r10,r10 mul QWORD PTR[rcx] add rdi,rax mov rax,r11 adc rdi,rdx mul QWORD PTR[8+rcx] add r12,rax mov rax,r11 adc rdx,0 add r12,rdi adc rdx,0 mov rbp,rdx mul QWORD PTR[16+rcx] add r13,rax mov rax,r11 adc rdx,0 add r13,rbp adc rdx,0 mov rbp,rdx mul QWORD PTR[24+rcx] add r14,rax mov rax,r12 adc rdx,0 add r14,rbp adc rdx,0 add r15,rdx adc r9,0 adc r10,0 imul rax,r8 mov rsi,QWORD PTR[8+rsp] mov r11,rax mul QWORD PTR[rcx] add r12,rax mov rax,r11 adc r12,rdx mul QWORD PTR[8+rcx] add r13,rax mov rax,r11 adc rdx,0 add r13,r12 adc rdx,0 mov rbp,rdx mul QWORD PTR[16+rcx] add r14,rax mov rax,r11 adc rdx,0 add r14,rbp adc rdx,0 mov rbp,rdx mul QWORD PTR[24+rcx] mov rbx,r14 add r15,rbp adc rdx,0 add r15,rax mov rax,r13 adc rdx,0 add r9,rdx adc r10,0 mov r12,r15 sub r13,QWORD PTR[rcx] sbb r14,QWORD PTR[8+rcx] sbb r15,QWORD PTR[16+rcx] mov rbp,r9 sbb r9,QWORD PTR[24+rcx] sbb r10,0 cmovc r13,rax cmovc r14,rbx cmovc r15,r12 mov QWORD PTR[rsi],r13 cmovc r9,rbp mov QWORD PTR[8+rsi],r14 mov QWORD PTR[16+rsi],r15 mov QWORD PTR[24+rsi],r9 ifdef __SGX_LVI_HARDENING__ pop rdx lfence jmp rdx ud2 else DB 0F3h,0C3h endif __mulq_mont_sparse_256 ENDP PUBLIC from_mont_256 ALIGN 32 from_mont_256 PROC PUBLIC DB 243,15,30,250 mov QWORD PTR[8+rsp],rdi ;WIN64 prologue mov QWORD PTR[16+rsp],rsi mov r11,rsp $L$SEH_begin_from_mont_256:: mov rdi,rcx mov rsi,rdx mov rdx,r8 mov rcx,r9 ifdef __BLST_PORTABLE__ test DWORD PTR[__blst_platform_cap],1 jnz from_mont_256$1 endif push rbp push rbx push r12 push r13 push r14 push r15 sub rsp,8 $L$SEH_body_from_mont_256:: mov rbx,rdx call __mulq_by_1_mont_256 mov r10,r14 mov r11,r15 mov r12,r9 sub r13,QWORD PTR[rbx] sbb r14,QWORD PTR[8+rbx] sbb r15,QWORD PTR[16+rbx] sbb r9,QWORD PTR[24+rbx] cmovnc rax,r13 cmovnc r10,r14 cmovnc r11,r15 mov QWORD PTR[rdi],rax cmovnc r12,r9 mov QWORD PTR[8+rdi],r10 mov QWORD PTR[16+rdi],r11 mov QWORD PTR[24+rdi],r12 mov r15,QWORD PTR[8+rsp] mov r14,QWORD PTR[16+rsp] mov r13,QWORD PTR[24+rsp] mov r12,QWORD PTR[32+rsp] mov rbx,QWORD PTR[40+rsp] mov rbp,QWORD PTR[48+rsp] lea rsp,QWORD PTR[56+rsp] $L$SEH_epilogue_from_mont_256:: mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue mov rsi,QWORD PTR[16+rsp] ifdef __SGX_LVI_HARDENING__ pop rdx lfence jmp rdx ud2 else DB 0F3h,0C3h endif $L$SEH_end_from_mont_256:: from_mont_256 ENDP PUBLIC redc_mont_256 ALIGN 32 redc_mont_256 PROC PUBLIC DB 243,15,30,250 mov QWORD PTR[8+rsp],rdi ;WIN64 prologue mov QWORD PTR[16+rsp],rsi mov r11,rsp $L$SEH_begin_redc_mont_256:: mov rdi,rcx mov rsi,rdx mov rdx,r8 mov rcx,r9 ifdef __BLST_PORTABLE__ test DWORD PTR[__blst_platform_cap],1 jnz redc_mont_256$1 endif push rbp push rbx push r12 push r13 push r14 push r15 sub rsp,8 $L$SEH_body_redc_mont_256:: mov rbx,rdx call __mulq_by_1_mont_256 add r13,QWORD PTR[32+rsi] adc r14,QWORD PTR[40+rsi] mov rax,r13 adc r15,QWORD PTR[48+rsi] mov r10,r14 adc r9,QWORD PTR[56+rsi] sbb rsi,rsi mov r11,r15 sub r13,QWORD PTR[rbx] sbb r14,QWORD PTR[8+rbx] sbb r15,QWORD PTR[16+rbx] mov r12,r9 sbb r9,QWORD PTR[24+rbx] sbb rsi,0 cmovnc rax,r13 cmovnc r10,r14 cmovnc r11,r15 mov QWORD PTR[rdi],rax cmovnc r12,r9 mov QWORD PTR[8+rdi],r10 mov QWORD PTR[16+rdi],r11 mov QWORD PTR[24+rdi],r12 mov r15,QWORD PTR[8+rsp] mov r14,QWORD PTR[16+rsp] mov r13,QWORD PTR[24+rsp] mov r12,QWORD PTR[32+rsp] mov rbx,QWORD PTR[40+rsp] mov rbp,QWORD PTR[48+rsp] lea rsp,QWORD PTR[56+rsp] $L$SEH_epilogue_redc_mont_256:: mov rdi,QWORD PTR[8+rsp] ;WIN64 
epilogue mov rsi,QWORD PTR[16+rsp] ifdef __SGX_LVI_HARDENING__ pop rdx lfence jmp rdx ud2 else DB 0F3h,0C3h endif $L$SEH_end_redc_mont_256:: redc_mont_256 ENDP ALIGN 32 __mulq_by_1_mont_256 PROC PRIVATE DB 243,15,30,250 mov rax,QWORD PTR[rsi] mov r10,QWORD PTR[8+rsi] mov r11,QWORD PTR[16+rsi] mov r12,QWORD PTR[24+rsi] mov r13,rax imul rax,rcx mov r9,rax mul QWORD PTR[rbx] add r13,rax mov rax,r9 adc r13,rdx mul QWORD PTR[8+rbx] add r10,rax mov rax,r9 adc rdx,0 add r10,r13 adc rdx,0 mov r13,rdx mul QWORD PTR[16+rbx] mov r14,r10 imul r10,rcx add r11,rax mov rax,r9 adc rdx,0 add r11,r13 adc rdx,0 mov r13,rdx mul QWORD PTR[24+rbx] add r12,rax mov rax,r10 adc rdx,0 add r12,r13 adc rdx,0 mov r13,rdx mul QWORD PTR[rbx] add r14,rax mov rax,r10 adc r14,rdx mul QWORD PTR[8+rbx] add r11,rax mov rax,r10 adc rdx,0 add r11,r14 adc rdx,0 mov r14,rdx mul QWORD PTR[16+rbx] mov r15,r11 imul r11,rcx add r12,rax mov rax,r10 adc rdx,0 add r12,r14 adc rdx,0 mov r14,rdx mul QWORD PTR[24+rbx] add r13,rax mov rax,r11 adc rdx,0 add r13,r14 adc rdx,0 mov r14,rdx mul QWORD PTR[rbx] add r15,rax mov rax,r11 adc r15,rdx mul QWORD PTR[8+rbx] add r12,rax mov rax,r11 adc rdx,0 add r12,r15 adc rdx,0 mov r15,rdx mul QWORD PTR[16+rbx] mov r9,r12 imul r12,rcx add r13,rax mov rax,r11 adc rdx,0 add r13,r15 adc rdx,0 mov r15,rdx mul QWORD PTR[24+rbx] add r14,rax mov rax,r12 adc rdx,0 add r14,r15 adc rdx,0 mov r15,rdx mul QWORD PTR[rbx] add r9,rax mov rax,r12 adc r9,rdx mul QWORD PTR[8+rbx] add r13,rax mov rax,r12 adc rdx,0 add r13,r9 adc rdx,0 mov r9,rdx mul QWORD PTR[16+rbx] add r14,rax mov rax,r12 adc rdx,0 add r14,r9 adc rdx,0 mov r9,rdx mul QWORD PTR[24+rbx] add r15,rax mov rax,r13 adc rdx,0 add r15,r9 adc rdx,0 mov r9,rdx ifdef __SGX_LVI_HARDENING__ pop rdx lfence jmp rdx ud2 else DB 0F3h,0C3h endif __mulq_by_1_mont_256 ENDP .text$ ENDS .pdata SEGMENT READONLY ALIGN(4) ALIGN 4 DD imagerel $L$SEH_begin_mul_mont_sparse_256 DD imagerel $L$SEH_body_mul_mont_sparse_256 DD imagerel $L$SEH_info_mul_mont_sparse_256_prologue DD imagerel $L$SEH_body_mul_mont_sparse_256 DD imagerel $L$SEH_epilogue_mul_mont_sparse_256 DD imagerel $L$SEH_info_mul_mont_sparse_256_body DD imagerel $L$SEH_epilogue_mul_mont_sparse_256 DD imagerel $L$SEH_end_mul_mont_sparse_256 DD imagerel $L$SEH_info_mul_mont_sparse_256_epilogue DD imagerel $L$SEH_begin_sqr_mont_sparse_256 DD imagerel $L$SEH_body_sqr_mont_sparse_256 DD imagerel $L$SEH_info_sqr_mont_sparse_256_prologue DD imagerel $L$SEH_body_sqr_mont_sparse_256 DD imagerel $L$SEH_epilogue_sqr_mont_sparse_256 DD imagerel $L$SEH_info_sqr_mont_sparse_256_body DD imagerel $L$SEH_epilogue_sqr_mont_sparse_256 DD imagerel $L$SEH_end_sqr_mont_sparse_256 DD imagerel $L$SEH_info_sqr_mont_sparse_256_epilogue DD imagerel $L$SEH_begin_from_mont_256 DD imagerel $L$SEH_body_from_mont_256 DD imagerel $L$SEH_info_from_mont_256_prologue DD imagerel $L$SEH_body_from_mont_256 DD imagerel $L$SEH_epilogue_from_mont_256 DD imagerel $L$SEH_info_from_mont_256_body DD imagerel $L$SEH_epilogue_from_mont_256 DD imagerel $L$SEH_end_from_mont_256 DD imagerel $L$SEH_info_from_mont_256_epilogue DD imagerel $L$SEH_begin_redc_mont_256 DD imagerel $L$SEH_body_redc_mont_256 DD imagerel $L$SEH_info_redc_mont_256_prologue DD imagerel $L$SEH_body_redc_mont_256 DD imagerel $L$SEH_epilogue_redc_mont_256 DD imagerel $L$SEH_info_redc_mont_256_body DD imagerel $L$SEH_epilogue_redc_mont_256 DD imagerel $L$SEH_end_redc_mont_256 DD imagerel $L$SEH_info_redc_mont_256_epilogue .pdata ENDS .xdata SEGMENT READONLY ALIGN(8) ALIGN 8 
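; The DB blobs below are hand-encoded Win64 UNWIND_INFO records. Each .pdata
; triplet above pairs a code range (begin, end) with one of these descriptors,
; letting the Windows unwinder restore the non-volatile registers pushed in
; the corresponding prologue.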
$L$SEH_info_mul_mont_sparse_256_prologue:: DB 1,0,5,00bh DB 0,074h,1,0 DB 0,064h,2,0 DB 0,0b3h DB 0,0 DD 0,0 $L$SEH_info_mul_mont_sparse_256_body:: DB 1,0,17,0 DB 000h,0f4h,001h,000h DB 000h,0e4h,002h,000h DB 000h,0d4h,003h,000h DB 000h,0c4h,004h,000h DB 000h,034h,005h,000h DB 000h,054h,006h,000h DB 000h,074h,008h,000h DB 000h,064h,009h,000h DB 000h,062h DB 000h,000h,000h,000h,000h,000h DB 000h,000h,000h,000h $L$SEH_info_mul_mont_sparse_256_epilogue:: DB 1,0,4,0 DB 000h,074h,001h,000h DB 000h,064h,002h,000h DB 000h,000h,000h,000h $L$SEH_info_sqr_mont_sparse_256_prologue:: DB 1,0,5,00bh DB 0,074h,1,0 DB 0,064h,2,0 DB 0,0b3h DB 0,0 DD 0,0 $L$SEH_info_sqr_mont_sparse_256_body:: DB 1,0,17,0 DB 000h,0f4h,001h,000h DB 000h,0e4h,002h,000h DB 000h,0d4h,003h,000h DB 000h,0c4h,004h,000h DB 000h,034h,005h,000h DB 000h,054h,006h,000h DB 000h,074h,008h,000h DB 000h,064h,009h,000h DB 000h,062h DB 000h,000h,000h,000h,000h,000h DB 000h,000h,000h,000h $L$SEH_info_sqr_mont_sparse_256_epilogue:: DB 1,0,4,0 DB 000h,074h,001h,000h DB 000h,064h,002h,000h DB 000h,000h,000h,000h $L$SEH_info_from_mont_256_prologue:: DB 1,0,5,00bh DB 0,074h,1,0 DB 0,064h,2,0 DB 0,0b3h DB 0,0 DD 0,0 $L$SEH_info_from_mont_256_body:: DB 1,0,17,0 DB 000h,0f4h,001h,000h DB 000h,0e4h,002h,000h DB 000h,0d4h,003h,000h DB 000h,0c4h,004h,000h DB 000h,034h,005h,000h DB 000h,054h,006h,000h DB 000h,074h,008h,000h DB 000h,064h,009h,000h DB 000h,062h DB 000h,000h,000h,000h,000h,000h DB 000h,000h,000h,000h $L$SEH_info_from_mont_256_epilogue:: DB 1,0,4,0 DB 000h,074h,001h,000h DB 000h,064h,002h,000h DB 000h,000h,000h,000h $L$SEH_info_redc_mont_256_prologue:: DB 1,0,5,00bh DB 0,074h,1,0 DB 0,064h,2,0 DB 0,0b3h DB 0,0 DD 0,0 $L$SEH_info_redc_mont_256_body:: DB 1,0,17,0 DB 000h,0f4h,001h,000h DB 000h,0e4h,002h,000h DB 000h,0d4h,003h,000h DB 000h,0c4h,004h,000h DB 000h,034h,005h,000h DB 000h,054h,006h,000h DB 000h,074h,008h,000h DB 000h,064h,009h,000h DB 000h,062h DB 000h,000h,000h,000h,000h,000h DB 000h,000h,000h,000h $L$SEH_info_redc_mont_256_epilogue:: DB 1,0,4,0 DB 000h,074h,001h,000h DB 000h,064h,002h,000h DB 000h,000h,000h,000h .xdata ENDS END ================================================ FILE: build/win64/mulq_mont_384-x86_64.asm ================================================ OPTION DOTNAME EXTERN mul_mont_384x$1:NEAR EXTERN sqr_mont_384x$1:NEAR EXTERN mul_382x$1:NEAR EXTERN sqr_382x$1:NEAR EXTERN mul_384$1:NEAR EXTERN sqr_384$1:NEAR EXTERN redc_mont_384$1:NEAR EXTERN from_mont_384$1:NEAR EXTERN sgn0_pty_mont_384$1:NEAR EXTERN sgn0_pty_mont_384x$1:NEAR EXTERN mul_mont_384$1:NEAR EXTERN sqr_mont_384$1:NEAR EXTERN sqr_n_mul_mont_384$1:NEAR EXTERN sqr_n_mul_mont_383$1:NEAR EXTERN sqr_mont_382x$1:NEAR _DATA SEGMENT COMM __blst_platform_cap:DWORD:1 _DATA ENDS .text$ SEGMENT ALIGN(256) 'CODE' ALIGN 32 __subq_mod_384x384 PROC PRIVATE DB 243,15,30,250 mov r8,QWORD PTR[rsi] mov r9,QWORD PTR[8+rsi] mov r10,QWORD PTR[16+rsi] mov r11,QWORD PTR[24+rsi] mov r12,QWORD PTR[32+rsi] mov r13,QWORD PTR[40+rsi] mov r14,QWORD PTR[48+rsi] sub r8,QWORD PTR[rdx] mov r15,QWORD PTR[56+rsi] sbb r9,QWORD PTR[8+rdx] mov rax,QWORD PTR[64+rsi] sbb r10,QWORD PTR[16+rdx] mov rbx,QWORD PTR[72+rsi] sbb r11,QWORD PTR[24+rdx] mov rbp,QWORD PTR[80+rsi] sbb r12,QWORD PTR[32+rdx] mov rsi,QWORD PTR[88+rsi] sbb r13,QWORD PTR[40+rdx] mov QWORD PTR[rdi],r8 sbb r14,QWORD PTR[48+rdx] mov r8,QWORD PTR[rcx] mov QWORD PTR[8+rdi],r9 sbb r15,QWORD PTR[56+rdx] mov r9,QWORD PTR[8+rcx] mov QWORD PTR[16+rdi],r10 sbb rax,QWORD PTR[64+rdx] mov r10,QWORD PTR[16+rcx] mov QWORD PTR[24+rdi],r11 sbb 
rbx,QWORD PTR[72+rdx] mov r11,QWORD PTR[24+rcx] mov QWORD PTR[32+rdi],r12 sbb rbp,QWORD PTR[80+rdx] mov r12,QWORD PTR[32+rcx] mov QWORD PTR[40+rdi],r13 sbb rsi,QWORD PTR[88+rdx] mov r13,QWORD PTR[40+rcx] sbb rdx,rdx and r8,rdx and r9,rdx and r10,rdx and r11,rdx and r12,rdx and r13,rdx add r14,r8 adc r15,r9 mov QWORD PTR[48+rdi],r14 adc rax,r10 mov QWORD PTR[56+rdi],r15 adc rbx,r11 mov QWORD PTR[64+rdi],rax adc rbp,r12 mov QWORD PTR[72+rdi],rbx adc rsi,r13 mov QWORD PTR[80+rdi],rbp mov QWORD PTR[88+rdi],rsi ifdef __SGX_LVI_HARDENING__ pop rdx lfence jmp rdx ud2 else DB 0F3h,0C3h endif __subq_mod_384x384 ENDP ALIGN 32 __addq_mod_384 PROC PRIVATE DB 243,15,30,250 mov r8,QWORD PTR[rsi] mov r9,QWORD PTR[8+rsi] mov r10,QWORD PTR[16+rsi] mov r11,QWORD PTR[24+rsi] mov r12,QWORD PTR[32+rsi] mov r13,QWORD PTR[40+rsi] add r8,QWORD PTR[rdx] adc r9,QWORD PTR[8+rdx] adc r10,QWORD PTR[16+rdx] mov r14,r8 adc r11,QWORD PTR[24+rdx] mov r15,r9 adc r12,QWORD PTR[32+rdx] mov rax,r10 adc r13,QWORD PTR[40+rdx] mov rbx,r11 sbb rdx,rdx sub r8,QWORD PTR[rcx] sbb r9,QWORD PTR[8+rcx] mov rbp,r12 sbb r10,QWORD PTR[16+rcx] sbb r11,QWORD PTR[24+rcx] sbb r12,QWORD PTR[32+rcx] mov rsi,r13 sbb r13,QWORD PTR[40+rcx] sbb rdx,0 cmovc r8,r14 cmovc r9,r15 cmovc r10,rax mov QWORD PTR[rdi],r8 cmovc r11,rbx mov QWORD PTR[8+rdi],r9 cmovc r12,rbp mov QWORD PTR[16+rdi],r10 cmovc r13,rsi mov QWORD PTR[24+rdi],r11 mov QWORD PTR[32+rdi],r12 mov QWORD PTR[40+rdi],r13 ifdef __SGX_LVI_HARDENING__ pop rdx lfence jmp rdx ud2 else DB 0F3h,0C3h endif __addq_mod_384 ENDP ALIGN 32 __subq_mod_384 PROC PRIVATE DB 243,15,30,250 mov r8,QWORD PTR[rsi] mov r9,QWORD PTR[8+rsi] mov r10,QWORD PTR[16+rsi] mov r11,QWORD PTR[24+rsi] mov r12,QWORD PTR[32+rsi] mov r13,QWORD PTR[40+rsi] __subq_mod_384_a_is_loaded:: sub r8,QWORD PTR[rdx] mov r14,QWORD PTR[rcx] sbb r9,QWORD PTR[8+rdx] mov r15,QWORD PTR[8+rcx] sbb r10,QWORD PTR[16+rdx] mov rax,QWORD PTR[16+rcx] sbb r11,QWORD PTR[24+rdx] mov rbx,QWORD PTR[24+rcx] sbb r12,QWORD PTR[32+rdx] mov rbp,QWORD PTR[32+rcx] sbb r13,QWORD PTR[40+rdx] mov rsi,QWORD PTR[40+rcx] sbb rdx,rdx and r14,rdx and r15,rdx and rax,rdx and rbx,rdx and rbp,rdx and rsi,rdx add r8,r14 adc r9,r15 mov QWORD PTR[rdi],r8 adc r10,rax mov QWORD PTR[8+rdi],r9 adc r11,rbx mov QWORD PTR[16+rdi],r10 adc r12,rbp mov QWORD PTR[24+rdi],r11 adc r13,rsi mov QWORD PTR[32+rdi],r12 mov QWORD PTR[40+rdi],r13 ifdef __SGX_LVI_HARDENING__ pop rdx lfence jmp rdx ud2 else DB 0F3h,0C3h endif __subq_mod_384 ENDP PUBLIC mul_mont_384x ALIGN 32 mul_mont_384x PROC PUBLIC DB 243,15,30,250 mov QWORD PTR[8+rsp],rdi ;WIN64 prologue mov QWORD PTR[16+rsp],rsi mov r11,rsp $L$SEH_begin_mul_mont_384x:: mov rdi,rcx mov rsi,rdx mov rdx,r8 mov rcx,r9 mov r8,QWORD PTR[40+rsp] ifdef __BLST_PORTABLE__ test DWORD PTR[__blst_platform_cap],1 jnz mul_mont_384x$1 endif push rbp push rbx push r12 push r13 push r14 push r15 sub rsp,328 $L$SEH_body_mul_mont_384x:: mov rbx,rdx mov QWORD PTR[32+rsp],rdi mov QWORD PTR[24+rsp],rsi mov QWORD PTR[16+rsp],rdx mov QWORD PTR[8+rsp],rcx mov QWORD PTR[rsp],r8 lea rdi,QWORD PTR[40+rsp] call __mulq_384 lea rbx,QWORD PTR[48+rbx] lea rsi,QWORD PTR[48+rsi] lea rdi,QWORD PTR[((40+96))+rsp] call __mulq_384 mov rcx,QWORD PTR[8+rsp] lea rdx,QWORD PTR[((-48))+rsi] lea rdi,QWORD PTR[((40+192+48))+rsp] call __addq_mod_384 mov rsi,QWORD PTR[16+rsp] lea rdx,QWORD PTR[48+rsi] lea rdi,QWORD PTR[((-48))+rdi] call __addq_mod_384 lea rbx,QWORD PTR[rdi] lea rsi,QWORD PTR[48+rdi] call __mulq_384 lea rsi,QWORD PTR[rdi] lea rdx,QWORD PTR[40+rsp] mov rcx,QWORD PTR[8+rsp] call 
__subq_mod_384x384 lea rsi,QWORD PTR[rdi] lea rdx,QWORD PTR[((-96))+rdi] call __subq_mod_384x384 lea rsi,QWORD PTR[40+rsp] lea rdx,QWORD PTR[((40+96))+rsp] lea rdi,QWORD PTR[40+rsp] call __subq_mod_384x384 mov rbx,rcx lea rsi,QWORD PTR[40+rsp] mov rcx,QWORD PTR[rsp] mov rdi,QWORD PTR[32+rsp] call __mulq_by_1_mont_384 call __redq_tail_mont_384 lea rsi,QWORD PTR[((40+192))+rsp] mov rcx,QWORD PTR[rsp] lea rdi,QWORD PTR[48+rdi] call __mulq_by_1_mont_384 call __redq_tail_mont_384 lea r8,QWORD PTR[328+rsp] mov r15,QWORD PTR[r8] mov r14,QWORD PTR[8+r8] mov r13,QWORD PTR[16+r8] mov r12,QWORD PTR[24+r8] mov rbx,QWORD PTR[32+r8] mov rbp,QWORD PTR[40+r8] lea rsp,QWORD PTR[48+r8] $L$SEH_epilogue_mul_mont_384x:: mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue mov rsi,QWORD PTR[16+rsp] ifdef __SGX_LVI_HARDENING__ pop rdx lfence jmp rdx ud2 else DB 0F3h,0C3h endif $L$SEH_end_mul_mont_384x:: mul_mont_384x ENDP PUBLIC sqr_mont_384x ALIGN 32 sqr_mont_384x PROC PUBLIC DB 243,15,30,250 mov QWORD PTR[8+rsp],rdi ;WIN64 prologue mov QWORD PTR[16+rsp],rsi mov r11,rsp $L$SEH_begin_sqr_mont_384x:: mov rdi,rcx mov rsi,rdx mov rdx,r8 mov rcx,r9 ifdef __BLST_PORTABLE__ test DWORD PTR[__blst_platform_cap],1 jnz sqr_mont_384x$1 endif push rbp push rbx push r12 push r13 push r14 push r15 sub rsp,136 $L$SEH_body_sqr_mont_384x:: mov QWORD PTR[rsp],rcx mov rcx,rdx mov QWORD PTR[8+rsp],rdi mov QWORD PTR[16+rsp],rsi lea rdx,QWORD PTR[48+rsi] lea rdi,QWORD PTR[32+rsp] call __addq_mod_384 mov rsi,QWORD PTR[16+rsp] lea rdx,QWORD PTR[48+rsi] lea rdi,QWORD PTR[((32+48))+rsp] call __subq_mod_384 mov rsi,QWORD PTR[16+rsp] lea rbx,QWORD PTR[48+rsi] mov rax,QWORD PTR[48+rsi] mov r14,QWORD PTR[rsi] mov r15,QWORD PTR[8+rsi] mov r12,QWORD PTR[16+rsi] mov r13,QWORD PTR[24+rsi] call __mulq_mont_384 add r14,r14 adc r15,r15 adc r8,r8 mov r12,r14 adc r9,r9 mov r13,r15 adc r10,r10 mov rax,r8 adc r11,r11 mov rbx,r9 sbb rdx,rdx sub r14,QWORD PTR[rcx] sbb r15,QWORD PTR[8+rcx] mov rbp,r10 sbb r8,QWORD PTR[16+rcx] sbb r9,QWORD PTR[24+rcx] sbb r10,QWORD PTR[32+rcx] mov rsi,r11 sbb r11,QWORD PTR[40+rcx] sbb rdx,0 cmovc r14,r12 cmovc r15,r13 cmovc r8,rax mov QWORD PTR[48+rdi],r14 cmovc r9,rbx mov QWORD PTR[56+rdi],r15 cmovc r10,rbp mov QWORD PTR[64+rdi],r8 cmovc r11,rsi mov QWORD PTR[72+rdi],r9 mov QWORD PTR[80+rdi],r10 mov QWORD PTR[88+rdi],r11 lea rsi,QWORD PTR[32+rsp] lea rbx,QWORD PTR[((32+48))+rsp] mov rax,QWORD PTR[((32+48))+rsp] mov r14,QWORD PTR[((32+0))+rsp] mov r15,QWORD PTR[((32+8))+rsp] mov r12,QWORD PTR[((32+16))+rsp] mov r13,QWORD PTR[((32+24))+rsp] call __mulq_mont_384 lea r8,QWORD PTR[136+rsp] mov r15,QWORD PTR[r8] mov r14,QWORD PTR[8+r8] mov r13,QWORD PTR[16+r8] mov r12,QWORD PTR[24+r8] mov rbx,QWORD PTR[32+r8] mov rbp,QWORD PTR[40+r8] lea rsp,QWORD PTR[48+r8] $L$SEH_epilogue_sqr_mont_384x:: mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue mov rsi,QWORD PTR[16+rsp] ifdef __SGX_LVI_HARDENING__ pop rdx lfence jmp rdx ud2 else DB 0F3h,0C3h endif $L$SEH_end_sqr_mont_384x:: sqr_mont_384x ENDP PUBLIC mul_382x ALIGN 32 mul_382x PROC PUBLIC DB 243,15,30,250 mov QWORD PTR[8+rsp],rdi ;WIN64 prologue mov QWORD PTR[16+rsp],rsi mov r11,rsp $L$SEH_begin_mul_382x:: mov rdi,rcx mov rsi,rdx mov rdx,r8 mov rcx,r9 ifdef __BLST_PORTABLE__ test DWORD PTR[__blst_platform_cap],1 jnz mul_382x$1 endif push rbp push rbx push r12 push r13 push r14 push r15 sub rsp,136 $L$SEH_body_mul_382x:: lea rdi,QWORD PTR[96+rdi] mov QWORD PTR[rsp],rsi mov QWORD PTR[8+rsp],rdx mov QWORD PTR[16+rsp],rdi mov QWORD PTR[24+rsp],rcx mov r8,QWORD PTR[rsi] mov r9,QWORD PTR[8+rsi] mov r10,QWORD 
PTR[16+rsi] mov r11,QWORD PTR[24+rsi] mov r12,QWORD PTR[32+rsi] mov r13,QWORD PTR[40+rsi] add r8,QWORD PTR[48+rsi] adc r9,QWORD PTR[56+rsi] adc r10,QWORD PTR[64+rsi] adc r11,QWORD PTR[72+rsi] adc r12,QWORD PTR[80+rsi] adc r13,QWORD PTR[88+rsi] mov QWORD PTR[((32+0))+rsp],r8 mov QWORD PTR[((32+8))+rsp],r9 mov QWORD PTR[((32+16))+rsp],r10 mov QWORD PTR[((32+24))+rsp],r11 mov QWORD PTR[((32+32))+rsp],r12 mov QWORD PTR[((32+40))+rsp],r13 mov r8,QWORD PTR[rdx] mov r9,QWORD PTR[8+rdx] mov r10,QWORD PTR[16+rdx] mov r11,QWORD PTR[24+rdx] mov r12,QWORD PTR[32+rdx] mov r13,QWORD PTR[40+rdx] add r8,QWORD PTR[48+rdx] adc r9,QWORD PTR[56+rdx] adc r10,QWORD PTR[64+rdx] adc r11,QWORD PTR[72+rdx] adc r12,QWORD PTR[80+rdx] adc r13,QWORD PTR[88+rdx] mov QWORD PTR[((32+48))+rsp],r8 mov QWORD PTR[((32+56))+rsp],r9 mov QWORD PTR[((32+64))+rsp],r10 mov QWORD PTR[((32+72))+rsp],r11 mov QWORD PTR[((32+80))+rsp],r12 mov QWORD PTR[((32+88))+rsp],r13 lea rsi,QWORD PTR[((32+0))+rsp] lea rbx,QWORD PTR[((32+48))+rsp] call __mulq_384 mov rsi,QWORD PTR[rsp] mov rbx,QWORD PTR[8+rsp] lea rdi,QWORD PTR[((-96))+rdi] call __mulq_384 lea rsi,QWORD PTR[48+rsi] lea rbx,QWORD PTR[48+rbx] lea rdi,QWORD PTR[32+rsp] call __mulq_384 mov rsi,QWORD PTR[16+rsp] lea rdx,QWORD PTR[32+rsp] mov rcx,QWORD PTR[24+rsp] mov rdi,rsi call __subq_mod_384x384 lea rsi,QWORD PTR[rdi] lea rdx,QWORD PTR[((-96))+rdi] call __subq_mod_384x384 lea rsi,QWORD PTR[((-96))+rdi] lea rdx,QWORD PTR[32+rsp] lea rdi,QWORD PTR[((-96))+rdi] call __subq_mod_384x384 lea r8,QWORD PTR[136+rsp] mov r15,QWORD PTR[r8] mov r14,QWORD PTR[8+r8] mov r13,QWORD PTR[16+r8] mov r12,QWORD PTR[24+r8] mov rbx,QWORD PTR[32+r8] mov rbp,QWORD PTR[40+r8] lea rsp,QWORD PTR[48+r8] $L$SEH_epilogue_mul_382x:: mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue mov rsi,QWORD PTR[16+rsp] ifdef __SGX_LVI_HARDENING__ pop rdx lfence jmp rdx ud2 else DB 0F3h,0C3h endif $L$SEH_end_mul_382x:: mul_382x ENDP PUBLIC sqr_382x ALIGN 32 sqr_382x PROC PUBLIC DB 243,15,30,250 mov QWORD PTR[8+rsp],rdi ;WIN64 prologue mov QWORD PTR[16+rsp],rsi mov r11,rsp $L$SEH_begin_sqr_382x:: mov rdi,rcx mov rsi,rdx mov rdx,r8 ifdef __BLST_PORTABLE__ test DWORD PTR[__blst_platform_cap],1 jnz sqr_382x$1 endif push rbp push rbx push r12 push r13 push r14 push r15 push rsi $L$SEH_body_sqr_382x:: mov rcx,rdx mov r14,QWORD PTR[rsi] mov r15,QWORD PTR[8+rsi] mov rax,QWORD PTR[16+rsi] mov rbx,QWORD PTR[24+rsi] mov rbp,QWORD PTR[32+rsi] mov rdx,QWORD PTR[40+rsi] mov r8,r14 add r14,QWORD PTR[48+rsi] mov r9,r15 adc r15,QWORD PTR[56+rsi] mov r10,rax adc rax,QWORD PTR[64+rsi] mov r11,rbx adc rbx,QWORD PTR[72+rsi] mov r12,rbp adc rbp,QWORD PTR[80+rsi] mov r13,rdx adc rdx,QWORD PTR[88+rsi] mov QWORD PTR[rdi],r14 mov QWORD PTR[8+rdi],r15 mov QWORD PTR[16+rdi],rax mov QWORD PTR[24+rdi],rbx mov QWORD PTR[32+rdi],rbp mov QWORD PTR[40+rdi],rdx lea rdx,QWORD PTR[48+rsi] lea rdi,QWORD PTR[48+rdi] call __subq_mod_384_a_is_loaded lea rsi,QWORD PTR[rdi] lea rbx,QWORD PTR[((-48))+rdi] lea rdi,QWORD PTR[((-48))+rdi] call __mulq_384 mov rsi,QWORD PTR[rsp] lea rbx,QWORD PTR[48+rsi] lea rdi,QWORD PTR[96+rdi] call __mulq_384 mov r8,QWORD PTR[rdi] mov r9,QWORD PTR[8+rdi] mov r10,QWORD PTR[16+rdi] mov r11,QWORD PTR[24+rdi] mov r12,QWORD PTR[32+rdi] mov r13,QWORD PTR[40+rdi] mov r14,QWORD PTR[48+rdi] mov r15,QWORD PTR[56+rdi] mov rax,QWORD PTR[64+rdi] mov rbx,QWORD PTR[72+rdi] mov rbp,QWORD PTR[80+rdi] add r8,r8 mov rdx,QWORD PTR[88+rdi] adc r9,r9 mov QWORD PTR[rdi],r8 adc r10,r10 mov QWORD PTR[8+rdi],r9 adc r11,r11 mov QWORD PTR[16+rdi],r10 adc r12,r12 mov QWORD 
PTR[24+rdi],r11 adc r13,r13 mov QWORD PTR[32+rdi],r12 adc r14,r14 mov QWORD PTR[40+rdi],r13 adc r15,r15 mov QWORD PTR[48+rdi],r14 adc rax,rax mov QWORD PTR[56+rdi],r15 adc rbx,rbx mov QWORD PTR[64+rdi],rax adc rbp,rbp mov QWORD PTR[72+rdi],rbx adc rdx,rdx mov QWORD PTR[80+rdi],rbp mov QWORD PTR[88+rdi],rdx mov r15,QWORD PTR[8+rsp] mov r14,QWORD PTR[16+rsp] mov r13,QWORD PTR[24+rsp] mov r12,QWORD PTR[32+rsp] mov rbx,QWORD PTR[40+rsp] mov rbp,QWORD PTR[48+rsp] lea rsp,QWORD PTR[56+rsp] $L$SEH_epilogue_sqr_382x:: mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue mov rsi,QWORD PTR[16+rsp] ifdef __SGX_LVI_HARDENING__ pop rdx lfence jmp rdx ud2 else DB 0F3h,0C3h endif $L$SEH_end_sqr_382x:: sqr_382x ENDP PUBLIC mul_384 ALIGN 32 mul_384 PROC PUBLIC DB 243,15,30,250 mov QWORD PTR[8+rsp],rdi ;WIN64 prologue mov QWORD PTR[16+rsp],rsi mov r11,rsp $L$SEH_begin_mul_384:: mov rdi,rcx mov rsi,rdx mov rdx,r8 ifdef __BLST_PORTABLE__ test DWORD PTR[__blst_platform_cap],1 jnz mul_384$1 endif push rbp push rbx push r12 $L$SEH_body_mul_384:: mov rbx,rdx call __mulq_384 mov r12,QWORD PTR[rsp] mov rbx,QWORD PTR[8+rsp] mov rbp,QWORD PTR[16+rsp] lea rsp,QWORD PTR[24+rsp] $L$SEH_epilogue_mul_384:: mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue mov rsi,QWORD PTR[16+rsp] ifdef __SGX_LVI_HARDENING__ pop rdx lfence jmp rdx ud2 else DB 0F3h,0C3h endif $L$SEH_end_mul_384:: mul_384 ENDP ALIGN 32 __mulq_384 PROC PRIVATE DB 243,15,30,250 mov rax,QWORD PTR[rbx] mov rbp,rax mul QWORD PTR[rsi] mov QWORD PTR[rdi],rax mov rax,rbp mov rcx,rdx mul QWORD PTR[8+rsi] add rcx,rax mov rax,rbp adc rdx,0 mov r8,rdx mul QWORD PTR[16+rsi] add r8,rax mov rax,rbp adc rdx,0 mov r9,rdx mul QWORD PTR[24+rsi] add r9,rax mov rax,rbp adc rdx,0 mov r10,rdx mul QWORD PTR[32+rsi] add r10,rax mov rax,rbp adc rdx,0 mov r11,rdx mul QWORD PTR[40+rsi] add r11,rax mov rax,QWORD PTR[8+rbx] adc rdx,0 mov r12,rdx mov rbp,rax mul QWORD PTR[rsi] add rcx,rax mov rax,rbp adc rdx,0 mov QWORD PTR[8+rdi],rcx mov rcx,rdx mul QWORD PTR[8+rsi] add r8,rax mov rax,rbp adc rdx,0 add rcx,r8 adc rdx,0 mov r8,rdx mul QWORD PTR[16+rsi] add r9,rax mov rax,rbp adc rdx,0 add r8,r9 adc rdx,0 mov r9,rdx mul QWORD PTR[24+rsi] add r10,rax mov rax,rbp adc rdx,0 add r9,r10 adc rdx,0 mov r10,rdx mul QWORD PTR[32+rsi] add r11,rax mov rax,rbp adc rdx,0 add r10,r11 adc rdx,0 mov r11,rdx mul QWORD PTR[40+rsi] add r12,rax mov rax,QWORD PTR[16+rbx] adc rdx,0 add r11,r12 adc rdx,0 mov r12,rdx mov rbp,rax mul QWORD PTR[rsi] add rcx,rax mov rax,rbp adc rdx,0 mov QWORD PTR[16+rdi],rcx mov rcx,rdx mul QWORD PTR[8+rsi] add r8,rax mov rax,rbp adc rdx,0 add rcx,r8 adc rdx,0 mov r8,rdx mul QWORD PTR[16+rsi] add r9,rax mov rax,rbp adc rdx,0 add r8,r9 adc rdx,0 mov r9,rdx mul QWORD PTR[24+rsi] add r10,rax mov rax,rbp adc rdx,0 add r9,r10 adc rdx,0 mov r10,rdx mul QWORD PTR[32+rsi] add r11,rax mov rax,rbp adc rdx,0 add r10,r11 adc rdx,0 mov r11,rdx mul QWORD PTR[40+rsi] add r12,rax mov rax,QWORD PTR[24+rbx] adc rdx,0 add r11,r12 adc rdx,0 mov r12,rdx mov rbp,rax mul QWORD PTR[rsi] add rcx,rax mov rax,rbp adc rdx,0 mov QWORD PTR[24+rdi],rcx mov rcx,rdx mul QWORD PTR[8+rsi] add r8,rax mov rax,rbp adc rdx,0 add rcx,r8 adc rdx,0 mov r8,rdx mul QWORD PTR[16+rsi] add r9,rax mov rax,rbp adc rdx,0 add r8,r9 adc rdx,0 mov r9,rdx mul QWORD PTR[24+rsi] add r10,rax mov rax,rbp adc rdx,0 add r9,r10 adc rdx,0 mov r10,rdx mul QWORD PTR[32+rsi] add r11,rax mov rax,rbp adc rdx,0 add r10,r11 adc rdx,0 mov r11,rdx mul QWORD PTR[40+rsi] add r12,rax mov rax,QWORD PTR[32+rbx] adc rdx,0 add r11,r12 adc rdx,0 mov r12,rdx mov rbp,rax mul QWORD 
PTR[rsi] add rcx,rax mov rax,rbp adc rdx,0 mov QWORD PTR[32+rdi],rcx mov rcx,rdx mul QWORD PTR[8+rsi] add r8,rax mov rax,rbp adc rdx,0 add rcx,r8 adc rdx,0 mov r8,rdx mul QWORD PTR[16+rsi] add r9,rax mov rax,rbp adc rdx,0 add r8,r9 adc rdx,0 mov r9,rdx mul QWORD PTR[24+rsi] add r10,rax mov rax,rbp adc rdx,0 add r9,r10 adc rdx,0 mov r10,rdx mul QWORD PTR[32+rsi] add r11,rax mov rax,rbp adc rdx,0 add r10,r11 adc rdx,0 mov r11,rdx mul QWORD PTR[40+rsi] add r12,rax mov rax,QWORD PTR[40+rbx] adc rdx,0 add r11,r12 adc rdx,0 mov r12,rdx mov rbp,rax mul QWORD PTR[rsi] add rcx,rax mov rax,rbp adc rdx,0 mov QWORD PTR[40+rdi],rcx mov rcx,rdx mul QWORD PTR[8+rsi] add r8,rax mov rax,rbp adc rdx,0 add rcx,r8 adc rdx,0 mov r8,rdx mul QWORD PTR[16+rsi] add r9,rax mov rax,rbp adc rdx,0 add r8,r9 adc rdx,0 mov r9,rdx mul QWORD PTR[24+rsi] add r10,rax mov rax,rbp adc rdx,0 add r9,r10 adc rdx,0 mov r10,rdx mul QWORD PTR[32+rsi] add r11,rax mov rax,rbp adc rdx,0 add r10,r11 adc rdx,0 mov r11,rdx mul QWORD PTR[40+rsi] add r12,rax mov rax,rax adc rdx,0 add r11,r12 adc rdx,0 mov r12,rdx mov QWORD PTR[48+rdi],rcx mov QWORD PTR[56+rdi],r8 mov QWORD PTR[64+rdi],r9 mov QWORD PTR[72+rdi],r10 mov QWORD PTR[80+rdi],r11 mov QWORD PTR[88+rdi],r12 ifdef __SGX_LVI_HARDENING__ pop rdx lfence jmp rdx ud2 else DB 0F3h,0C3h endif __mulq_384 ENDP PUBLIC sqr_384 ALIGN 32 sqr_384 PROC PUBLIC DB 243,15,30,250 mov QWORD PTR[8+rsp],rdi ;WIN64 prologue mov QWORD PTR[16+rsp],rsi mov r11,rsp $L$SEH_begin_sqr_384:: mov rdi,rcx mov rsi,rdx ifdef __BLST_PORTABLE__ test DWORD PTR[__blst_platform_cap],1 jnz sqr_384$1 endif push rbp push rbx push r12 push r13 push r14 push r15 sub rsp,8 $L$SEH_body_sqr_384:: call __sqrq_384 mov r15,QWORD PTR[8+rsp] mov r14,QWORD PTR[16+rsp] mov r13,QWORD PTR[24+rsp] mov r12,QWORD PTR[32+rsp] mov rbx,QWORD PTR[40+rsp] mov rbp,QWORD PTR[48+rsp] lea rsp,QWORD PTR[56+rsp] $L$SEH_epilogue_sqr_384:: mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue mov rsi,QWORD PTR[16+rsp] ifdef __SGX_LVI_HARDENING__ pop rdx lfence jmp rdx ud2 else DB 0F3h,0C3h endif $L$SEH_end_sqr_384:: sqr_384 ENDP ALIGN 32 __sqrq_384 PROC PRIVATE DB 243,15,30,250 mov rax,QWORD PTR[rsi] mov r15,QWORD PTR[8+rsi] mov rcx,QWORD PTR[16+rsi] mov rbx,QWORD PTR[24+rsi] mov r14,rax mul r15 mov r9,rax mov rax,r14 mov rbp,QWORD PTR[32+rsi] mov r10,rdx mul rcx add r10,rax mov rax,r14 adc rdx,0 mov rsi,QWORD PTR[40+rsi] mov r11,rdx mul rbx add r11,rax mov rax,r14 adc rdx,0 mov r12,rdx mul rbp add r12,rax mov rax,r14 adc rdx,0 mov r13,rdx mul rsi add r13,rax mov rax,r14 adc rdx,0 mov r14,rdx mul rax xor r8,r8 mov QWORD PTR[rdi],rax mov rax,r15 add r9,r9 adc r8,0 add r9,rdx adc r8,0 mov QWORD PTR[8+rdi],r9 mul rcx add r11,rax mov rax,r15 adc rdx,0 mov r9,rdx mul rbx add r12,rax mov rax,r15 adc rdx,0 add r12,r9 adc rdx,0 mov r9,rdx mul rbp add r13,rax mov rax,r15 adc rdx,0 add r13,r9 adc rdx,0 mov r9,rdx mul rsi add r14,rax mov rax,r15 adc rdx,0 add r14,r9 adc rdx,0 mov r15,rdx mul rax xor r9,r9 add r8,rax mov rax,rcx add r10,r10 adc r11,r11 adc r9,0 add r10,r8 adc r11,rdx adc r9,0 mov QWORD PTR[16+rdi],r10 mul rbx add r13,rax mov rax,rcx adc rdx,0 mov QWORD PTR[24+rdi],r11 mov r8,rdx mul rbp add r14,rax mov rax,rcx adc rdx,0 add r14,r8 adc rdx,0 mov r8,rdx mul rsi add r15,rax mov rax,rcx adc rdx,0 add r15,r8 adc rdx,0 mov rcx,rdx mul rax xor r11,r11 add r9,rax mov rax,rbx add r12,r12 adc r13,r13 adc r11,0 add r12,r9 adc r13,rdx adc r11,0 mov QWORD PTR[32+rdi],r12 mul rbp add r15,rax mov rax,rbx adc rdx,0 mov QWORD PTR[40+rdi],r13 mov r8,rdx mul rsi add rcx,rax mov 
rax,rbx adc rdx,0 add rcx,r8 adc rdx,0 mov rbx,rdx mul rax xor r12,r12 add r11,rax mov rax,rbp add r14,r14 adc r15,r15 adc r12,0 add r14,r11 adc r15,rdx mov QWORD PTR[48+rdi],r14 adc r12,0 mov QWORD PTR[56+rdi],r15 mul rsi add rbx,rax mov rax,rbp adc rdx,0 mov rbp,rdx mul rax xor r13,r13 add r12,rax mov rax,rsi add rcx,rcx adc rbx,rbx adc r13,0 add rcx,r12 adc rbx,rdx mov QWORD PTR[64+rdi],rcx adc r13,0 mov QWORD PTR[72+rdi],rbx mul rax add rax,r13 add rbp,rbp adc rdx,0 add rax,rbp adc rdx,0 mov QWORD PTR[80+rdi],rax mov QWORD PTR[88+rdi],rdx ifdef __SGX_LVI_HARDENING__ pop rdx lfence jmp rdx ud2 else DB 0F3h,0C3h endif __sqrq_384 ENDP PUBLIC sqr_mont_384 ALIGN 32 sqr_mont_384 PROC PUBLIC DB 243,15,30,250 mov QWORD PTR[8+rsp],rdi ;WIN64 prologue mov QWORD PTR[16+rsp],rsi mov r11,rsp $L$SEH_begin_sqr_mont_384:: mov rdi,rcx mov rsi,rdx mov rdx,r8 mov rcx,r9 ifdef __BLST_PORTABLE__ test DWORD PTR[__blst_platform_cap],1 jnz sqr_mont_384$1 endif push rbp push rbx push r12 push r13 push r14 push r15 sub rsp,8*15 $L$SEH_body_sqr_mont_384:: mov QWORD PTR[96+rsp],rcx mov QWORD PTR[104+rsp],rdx mov QWORD PTR[112+rsp],rdi mov rdi,rsp call __sqrq_384 lea rsi,QWORD PTR[rsp] mov rcx,QWORD PTR[96+rsp] mov rbx,QWORD PTR[104+rsp] mov rdi,QWORD PTR[112+rsp] call __mulq_by_1_mont_384 call __redq_tail_mont_384 lea r8,QWORD PTR[120+rsp] mov r15,QWORD PTR[120+rsp] mov r14,QWORD PTR[8+r8] mov r13,QWORD PTR[16+r8] mov r12,QWORD PTR[24+r8] mov rbx,QWORD PTR[32+r8] mov rbp,QWORD PTR[40+r8] lea rsp,QWORD PTR[48+r8] $L$SEH_epilogue_sqr_mont_384:: mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue mov rsi,QWORD PTR[16+rsp] ifdef __SGX_LVI_HARDENING__ pop rdx lfence jmp rdx ud2 else DB 0F3h,0C3h endif $L$SEH_end_sqr_mont_384:: sqr_mont_384 ENDP PUBLIC redc_mont_384 ALIGN 32 redc_mont_384 PROC PUBLIC DB 243,15,30,250 mov QWORD PTR[8+rsp],rdi ;WIN64 prologue mov QWORD PTR[16+rsp],rsi mov r11,rsp $L$SEH_begin_redc_mont_384:: mov rdi,rcx mov rsi,rdx mov rdx,r8 mov rcx,r9 ifdef __BLST_PORTABLE__ test DWORD PTR[__blst_platform_cap],1 jnz redc_mont_384$1 endif push rbp push rbx push r12 push r13 push r14 push r15 sub rsp,8 $L$SEH_body_redc_mont_384:: mov rbx,rdx call __mulq_by_1_mont_384 call __redq_tail_mont_384 mov r15,QWORD PTR[8+rsp] mov r14,QWORD PTR[16+rsp] mov r13,QWORD PTR[24+rsp] mov r12,QWORD PTR[32+rsp] mov rbx,QWORD PTR[40+rsp] mov rbp,QWORD PTR[48+rsp] lea rsp,QWORD PTR[56+rsp] $L$SEH_epilogue_redc_mont_384:: mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue mov rsi,QWORD PTR[16+rsp] ifdef __SGX_LVI_HARDENING__ pop rdx lfence jmp rdx ud2 else DB 0F3h,0C3h endif $L$SEH_end_redc_mont_384:: redc_mont_384 ENDP PUBLIC from_mont_384 ALIGN 32 from_mont_384 PROC PUBLIC DB 243,15,30,250 mov QWORD PTR[8+rsp],rdi ;WIN64 prologue mov QWORD PTR[16+rsp],rsi mov r11,rsp $L$SEH_begin_from_mont_384:: mov rdi,rcx mov rsi,rdx mov rdx,r8 mov rcx,r9 ifdef __BLST_PORTABLE__ test DWORD PTR[__blst_platform_cap],1 jnz from_mont_384$1 endif push rbp push rbx push r12 push r13 push r14 push r15 sub rsp,8 $L$SEH_body_from_mont_384:: mov rbx,rdx call __mulq_by_1_mont_384 mov rcx,r15 mov rdx,r8 mov rbp,r9 sub r14,QWORD PTR[rbx] sbb r15,QWORD PTR[8+rbx] mov r13,r10 sbb r8,QWORD PTR[16+rbx] sbb r9,QWORD PTR[24+rbx] sbb r10,QWORD PTR[32+rbx] mov rsi,r11 sbb r11,QWORD PTR[40+rbx] cmovc r14,rax cmovc r15,rcx cmovc r8,rdx mov QWORD PTR[rdi],r14 cmovc r9,rbp mov QWORD PTR[8+rdi],r15 cmovc r10,r13 mov QWORD PTR[16+rdi],r8 cmovc r11,rsi mov QWORD PTR[24+rdi],r9 mov QWORD PTR[32+rdi],r10 mov QWORD PTR[40+rdi],r11 mov r15,QWORD PTR[8+rsp] mov r14,QWORD PTR[16+rsp] mov 
r13,QWORD PTR[24+rsp] mov r12,QWORD PTR[32+rsp] mov rbx,QWORD PTR[40+rsp] mov rbp,QWORD PTR[48+rsp] lea rsp,QWORD PTR[56+rsp] $L$SEH_epilogue_from_mont_384:: mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue mov rsi,QWORD PTR[16+rsp] ifdef __SGX_LVI_HARDENING__ pop rdx lfence jmp rdx ud2 else DB 0F3h,0C3h endif $L$SEH_end_from_mont_384:: from_mont_384 ENDP ALIGN 32 __mulq_by_1_mont_384 PROC PRIVATE DB 243,15,30,250 mov rax,QWORD PTR[rsi] mov r9,QWORD PTR[8+rsi] mov r10,QWORD PTR[16+rsi] mov r11,QWORD PTR[24+rsi] mov r12,QWORD PTR[32+rsi] mov r13,QWORD PTR[40+rsi] mov r14,rax imul rax,rcx mov r8,rax mul QWORD PTR[rbx] add r14,rax mov rax,r8 adc r14,rdx mul QWORD PTR[8+rbx] add r9,rax mov rax,r8 adc rdx,0 add r9,r14 adc rdx,0 mov r14,rdx mul QWORD PTR[16+rbx] add r10,rax mov rax,r8 adc rdx,0 add r10,r14 adc rdx,0 mov r14,rdx mul QWORD PTR[24+rbx] add r11,rax mov rax,r8 adc rdx,0 mov r15,r9 imul r9,rcx add r11,r14 adc rdx,0 mov r14,rdx mul QWORD PTR[32+rbx] add r12,rax mov rax,r8 adc rdx,0 add r12,r14 adc rdx,0 mov r14,rdx mul QWORD PTR[40+rbx] add r13,rax mov rax,r9 adc rdx,0 add r13,r14 adc rdx,0 mov r14,rdx mul QWORD PTR[rbx] add r15,rax mov rax,r9 adc r15,rdx mul QWORD PTR[8+rbx] add r10,rax mov rax,r9 adc rdx,0 add r10,r15 adc rdx,0 mov r15,rdx mul QWORD PTR[16+rbx] add r11,rax mov rax,r9 adc rdx,0 add r11,r15 adc rdx,0 mov r15,rdx mul QWORD PTR[24+rbx] add r12,rax mov rax,r9 adc rdx,0 mov r8,r10 imul r10,rcx add r12,r15 adc rdx,0 mov r15,rdx mul QWORD PTR[32+rbx] add r13,rax mov rax,r9 adc rdx,0 add r13,r15 adc rdx,0 mov r15,rdx mul QWORD PTR[40+rbx] add r14,rax mov rax,r10 adc rdx,0 add r14,r15 adc rdx,0 mov r15,rdx mul QWORD PTR[rbx] add r8,rax mov rax,r10 adc r8,rdx mul QWORD PTR[8+rbx] add r11,rax mov rax,r10 adc rdx,0 add r11,r8 adc rdx,0 mov r8,rdx mul QWORD PTR[16+rbx] add r12,rax mov rax,r10 adc rdx,0 add r12,r8 adc rdx,0 mov r8,rdx mul QWORD PTR[24+rbx] add r13,rax mov rax,r10 adc rdx,0 mov r9,r11 imul r11,rcx add r13,r8 adc rdx,0 mov r8,rdx mul QWORD PTR[32+rbx] add r14,rax mov rax,r10 adc rdx,0 add r14,r8 adc rdx,0 mov r8,rdx mul QWORD PTR[40+rbx] add r15,rax mov rax,r11 adc rdx,0 add r15,r8 adc rdx,0 mov r8,rdx mul QWORD PTR[rbx] add r9,rax mov rax,r11 adc r9,rdx mul QWORD PTR[8+rbx] add r12,rax mov rax,r11 adc rdx,0 add r12,r9 adc rdx,0 mov r9,rdx mul QWORD PTR[16+rbx] add r13,rax mov rax,r11 adc rdx,0 add r13,r9 adc rdx,0 mov r9,rdx mul QWORD PTR[24+rbx] add r14,rax mov rax,r11 adc rdx,0 mov r10,r12 imul r12,rcx add r14,r9 adc rdx,0 mov r9,rdx mul QWORD PTR[32+rbx] add r15,rax mov rax,r11 adc rdx,0 add r15,r9 adc rdx,0 mov r9,rdx mul QWORD PTR[40+rbx] add r8,rax mov rax,r12 adc rdx,0 add r8,r9 adc rdx,0 mov r9,rdx mul QWORD PTR[rbx] add r10,rax mov rax,r12 adc r10,rdx mul QWORD PTR[8+rbx] add r13,rax mov rax,r12 adc rdx,0 add r13,r10 adc rdx,0 mov r10,rdx mul QWORD PTR[16+rbx] add r14,rax mov rax,r12 adc rdx,0 add r14,r10 adc rdx,0 mov r10,rdx mul QWORD PTR[24+rbx] add r15,rax mov rax,r12 adc rdx,0 mov r11,r13 imul r13,rcx add r15,r10 adc rdx,0 mov r10,rdx mul QWORD PTR[32+rbx] add r8,rax mov rax,r12 adc rdx,0 add r8,r10 adc rdx,0 mov r10,rdx mul QWORD PTR[40+rbx] add r9,rax mov rax,r13 adc rdx,0 add r9,r10 adc rdx,0 mov r10,rdx mul QWORD PTR[rbx] add r11,rax mov rax,r13 adc r11,rdx mul QWORD PTR[8+rbx] add r14,rax mov rax,r13 adc rdx,0 add r14,r11 adc rdx,0 mov r11,rdx mul QWORD PTR[16+rbx] add r15,rax mov rax,r13 adc rdx,0 add r15,r11 adc rdx,0 mov r11,rdx mul QWORD PTR[24+rbx] add r8,rax mov rax,r13 adc rdx,0 add r8,r11 adc rdx,0 mov r11,rdx mul QWORD PTR[32+rbx] add r9,rax mov 
rax,r13 adc rdx,0 add r9,r11 adc rdx,0 mov r11,rdx mul QWORD PTR[40+rbx] add r10,rax mov rax,r14 adc rdx,0 add r10,r11 adc rdx,0 mov r11,rdx ifdef __SGX_LVI_HARDENING__ pop rdx lfence jmp rdx ud2 else DB 0F3h,0C3h endif __mulq_by_1_mont_384 ENDP ALIGN 32 __redq_tail_mont_384 PROC PRIVATE DB 243,15,30,250 add r14,QWORD PTR[48+rsi] mov rax,r14 adc r15,QWORD PTR[56+rsi] adc r8,QWORD PTR[64+rsi] adc r9,QWORD PTR[72+rsi] mov rcx,r15 adc r10,QWORD PTR[80+rsi] adc r11,QWORD PTR[88+rsi] sbb r12,r12 mov rdx,r8 mov rbp,r9 sub r14,QWORD PTR[rbx] sbb r15,QWORD PTR[8+rbx] mov r13,r10 sbb r8,QWORD PTR[16+rbx] sbb r9,QWORD PTR[24+rbx] sbb r10,QWORD PTR[32+rbx] mov rsi,r11 sbb r11,QWORD PTR[40+rbx] sbb r12,0 cmovc r14,rax cmovc r15,rcx cmovc r8,rdx mov QWORD PTR[rdi],r14 cmovc r9,rbp mov QWORD PTR[8+rdi],r15 cmovc r10,r13 mov QWORD PTR[16+rdi],r8 cmovc r11,rsi mov QWORD PTR[24+rdi],r9 mov QWORD PTR[32+rdi],r10 mov QWORD PTR[40+rdi],r11 ifdef __SGX_LVI_HARDENING__ pop rdx lfence jmp rdx ud2 else DB 0F3h,0C3h endif __redq_tail_mont_384 ENDP PUBLIC sgn0_pty_mont_384 ALIGN 32 sgn0_pty_mont_384 PROC PUBLIC DB 243,15,30,250 mov QWORD PTR[8+rsp],rdi ;WIN64 prologue mov QWORD PTR[16+rsp],rsi mov r11,rsp $L$SEH_begin_sgn0_pty_mont_384:: mov rdi,rcx mov rsi,rdx mov rdx,r8 ifdef __BLST_PORTABLE__ test DWORD PTR[__blst_platform_cap],1 jnz sgn0_pty_mont_384$1 endif push rbp push rbx push r12 push r13 push r14 push r15 sub rsp,8 $L$SEH_body_sgn0_pty_mont_384:: mov rbx,rsi lea rsi,QWORD PTR[rdi] mov rcx,rdx call __mulq_by_1_mont_384 xor rax,rax mov r13,r14 add r14,r14 adc r15,r15 adc r8,r8 adc r9,r9 adc r10,r10 adc r11,r11 adc rax,0 sub r14,QWORD PTR[rbx] sbb r15,QWORD PTR[8+rbx] sbb r8,QWORD PTR[16+rbx] sbb r9,QWORD PTR[24+rbx] sbb r10,QWORD PTR[32+rbx] sbb r11,QWORD PTR[40+rbx] sbb rax,0 not rax and r13,1 and rax,2 or rax,r13 mov r15,QWORD PTR[8+rsp] mov r14,QWORD PTR[16+rsp] mov r13,QWORD PTR[24+rsp] mov r12,QWORD PTR[32+rsp] mov rbx,QWORD PTR[40+rsp] mov rbp,QWORD PTR[48+rsp] lea rsp,QWORD PTR[56+rsp] $L$SEH_epilogue_sgn0_pty_mont_384:: mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue mov rsi,QWORD PTR[16+rsp] ifdef __SGX_LVI_HARDENING__ pop rdx lfence jmp rdx ud2 else DB 0F3h,0C3h endif $L$SEH_end_sgn0_pty_mont_384:: sgn0_pty_mont_384 ENDP PUBLIC sgn0_pty_mont_384x ALIGN 32 sgn0_pty_mont_384x PROC PUBLIC DB 243,15,30,250 mov QWORD PTR[8+rsp],rdi ;WIN64 prologue mov QWORD PTR[16+rsp],rsi mov r11,rsp $L$SEH_begin_sgn0_pty_mont_384x:: mov rdi,rcx mov rsi,rdx mov rdx,r8 ifdef __BLST_PORTABLE__ test DWORD PTR[__blst_platform_cap],1 jnz sgn0_pty_mont_384x$1 endif push rbp push rbx push r12 push r13 push r14 push r15 sub rsp,8 $L$SEH_body_sgn0_pty_mont_384x:: mov rbx,rsi lea rsi,QWORD PTR[48+rdi] mov rcx,rdx call __mulq_by_1_mont_384 mov r12,r14 or r14,r15 or r14,r8 or r14,r9 or r14,r10 or r14,r11 lea rsi,QWORD PTR[rdi] xor rdi,rdi mov r13,r12 add r12,r12 adc r15,r15 adc r8,r8 adc r9,r9 adc r10,r10 adc r11,r11 adc rdi,0 sub r12,QWORD PTR[rbx] sbb r15,QWORD PTR[8+rbx] sbb r8,QWORD PTR[16+rbx] sbb r9,QWORD PTR[24+rbx] sbb r10,QWORD PTR[32+rbx] sbb r11,QWORD PTR[40+rbx] sbb rdi,0 mov QWORD PTR[rsp],r14 not rdi and r13,1 and rdi,2 or rdi,r13 call __mulq_by_1_mont_384 mov r12,r14 or r14,r15 or r14,r8 or r14,r9 or r14,r10 or r14,r11 xor rax,rax mov r13,r12 add r12,r12 adc r15,r15 adc r8,r8 adc r9,r9 adc r10,r10 adc r11,r11 adc rax,0 sub r12,QWORD PTR[rbx] sbb r15,QWORD PTR[8+rbx] sbb r8,QWORD PTR[16+rbx] sbb r9,QWORD PTR[24+rbx] sbb r10,QWORD PTR[32+rbx] sbb r11,QWORD PTR[40+rbx] sbb rax,0 mov r12,QWORD PTR[rsp] not rax test r14,r14 
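; Combine the per-component results: rdi carries the sign/parity bits of the
; element's second half (offset 48), rax/r13 those of the first half, while
; r14/r12 hold the OR of each half's limbs. The cmov pair below keeps the
; first half's parity unless that half is zero, and adopts the second half's
; sign whenever the second half is non-zero -- consistent with the
; lexicographic sign convention used for Fp2 elements.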
cmovz r13,rdi test r12,r12 cmovnz rax,rdi and r13,1 and rax,2 or rax,r13 mov r15,QWORD PTR[8+rsp] mov r14,QWORD PTR[16+rsp] mov r13,QWORD PTR[24+rsp] mov r12,QWORD PTR[32+rsp] mov rbx,QWORD PTR[40+rsp] mov rbp,QWORD PTR[48+rsp] lea rsp,QWORD PTR[56+rsp] $L$SEH_epilogue_sgn0_pty_mont_384x:: mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue mov rsi,QWORD PTR[16+rsp] ifdef __SGX_LVI_HARDENING__ pop rdx lfence jmp rdx ud2 else DB 0F3h,0C3h endif $L$SEH_end_sgn0_pty_mont_384x:: sgn0_pty_mont_384x ENDP PUBLIC mul_mont_384 ALIGN 32 mul_mont_384 PROC PUBLIC DB 243,15,30,250 mov QWORD PTR[8+rsp],rdi ;WIN64 prologue mov QWORD PTR[16+rsp],rsi mov r11,rsp $L$SEH_begin_mul_mont_384:: mov rdi,rcx mov rsi,rdx mov rdx,r8 mov rcx,r9 mov r8,QWORD PTR[40+rsp] ifdef __BLST_PORTABLE__ test DWORD PTR[__blst_platform_cap],1 jnz mul_mont_384$1 endif push rbp push rbx push r12 push r13 push r14 push r15 sub rsp,8*3 $L$SEH_body_mul_mont_384:: mov rax,QWORD PTR[rdx] mov r14,QWORD PTR[rsi] mov r15,QWORD PTR[8+rsi] mov r12,QWORD PTR[16+rsi] mov r13,QWORD PTR[24+rsi] mov rbx,rdx mov QWORD PTR[rsp],r8 mov QWORD PTR[8+rsp],rdi call __mulq_mont_384 mov r15,QWORD PTR[24+rsp] mov r14,QWORD PTR[32+rsp] mov r13,QWORD PTR[40+rsp] mov r12,QWORD PTR[48+rsp] mov rbx,QWORD PTR[56+rsp] mov rbp,QWORD PTR[64+rsp] lea rsp,QWORD PTR[72+rsp] $L$SEH_epilogue_mul_mont_384:: mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue mov rsi,QWORD PTR[16+rsp] ifdef __SGX_LVI_HARDENING__ pop rdx lfence jmp rdx ud2 else DB 0F3h,0C3h endif $L$SEH_end_mul_mont_384:: mul_mont_384 ENDP ALIGN 32 __mulq_mont_384 PROC PRIVATE DB 243,15,30,250 mov rdi,rax mul r14 mov r8,rax mov rax,rdi mov r9,rdx mul r15 add r9,rax mov rax,rdi adc rdx,0 mov r10,rdx mul r12 add r10,rax mov rax,rdi adc rdx,0 mov r11,rdx mov rbp,r8 imul r8,QWORD PTR[8+rsp] mul r13 add r11,rax mov rax,rdi adc rdx,0 mov r12,rdx mul QWORD PTR[32+rsi] add r12,rax mov rax,rdi adc rdx,0 mov r13,rdx mul QWORD PTR[40+rsi] add r13,rax mov rax,r8 adc rdx,0 xor r15,r15 mov r14,rdx mul QWORD PTR[rcx] add rbp,rax mov rax,r8 adc rbp,rdx mul QWORD PTR[8+rcx] add r9,rax mov rax,r8 adc rdx,0 add r9,rbp adc rdx,0 mov rbp,rdx mul QWORD PTR[16+rcx] add r10,rax mov rax,r8 adc rdx,0 add r10,rbp adc rdx,0 mov rbp,rdx mul QWORD PTR[24+rcx] add r11,rbp adc rdx,0 add r11,rax mov rax,r8 adc rdx,0 mov rbp,rdx mul QWORD PTR[32+rcx] add r12,rax mov rax,r8 adc rdx,0 add r12,rbp adc rdx,0 mov rbp,rdx mul QWORD PTR[40+rcx] add r13,rax mov rax,QWORD PTR[8+rbx] adc rdx,0 add r13,rbp adc r14,rdx adc r15,0 mov rdi,rax mul QWORD PTR[rsi] add r9,rax mov rax,rdi adc rdx,0 mov r8,rdx mul QWORD PTR[8+rsi] add r10,rax mov rax,rdi adc rdx,0 add r10,r8 adc rdx,0 mov r8,rdx mul QWORD PTR[16+rsi] add r11,rax mov rax,rdi adc rdx,0 add r11,r8 adc rdx,0 mov r8,rdx mov rbp,r9 imul r9,QWORD PTR[8+rsp] mul QWORD PTR[24+rsi] add r12,rax mov rax,rdi adc rdx,0 add r12,r8 adc rdx,0 mov r8,rdx mul QWORD PTR[32+rsi] add r13,rax mov rax,rdi adc rdx,0 add r13,r8 adc rdx,0 mov r8,rdx mul QWORD PTR[40+rsi] add r14,r8 adc rdx,0 xor r8,r8 add r14,rax mov rax,r9 adc r15,rdx adc r8,0 mul QWORD PTR[rcx] add rbp,rax mov rax,r9 adc rbp,rdx mul QWORD PTR[8+rcx] add r10,rax mov rax,r9 adc rdx,0 add r10,rbp adc rdx,0 mov rbp,rdx mul QWORD PTR[16+rcx] add r11,rax mov rax,r9 adc rdx,0 add r11,rbp adc rdx,0 mov rbp,rdx mul QWORD PTR[24+rcx] add r12,rbp adc rdx,0 add r12,rax mov rax,r9 adc rdx,0 mov rbp,rdx mul QWORD PTR[32+rcx] add r13,rax mov rax,r9 adc rdx,0 add r13,rbp adc rdx,0 mov rbp,rdx mul QWORD PTR[40+rcx] add r14,rax mov rax,QWORD PTR[16+rbx] adc rdx,0 add r14,rbp adc r15,rdx 
adc r8,0 mov rdi,rax mul QWORD PTR[rsi] add r10,rax mov rax,rdi adc rdx,0 mov r9,rdx mul QWORD PTR[8+rsi] add r11,rax mov rax,rdi adc rdx,0 add r11,r9 adc rdx,0 mov r9,rdx mul QWORD PTR[16+rsi] add r12,rax mov rax,rdi adc rdx,0 add r12,r9 adc rdx,0 mov r9,rdx mov rbp,r10 imul r10,QWORD PTR[8+rsp] mul QWORD PTR[24+rsi] add r13,rax mov rax,rdi adc rdx,0 add r13,r9 adc rdx,0 mov r9,rdx mul QWORD PTR[32+rsi] add r14,rax mov rax,rdi adc rdx,0 add r14,r9 adc rdx,0 mov r9,rdx mul QWORD PTR[40+rsi] add r15,r9 adc rdx,0 xor r9,r9 add r15,rax mov rax,r10 adc r8,rdx adc r9,0 mul QWORD PTR[rcx] add rbp,rax mov rax,r10 adc rbp,rdx mul QWORD PTR[8+rcx] add r11,rax mov rax,r10 adc rdx,0 add r11,rbp adc rdx,0 mov rbp,rdx mul QWORD PTR[16+rcx] add r12,rax mov rax,r10 adc rdx,0 add r12,rbp adc rdx,0 mov rbp,rdx mul QWORD PTR[24+rcx] add r13,rbp adc rdx,0 add r13,rax mov rax,r10 adc rdx,0 mov rbp,rdx mul QWORD PTR[32+rcx] add r14,rax mov rax,r10 adc rdx,0 add r14,rbp adc rdx,0 mov rbp,rdx mul QWORD PTR[40+rcx] add r15,rax mov rax,QWORD PTR[24+rbx] adc rdx,0 add r15,rbp adc r8,rdx adc r9,0 mov rdi,rax mul QWORD PTR[rsi] add r11,rax mov rax,rdi adc rdx,0 mov r10,rdx mul QWORD PTR[8+rsi] add r12,rax mov rax,rdi adc rdx,0 add r12,r10 adc rdx,0 mov r10,rdx mul QWORD PTR[16+rsi] add r13,rax mov rax,rdi adc rdx,0 add r13,r10 adc rdx,0 mov r10,rdx mov rbp,r11 imul r11,QWORD PTR[8+rsp] mul QWORD PTR[24+rsi] add r14,rax mov rax,rdi adc rdx,0 add r14,r10 adc rdx,0 mov r10,rdx mul QWORD PTR[32+rsi] add r15,rax mov rax,rdi adc rdx,0 add r15,r10 adc rdx,0 mov r10,rdx mul QWORD PTR[40+rsi] add r8,r10 adc rdx,0 xor r10,r10 add r8,rax mov rax,r11 adc r9,rdx adc r10,0 mul QWORD PTR[rcx] add rbp,rax mov rax,r11 adc rbp,rdx mul QWORD PTR[8+rcx] add r12,rax mov rax,r11 adc rdx,0 add r12,rbp adc rdx,0 mov rbp,rdx mul QWORD PTR[16+rcx] add r13,rax mov rax,r11 adc rdx,0 add r13,rbp adc rdx,0 mov rbp,rdx mul QWORD PTR[24+rcx] add r14,rbp adc rdx,0 add r14,rax mov rax,r11 adc rdx,0 mov rbp,rdx mul QWORD PTR[32+rcx] add r15,rax mov rax,r11 adc rdx,0 add r15,rbp adc rdx,0 mov rbp,rdx mul QWORD PTR[40+rcx] add r8,rax mov rax,QWORD PTR[32+rbx] adc rdx,0 add r8,rbp adc r9,rdx adc r10,0 mov rdi,rax mul QWORD PTR[rsi] add r12,rax mov rax,rdi adc rdx,0 mov r11,rdx mul QWORD PTR[8+rsi] add r13,rax mov rax,rdi adc rdx,0 add r13,r11 adc rdx,0 mov r11,rdx mul QWORD PTR[16+rsi] add r14,rax mov rax,rdi adc rdx,0 add r14,r11 adc rdx,0 mov r11,rdx mov rbp,r12 imul r12,QWORD PTR[8+rsp] mul QWORD PTR[24+rsi] add r15,rax mov rax,rdi adc rdx,0 add r15,r11 adc rdx,0 mov r11,rdx mul QWORD PTR[32+rsi] add r8,rax mov rax,rdi adc rdx,0 add r8,r11 adc rdx,0 mov r11,rdx mul QWORD PTR[40+rsi] add r9,r11 adc rdx,0 xor r11,r11 add r9,rax mov rax,r12 adc r10,rdx adc r11,0 mul QWORD PTR[rcx] add rbp,rax mov rax,r12 adc rbp,rdx mul QWORD PTR[8+rcx] add r13,rax mov rax,r12 adc rdx,0 add r13,rbp adc rdx,0 mov rbp,rdx mul QWORD PTR[16+rcx] add r14,rax mov rax,r12 adc rdx,0 add r14,rbp adc rdx,0 mov rbp,rdx mul QWORD PTR[24+rcx] add r15,rbp adc rdx,0 add r15,rax mov rax,r12 adc rdx,0 mov rbp,rdx mul QWORD PTR[32+rcx] add r8,rax mov rax,r12 adc rdx,0 add r8,rbp adc rdx,0 mov rbp,rdx mul QWORD PTR[40+rcx] add r9,rax mov rax,QWORD PTR[40+rbx] adc rdx,0 add r9,rbp adc r10,rdx adc r11,0 mov rdi,rax mul QWORD PTR[rsi] add r13,rax mov rax,rdi adc rdx,0 mov r12,rdx mul QWORD PTR[8+rsi] add r14,rax mov rax,rdi adc rdx,0 add r14,r12 adc rdx,0 mov r12,rdx mul QWORD PTR[16+rsi] add r15,rax mov rax,rdi adc rdx,0 add r15,r12 adc rdx,0 mov r12,rdx mov rbp,r13 imul r13,QWORD PTR[8+rsp] 
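; Montgomery step: r13 <- t0 * n0 (mod 2^64), where [8+rsp] is the n0 value
; stashed by the caller (presumably -p^-1 mod 2^64, the usual Montgomery
; constant); adding m*p below clears the low limb so the accumulator can be
; shifted down one word.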
mul QWORD PTR[24+rsi] add r8,rax mov rax,rdi adc rdx,0 add r8,r12 adc rdx,0 mov r12,rdx mul QWORD PTR[32+rsi] add r9,rax mov rax,rdi adc rdx,0 add r9,r12 adc rdx,0 mov r12,rdx mul QWORD PTR[40+rsi] add r10,r12 adc rdx,0 xor r12,r12 add r10,rax mov rax,r13 adc r11,rdx adc r12,0 mul QWORD PTR[rcx] add rbp,rax mov rax,r13 adc rbp,rdx mul QWORD PTR[8+rcx] add r14,rax mov rax,r13 adc rdx,0 add r14,rbp adc rdx,0 mov rbp,rdx mul QWORD PTR[16+rcx] add r15,rax mov rax,r13 adc rdx,0 add r15,rbp adc rdx,0 mov rbp,rdx mul QWORD PTR[24+rcx] add r8,rbp adc rdx,0 add r8,rax mov rax,r13 adc rdx,0 mov rbp,rdx mul QWORD PTR[32+rcx] add r9,rax mov rax,r13 adc rdx,0 add r9,rbp adc rdx,0 mov rbp,rdx mul QWORD PTR[40+rcx] add r10,rax mov rax,r14 adc rdx,0 add r10,rbp adc r11,rdx adc r12,0 mov rdi,QWORD PTR[16+rsp] sub r14,QWORD PTR[rcx] mov rdx,r15 sbb r15,QWORD PTR[8+rcx] mov rbx,r8 sbb r8,QWORD PTR[16+rcx] mov rsi,r9 sbb r9,QWORD PTR[24+rcx] mov rbp,r10 sbb r10,QWORD PTR[32+rcx] mov r13,r11 sbb r11,QWORD PTR[40+rcx] sbb r12,0 cmovc r14,rax cmovc r15,rdx cmovc r8,rbx mov QWORD PTR[rdi],r14 cmovc r9,rsi mov QWORD PTR[8+rdi],r15 cmovc r10,rbp mov QWORD PTR[16+rdi],r8 cmovc r11,r13 mov QWORD PTR[24+rdi],r9 mov QWORD PTR[32+rdi],r10 mov QWORD PTR[40+rdi],r11 ifdef __SGX_LVI_HARDENING__ pop rdx lfence jmp rdx ud2 else DB 0F3h,0C3h endif __mulq_mont_384 ENDP PUBLIC sqr_n_mul_mont_384 ALIGN 32 sqr_n_mul_mont_384 PROC PUBLIC DB 243,15,30,250 mov QWORD PTR[8+rsp],rdi ;WIN64 prologue mov QWORD PTR[16+rsp],rsi mov r11,rsp $L$SEH_begin_sqr_n_mul_mont_384:: mov rdi,rcx mov rsi,rdx mov rdx,r8 mov rcx,r9 mov r8,QWORD PTR[40+rsp] mov r9,QWORD PTR[48+rsp] ifdef __BLST_PORTABLE__ test DWORD PTR[__blst_platform_cap],1 jnz sqr_n_mul_mont_384$1 endif push rbp push rbx push r12 push r13 push r14 push r15 sub rsp,8*17 $L$SEH_body_sqr_n_mul_mont_384:: mov QWORD PTR[rsp],r8 mov QWORD PTR[8+rsp],rdi mov QWORD PTR[16+rsp],rcx lea rdi,QWORD PTR[32+rsp] mov QWORD PTR[24+rsp],r9 movq xmm2,QWORD PTR[r9] $L$oop_sqr_384:: movd xmm1,edx call __sqrq_384 lea rsi,QWORD PTR[rdi] mov rcx,QWORD PTR[rsp] mov rbx,QWORD PTR[16+rsp] call __mulq_by_1_mont_384 call __redq_tail_mont_384 movd edx,xmm1 lea rsi,QWORD PTR[rdi] dec edx jnz $L$oop_sqr_384 DB 102,72,15,126,208 mov rcx,rbx mov rbx,QWORD PTR[24+rsp] mov r12,r8 mov r13,r9 call __mulq_mont_384 lea r8,QWORD PTR[136+rsp] mov r15,QWORD PTR[136+rsp] mov r14,QWORD PTR[8+r8] mov r13,QWORD PTR[16+r8] mov r12,QWORD PTR[24+r8] mov rbx,QWORD PTR[32+r8] mov rbp,QWORD PTR[40+r8] lea rsp,QWORD PTR[48+r8] $L$SEH_epilogue_sqr_n_mul_mont_384:: mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue mov rsi,QWORD PTR[16+rsp] ifdef __SGX_LVI_HARDENING__ pop rdx lfence jmp rdx ud2 else DB 0F3h,0C3h endif $L$SEH_end_sqr_n_mul_mont_384:: sqr_n_mul_mont_384 ENDP PUBLIC sqr_n_mul_mont_383 ALIGN 32 sqr_n_mul_mont_383 PROC PUBLIC DB 243,15,30,250 mov QWORD PTR[8+rsp],rdi ;WIN64 prologue mov QWORD PTR[16+rsp],rsi mov r11,rsp $L$SEH_begin_sqr_n_mul_mont_383:: mov rdi,rcx mov rsi,rdx mov rdx,r8 mov rcx,r9 mov r8,QWORD PTR[40+rsp] mov r9,QWORD PTR[48+rsp] ifdef __BLST_PORTABLE__ test DWORD PTR[__blst_platform_cap],1 jnz sqr_n_mul_mont_383$1 endif push rbp push rbx push r12 push r13 push r14 push r15 sub rsp,8*17 $L$SEH_body_sqr_n_mul_mont_383:: mov QWORD PTR[rsp],r8 mov QWORD PTR[8+rsp],rdi mov QWORD PTR[16+rsp],rcx lea rdi,QWORD PTR[32+rsp] mov QWORD PTR[24+rsp],r9 movq xmm2,QWORD PTR[r9] $L$oop_sqr_383:: movd xmm1,edx call __sqrq_384 lea rsi,QWORD PTR[rdi] mov rcx,QWORD PTR[rsp] mov rbx,QWORD PTR[16+rsp] call __mulq_by_1_mont_384 movd 
edx,xmm1 add r14,QWORD PTR[48+rsi] adc r15,QWORD PTR[56+rsi] adc r8,QWORD PTR[64+rsi] adc r9,QWORD PTR[72+rsi] adc r10,QWORD PTR[80+rsi] adc r11,QWORD PTR[88+rsi] lea rsi,QWORD PTR[rdi] mov QWORD PTR[rdi],r14 mov QWORD PTR[8+rdi],r15 mov QWORD PTR[16+rdi],r8 mov QWORD PTR[24+rdi],r9 mov QWORD PTR[32+rdi],r10 mov QWORD PTR[40+rdi],r11 dec edx jnz $L$oop_sqr_383 DB 102,72,15,126,208 mov rcx,rbx mov rbx,QWORD PTR[24+rsp] mov r12,r8 mov r13,r9 call __mulq_mont_384 lea r8,QWORD PTR[136+rsp] mov r15,QWORD PTR[136+rsp] mov r14,QWORD PTR[8+r8] mov r13,QWORD PTR[16+r8] mov r12,QWORD PTR[24+r8] mov rbx,QWORD PTR[32+r8] mov rbp,QWORD PTR[40+r8] lea rsp,QWORD PTR[48+r8] $L$SEH_epilogue_sqr_n_mul_mont_383:: mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue mov rsi,QWORD PTR[16+rsp] ifdef __SGX_LVI_HARDENING__ pop rdx lfence jmp rdx ud2 else DB 0F3h,0C3h endif $L$SEH_end_sqr_n_mul_mont_383:: sqr_n_mul_mont_383 ENDP ALIGN 32 __mulq_mont_383_nonred PROC PRIVATE DB 243,15,30,250 mov rbp,rax mul r14 mov r8,rax mov rax,rbp mov r9,rdx mul r15 add r9,rax mov rax,rbp adc rdx,0 mov r10,rdx mul r12 add r10,rax mov rax,rbp adc rdx,0 mov r11,rdx mov r15,r8 imul r8,QWORD PTR[8+rsp] mul r13 add r11,rax mov rax,rbp adc rdx,0 mov r12,rdx mul QWORD PTR[32+rsi] add r12,rax mov rax,rbp adc rdx,0 mov r13,rdx mul QWORD PTR[40+rsi] add r13,rax mov rax,r8 adc rdx,0 mov r14,rdx mul QWORD PTR[rcx] add r15,rax mov rax,r8 adc r15,rdx mul QWORD PTR[8+rcx] add r9,rax mov rax,r8 adc rdx,0 add r9,r15 adc rdx,0 mov r15,rdx mul QWORD PTR[16+rcx] add r10,rax mov rax,r8 adc rdx,0 add r10,r15 adc rdx,0 mov r15,rdx mul QWORD PTR[24+rcx] add r11,r15 adc rdx,0 add r11,rax mov rax,r8 adc rdx,0 mov r15,rdx mul QWORD PTR[32+rcx] add r12,rax mov rax,r8 adc rdx,0 add r12,r15 adc rdx,0 mov r15,rdx mul QWORD PTR[40+rcx] add r13,rax mov rax,QWORD PTR[8+rbx] adc rdx,0 add r13,r15 adc r14,rdx mov rbp,rax mul QWORD PTR[rsi] add r9,rax mov rax,rbp adc rdx,0 mov r15,rdx mul QWORD PTR[8+rsi] add r10,rax mov rax,rbp adc rdx,0 add r10,r15 adc rdx,0 mov r15,rdx mul QWORD PTR[16+rsi] add r11,rax mov rax,rbp adc rdx,0 add r11,r15 adc rdx,0 mov r15,rdx mov r8,r9 imul r9,QWORD PTR[8+rsp] mul QWORD PTR[24+rsi] add r12,rax mov rax,rbp adc rdx,0 add r12,r15 adc rdx,0 mov r15,rdx mul QWORD PTR[32+rsi] add r13,rax mov rax,rbp adc rdx,0 add r13,r15 adc rdx,0 mov r15,rdx mul QWORD PTR[40+rsi] add r14,r15 adc rdx,0 add r14,rax mov rax,r9 adc rdx,0 mov r15,rdx mul QWORD PTR[rcx] add r8,rax mov rax,r9 adc r8,rdx mul QWORD PTR[8+rcx] add r10,rax mov rax,r9 adc rdx,0 add r10,r8 adc rdx,0 mov r8,rdx mul QWORD PTR[16+rcx] add r11,rax mov rax,r9 adc rdx,0 add r11,r8 adc rdx,0 mov r8,rdx mul QWORD PTR[24+rcx] add r12,r8 adc rdx,0 add r12,rax mov rax,r9 adc rdx,0 mov r8,rdx mul QWORD PTR[32+rcx] add r13,rax mov rax,r9 adc rdx,0 add r13,r8 adc rdx,0 mov r8,rdx mul QWORD PTR[40+rcx] add r14,rax mov rax,QWORD PTR[16+rbx] adc rdx,0 add r14,r8 adc r15,rdx mov rbp,rax mul QWORD PTR[rsi] add r10,rax mov rax,rbp adc rdx,0 mov r8,rdx mul QWORD PTR[8+rsi] add r11,rax mov rax,rbp adc rdx,0 add r11,r8 adc rdx,0 mov r8,rdx mul QWORD PTR[16+rsi] add r12,rax mov rax,rbp adc rdx,0 add r12,r8 adc rdx,0 mov r8,rdx mov r9,r10 imul r10,QWORD PTR[8+rsp] mul QWORD PTR[24+rsi] add r13,rax mov rax,rbp adc rdx,0 add r13,r8 adc rdx,0 mov r8,rdx mul QWORD PTR[32+rsi] add r14,rax mov rax,rbp adc rdx,0 add r14,r8 adc rdx,0 mov r8,rdx mul QWORD PTR[40+rsi] add r15,r8 adc rdx,0 add r15,rax mov rax,r10 adc rdx,0 mov r8,rdx mul QWORD PTR[rcx] add r9,rax mov rax,r10 adc r9,rdx mul QWORD PTR[8+rcx] add r11,rax mov rax,r10 
adc rdx,0 add r11,r9 adc rdx,0 mov r9,rdx mul QWORD PTR[16+rcx] add r12,rax mov rax,r10 adc rdx,0 add r12,r9 adc rdx,0 mov r9,rdx mul QWORD PTR[24+rcx] add r13,r9 adc rdx,0 add r13,rax mov rax,r10 adc rdx,0 mov r9,rdx mul QWORD PTR[32+rcx] add r14,rax mov rax,r10 adc rdx,0 add r14,r9 adc rdx,0 mov r9,rdx mul QWORD PTR[40+rcx] add r15,rax mov rax,QWORD PTR[24+rbx] adc rdx,0 add r15,r9 adc r8,rdx mov rbp,rax mul QWORD PTR[rsi] add r11,rax mov rax,rbp adc rdx,0 mov r9,rdx mul QWORD PTR[8+rsi] add r12,rax mov rax,rbp adc rdx,0 add r12,r9 adc rdx,0 mov r9,rdx mul QWORD PTR[16+rsi] add r13,rax mov rax,rbp adc rdx,0 add r13,r9 adc rdx,0 mov r9,rdx mov r10,r11 imul r11,QWORD PTR[8+rsp] mul QWORD PTR[24+rsi] add r14,rax mov rax,rbp adc rdx,0 add r14,r9 adc rdx,0 mov r9,rdx mul QWORD PTR[32+rsi] add r15,rax mov rax,rbp adc rdx,0 add r15,r9 adc rdx,0 mov r9,rdx mul QWORD PTR[40+rsi] add r8,r9 adc rdx,0 add r8,rax mov rax,r11 adc rdx,0 mov r9,rdx mul QWORD PTR[rcx] add r10,rax mov rax,r11 adc r10,rdx mul QWORD PTR[8+rcx] add r12,rax mov rax,r11 adc rdx,0 add r12,r10 adc rdx,0 mov r10,rdx mul QWORD PTR[16+rcx] add r13,rax mov rax,r11 adc rdx,0 add r13,r10 adc rdx,0 mov r10,rdx mul QWORD PTR[24+rcx] add r14,r10 adc rdx,0 add r14,rax mov rax,r11 adc rdx,0 mov r10,rdx mul QWORD PTR[32+rcx] add r15,rax mov rax,r11 adc rdx,0 add r15,r10 adc rdx,0 mov r10,rdx mul QWORD PTR[40+rcx] add r8,rax mov rax,QWORD PTR[32+rbx] adc rdx,0 add r8,r10 adc r9,rdx mov rbp,rax mul QWORD PTR[rsi] add r12,rax mov rax,rbp adc rdx,0 mov r10,rdx mul QWORD PTR[8+rsi] add r13,rax mov rax,rbp adc rdx,0 add r13,r10 adc rdx,0 mov r10,rdx mul QWORD PTR[16+rsi] add r14,rax mov rax,rbp adc rdx,0 add r14,r10 adc rdx,0 mov r10,rdx mov r11,r12 imul r12,QWORD PTR[8+rsp] mul QWORD PTR[24+rsi] add r15,rax mov rax,rbp adc rdx,0 add r15,r10 adc rdx,0 mov r10,rdx mul QWORD PTR[32+rsi] add r8,rax mov rax,rbp adc rdx,0 add r8,r10 adc rdx,0 mov r10,rdx mul QWORD PTR[40+rsi] add r9,r10 adc rdx,0 add r9,rax mov rax,r12 adc rdx,0 mov r10,rdx mul QWORD PTR[rcx] add r11,rax mov rax,r12 adc r11,rdx mul QWORD PTR[8+rcx] add r13,rax mov rax,r12 adc rdx,0 add r13,r11 adc rdx,0 mov r11,rdx mul QWORD PTR[16+rcx] add r14,rax mov rax,r12 adc rdx,0 add r14,r11 adc rdx,0 mov r11,rdx mul QWORD PTR[24+rcx] add r15,r11 adc rdx,0 add r15,rax mov rax,r12 adc rdx,0 mov r11,rdx mul QWORD PTR[32+rcx] add r8,rax mov rax,r12 adc rdx,0 add r8,r11 adc rdx,0 mov r11,rdx mul QWORD PTR[40+rcx] add r9,rax mov rax,QWORD PTR[40+rbx] adc rdx,0 add r9,r11 adc r10,rdx mov rbp,rax mul QWORD PTR[rsi] add r13,rax mov rax,rbp adc rdx,0 mov r11,rdx mul QWORD PTR[8+rsi] add r14,rax mov rax,rbp adc rdx,0 add r14,r11 adc rdx,0 mov r11,rdx mul QWORD PTR[16+rsi] add r15,rax mov rax,rbp adc rdx,0 add r15,r11 adc rdx,0 mov r11,rdx mov r12,r13 imul r13,QWORD PTR[8+rsp] mul QWORD PTR[24+rsi] add r8,rax mov rax,rbp adc rdx,0 add r8,r11 adc rdx,0 mov r11,rdx mul QWORD PTR[32+rsi] add r9,rax mov rax,rbp adc rdx,0 add r9,r11 adc rdx,0 mov r11,rdx mul QWORD PTR[40+rsi] add r10,r11 adc rdx,0 add r10,rax mov rax,r13 adc rdx,0 mov r11,rdx mul QWORD PTR[rcx] add r12,rax mov rax,r13 adc r12,rdx mul QWORD PTR[8+rcx] add r14,rax mov rax,r13 adc rdx,0 add r14,r12 adc rdx,0 mov r12,rdx mul QWORD PTR[16+rcx] add r15,rax mov rax,r13 adc rdx,0 add r15,r12 adc rdx,0 mov r12,rdx mul QWORD PTR[24+rcx] add r8,r12 adc rdx,0 add r8,rax mov rax,r13 adc rdx,0 mov r12,rdx mul QWORD PTR[32+rcx] add r9,rax mov rax,r13 adc rdx,0 add r9,r12 adc rdx,0 mov r12,rdx mul QWORD PTR[40+rcx] add r10,rax mov rax,r14 adc rdx,0 add 
r10,r12 adc r11,rdx ifdef __SGX_LVI_HARDENING__ pop rdx lfence jmp rdx ud2 else DB 0F3h,0C3h endif __mulq_mont_383_nonred ENDP PUBLIC sqr_mont_382x ALIGN 32 sqr_mont_382x PROC PUBLIC DB 243,15,30,250 mov QWORD PTR[8+rsp],rdi ;WIN64 prologue mov QWORD PTR[16+rsp],rsi mov r11,rsp $L$SEH_begin_sqr_mont_382x:: mov rdi,rcx mov rsi,rdx mov rdx,r8 mov rcx,r9 ifdef __BLST_PORTABLE__ test DWORD PTR[__blst_platform_cap],1 jnz sqr_mont_382x$1 endif push rbp push rbx push r12 push r13 push r14 push r15 sub rsp,136 $L$SEH_body_sqr_mont_382x:: mov QWORD PTR[rsp],rcx mov rcx,rdx mov QWORD PTR[16+rsp],rsi mov QWORD PTR[24+rsp],rdi mov r8,QWORD PTR[rsi] mov r9,QWORD PTR[8+rsi] mov r10,QWORD PTR[16+rsi] mov r11,QWORD PTR[24+rsi] mov r12,QWORD PTR[32+rsi] mov r13,QWORD PTR[40+rsi] mov r14,r8 add r8,QWORD PTR[48+rsi] mov r15,r9 adc r9,QWORD PTR[56+rsi] mov rax,r10 adc r10,QWORD PTR[64+rsi] mov rdx,r11 adc r11,QWORD PTR[72+rsi] mov rbx,r12 adc r12,QWORD PTR[80+rsi] mov rbp,r13 adc r13,QWORD PTR[88+rsi] sub r14,QWORD PTR[48+rsi] sbb r15,QWORD PTR[56+rsi] sbb rax,QWORD PTR[64+rsi] sbb rdx,QWORD PTR[72+rsi] sbb rbx,QWORD PTR[80+rsi] sbb rbp,QWORD PTR[88+rsi] sbb rdi,rdi mov QWORD PTR[((32+0))+rsp],r8 mov QWORD PTR[((32+8))+rsp],r9 mov QWORD PTR[((32+16))+rsp],r10 mov QWORD PTR[((32+24))+rsp],r11 mov QWORD PTR[((32+32))+rsp],r12 mov QWORD PTR[((32+40))+rsp],r13 mov QWORD PTR[((32+48))+rsp],r14 mov QWORD PTR[((32+56))+rsp],r15 mov QWORD PTR[((32+64))+rsp],rax mov QWORD PTR[((32+72))+rsp],rdx mov QWORD PTR[((32+80))+rsp],rbx mov QWORD PTR[((32+88))+rsp],rbp mov QWORD PTR[((32+96))+rsp],rdi lea rbx,QWORD PTR[48+rsi] mov rax,QWORD PTR[48+rsi] mov r14,QWORD PTR[rsi] mov r15,QWORD PTR[8+rsi] mov r12,QWORD PTR[16+rsi] mov r13,QWORD PTR[24+rsi] mov rdi,QWORD PTR[24+rsp] call __mulq_mont_383_nonred add r14,r14 adc r15,r15 adc r8,r8 adc r9,r9 adc r10,r10 adc r11,r11 mov QWORD PTR[48+rdi],r14 mov QWORD PTR[56+rdi],r15 mov QWORD PTR[64+rdi],r8 mov QWORD PTR[72+rdi],r9 mov QWORD PTR[80+rdi],r10 mov QWORD PTR[88+rdi],r11 lea rsi,QWORD PTR[32+rsp] lea rbx,QWORD PTR[((32+48))+rsp] mov rax,QWORD PTR[((32+48))+rsp] mov r14,QWORD PTR[((32+0))+rsp] mov r15,QWORD PTR[((32+8))+rsp] mov r12,QWORD PTR[((32+16))+rsp] mov r13,QWORD PTR[((32+24))+rsp] call __mulq_mont_383_nonred mov rsi,QWORD PTR[((32+96))+rsp] mov r12,QWORD PTR[((32+0))+rsp] mov r13,QWORD PTR[((32+8))+rsp] and r12,rsi mov rax,QWORD PTR[((32+16))+rsp] and r13,rsi mov rbx,QWORD PTR[((32+24))+rsp] and rax,rsi mov rbp,QWORD PTR[((32+32))+rsp] and rbx,rsi and rbp,rsi and rsi,QWORD PTR[((32+40))+rsp] sub r14,r12 mov r12,QWORD PTR[rcx] sbb r15,r13 mov r13,QWORD PTR[8+rcx] sbb r8,rax mov rax,QWORD PTR[16+rcx] sbb r9,rbx mov rbx,QWORD PTR[24+rcx] sbb r10,rbp mov rbp,QWORD PTR[32+rcx] sbb r11,rsi sbb rsi,rsi and r12,rsi and r13,rsi and rax,rsi and rbx,rsi and rbp,rsi and rsi,QWORD PTR[40+rcx] add r14,r12 adc r15,r13 adc r8,rax adc r9,rbx adc r10,rbp adc r11,rsi mov QWORD PTR[rdi],r14 mov QWORD PTR[8+rdi],r15 mov QWORD PTR[16+rdi],r8 mov QWORD PTR[24+rdi],r9 mov QWORD PTR[32+rdi],r10 mov QWORD PTR[40+rdi],r11 lea r8,QWORD PTR[136+rsp] mov r15,QWORD PTR[r8] mov r14,QWORD PTR[8+r8] mov r13,QWORD PTR[16+r8] mov r12,QWORD PTR[24+r8] mov rbx,QWORD PTR[32+r8] mov rbp,QWORD PTR[40+r8] lea rsp,QWORD PTR[48+r8] $L$SEH_epilogue_sqr_mont_382x:: mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue mov rsi,QWORD PTR[16+rsp] ifdef __SGX_LVI_HARDENING__ pop rdx lfence jmp rdx ud2 else DB 0F3h,0C3h endif $L$SEH_end_sqr_mont_382x:: sqr_mont_382x ENDP .text$ ENDS .pdata SEGMENT READONLY ALIGN(4) ALIGN 4 DD 
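; Exception directory: every public routine above gets three RUNTIME_FUNCTION
; entries, one per region (prologue, body, epilogue), each holding imagerel
; begin/end addresses plus the matching $L$SEH_info_* unwind descriptor
; defined in the .xdata segment further down.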
imagerel $L$SEH_begin_mul_mont_384x DD imagerel $L$SEH_body_mul_mont_384x DD imagerel $L$SEH_info_mul_mont_384x_prologue DD imagerel $L$SEH_body_mul_mont_384x DD imagerel $L$SEH_epilogue_mul_mont_384x DD imagerel $L$SEH_info_mul_mont_384x_body DD imagerel $L$SEH_epilogue_mul_mont_384x DD imagerel $L$SEH_end_mul_mont_384x DD imagerel $L$SEH_info_mul_mont_384x_epilogue DD imagerel $L$SEH_begin_sqr_mont_384x DD imagerel $L$SEH_body_sqr_mont_384x DD imagerel $L$SEH_info_sqr_mont_384x_prologue DD imagerel $L$SEH_body_sqr_mont_384x DD imagerel $L$SEH_epilogue_sqr_mont_384x DD imagerel $L$SEH_info_sqr_mont_384x_body DD imagerel $L$SEH_epilogue_sqr_mont_384x DD imagerel $L$SEH_end_sqr_mont_384x DD imagerel $L$SEH_info_sqr_mont_384x_epilogue DD imagerel $L$SEH_begin_mul_382x DD imagerel $L$SEH_body_mul_382x DD imagerel $L$SEH_info_mul_382x_prologue DD imagerel $L$SEH_body_mul_382x DD imagerel $L$SEH_epilogue_mul_382x DD imagerel $L$SEH_info_mul_382x_body DD imagerel $L$SEH_epilogue_mul_382x DD imagerel $L$SEH_end_mul_382x DD imagerel $L$SEH_info_mul_382x_epilogue DD imagerel $L$SEH_begin_sqr_382x DD imagerel $L$SEH_body_sqr_382x DD imagerel $L$SEH_info_sqr_382x_prologue DD imagerel $L$SEH_body_sqr_382x DD imagerel $L$SEH_epilogue_sqr_382x DD imagerel $L$SEH_info_sqr_382x_body DD imagerel $L$SEH_epilogue_sqr_382x DD imagerel $L$SEH_end_sqr_382x DD imagerel $L$SEH_info_sqr_382x_epilogue DD imagerel $L$SEH_begin_mul_384 DD imagerel $L$SEH_body_mul_384 DD imagerel $L$SEH_info_mul_384_prologue DD imagerel $L$SEH_body_mul_384 DD imagerel $L$SEH_epilogue_mul_384 DD imagerel $L$SEH_info_mul_384_body DD imagerel $L$SEH_epilogue_mul_384 DD imagerel $L$SEH_end_mul_384 DD imagerel $L$SEH_info_mul_384_epilogue DD imagerel $L$SEH_begin_sqr_384 DD imagerel $L$SEH_body_sqr_384 DD imagerel $L$SEH_info_sqr_384_prologue DD imagerel $L$SEH_body_sqr_384 DD imagerel $L$SEH_epilogue_sqr_384 DD imagerel $L$SEH_info_sqr_384_body DD imagerel $L$SEH_epilogue_sqr_384 DD imagerel $L$SEH_end_sqr_384 DD imagerel $L$SEH_info_sqr_384_epilogue DD imagerel $L$SEH_begin_sqr_mont_384 DD imagerel $L$SEH_body_sqr_mont_384 DD imagerel $L$SEH_info_sqr_mont_384_prologue DD imagerel $L$SEH_body_sqr_mont_384 DD imagerel $L$SEH_epilogue_sqr_mont_384 DD imagerel $L$SEH_info_sqr_mont_384_body DD imagerel $L$SEH_epilogue_sqr_mont_384 DD imagerel $L$SEH_end_sqr_mont_384 DD imagerel $L$SEH_info_sqr_mont_384_epilogue DD imagerel $L$SEH_begin_redc_mont_384 DD imagerel $L$SEH_body_redc_mont_384 DD imagerel $L$SEH_info_redc_mont_384_prologue DD imagerel $L$SEH_body_redc_mont_384 DD imagerel $L$SEH_epilogue_redc_mont_384 DD imagerel $L$SEH_info_redc_mont_384_body DD imagerel $L$SEH_epilogue_redc_mont_384 DD imagerel $L$SEH_end_redc_mont_384 DD imagerel $L$SEH_info_redc_mont_384_epilogue DD imagerel $L$SEH_begin_from_mont_384 DD imagerel $L$SEH_body_from_mont_384 DD imagerel $L$SEH_info_from_mont_384_prologue DD imagerel $L$SEH_body_from_mont_384 DD imagerel $L$SEH_epilogue_from_mont_384 DD imagerel $L$SEH_info_from_mont_384_body DD imagerel $L$SEH_epilogue_from_mont_384 DD imagerel $L$SEH_end_from_mont_384 DD imagerel $L$SEH_info_from_mont_384_epilogue DD imagerel $L$SEH_begin_sgn0_pty_mont_384 DD imagerel $L$SEH_body_sgn0_pty_mont_384 DD imagerel $L$SEH_info_sgn0_pty_mont_384_prologue DD imagerel $L$SEH_body_sgn0_pty_mont_384 DD imagerel $L$SEH_epilogue_sgn0_pty_mont_384 DD imagerel $L$SEH_info_sgn0_pty_mont_384_body DD imagerel $L$SEH_epilogue_sgn0_pty_mont_384 DD imagerel $L$SEH_end_sgn0_pty_mont_384 DD imagerel 
$L$SEH_info_sgn0_pty_mont_384_epilogue DD imagerel $L$SEH_begin_sgn0_pty_mont_384x DD imagerel $L$SEH_body_sgn0_pty_mont_384x DD imagerel $L$SEH_info_sgn0_pty_mont_384x_prologue DD imagerel $L$SEH_body_sgn0_pty_mont_384x DD imagerel $L$SEH_epilogue_sgn0_pty_mont_384x DD imagerel $L$SEH_info_sgn0_pty_mont_384x_body DD imagerel $L$SEH_epilogue_sgn0_pty_mont_384x DD imagerel $L$SEH_end_sgn0_pty_mont_384x DD imagerel $L$SEH_info_sgn0_pty_mont_384x_epilogue DD imagerel $L$SEH_begin_mul_mont_384 DD imagerel $L$SEH_body_mul_mont_384 DD imagerel $L$SEH_info_mul_mont_384_prologue DD imagerel $L$SEH_body_mul_mont_384 DD imagerel $L$SEH_epilogue_mul_mont_384 DD imagerel $L$SEH_info_mul_mont_384_body DD imagerel $L$SEH_epilogue_mul_mont_384 DD imagerel $L$SEH_end_mul_mont_384 DD imagerel $L$SEH_info_mul_mont_384_epilogue DD imagerel $L$SEH_begin_sqr_n_mul_mont_384 DD imagerel $L$SEH_body_sqr_n_mul_mont_384 DD imagerel $L$SEH_info_sqr_n_mul_mont_384_prologue DD imagerel $L$SEH_body_sqr_n_mul_mont_384 DD imagerel $L$SEH_epilogue_sqr_n_mul_mont_384 DD imagerel $L$SEH_info_sqr_n_mul_mont_384_body DD imagerel $L$SEH_epilogue_sqr_n_mul_mont_384 DD imagerel $L$SEH_end_sqr_n_mul_mont_384 DD imagerel $L$SEH_info_sqr_n_mul_mont_384_epilogue DD imagerel $L$SEH_begin_sqr_n_mul_mont_383 DD imagerel $L$SEH_body_sqr_n_mul_mont_383 DD imagerel $L$SEH_info_sqr_n_mul_mont_383_prologue DD imagerel $L$SEH_body_sqr_n_mul_mont_383 DD imagerel $L$SEH_epilogue_sqr_n_mul_mont_383 DD imagerel $L$SEH_info_sqr_n_mul_mont_383_body DD imagerel $L$SEH_epilogue_sqr_n_mul_mont_383 DD imagerel $L$SEH_end_sqr_n_mul_mont_383 DD imagerel $L$SEH_info_sqr_n_mul_mont_383_epilogue DD imagerel $L$SEH_begin_sqr_mont_382x DD imagerel $L$SEH_body_sqr_mont_382x DD imagerel $L$SEH_info_sqr_mont_382x_prologue DD imagerel $L$SEH_body_sqr_mont_382x DD imagerel $L$SEH_epilogue_sqr_mont_382x DD imagerel $L$SEH_info_sqr_mont_382x_body DD imagerel $L$SEH_epilogue_sqr_mont_382x DD imagerel $L$SEH_end_sqr_mont_382x DD imagerel $L$SEH_info_sqr_mont_382x_epilogue .pdata ENDS .xdata SEGMENT READONLY ALIGN(8) ALIGN 8 $L$SEH_info_mul_mont_384x_prologue:: DB 1,0,5,00bh DB 0,074h,1,0 DB 0,064h,2,0 DB 0,0b3h DB 0,0 DD 0,0 $L$SEH_info_mul_mont_384x_body:: DB 1,0,18,0 DB 000h,0f4h,029h,000h DB 000h,0e4h,02ah,000h DB 000h,0d4h,02bh,000h DB 000h,0c4h,02ch,000h DB 000h,034h,02dh,000h DB 000h,054h,02eh,000h DB 000h,074h,030h,000h DB 000h,064h,031h,000h DB 000h,001h,02fh,000h DB 000h,000h,000h,000h DB 000h,000h,000h,000h $L$SEH_info_mul_mont_384x_epilogue:: DB 1,0,4,0 DB 000h,074h,001h,000h DB 000h,064h,002h,000h DB 000h,000h,000h,000h $L$SEH_info_sqr_mont_384x_prologue:: DB 1,0,5,00bh DB 0,074h,1,0 DB 0,064h,2,0 DB 0,0b3h DB 0,0 DD 0,0 $L$SEH_info_sqr_mont_384x_body:: DB 1,0,18,0 DB 000h,0f4h,011h,000h DB 000h,0e4h,012h,000h DB 000h,0d4h,013h,000h DB 000h,0c4h,014h,000h DB 000h,034h,015h,000h DB 000h,054h,016h,000h DB 000h,074h,018h,000h DB 000h,064h,019h,000h DB 000h,001h,017h,000h DB 000h,000h,000h,000h DB 000h,000h,000h,000h $L$SEH_info_sqr_mont_384x_epilogue:: DB 1,0,4,0 DB 000h,074h,001h,000h DB 000h,064h,002h,000h DB 000h,000h,000h,000h $L$SEH_info_mul_382x_prologue:: DB 1,0,5,00bh DB 0,074h,1,0 DB 0,064h,2,0 DB 0,0b3h DB 0,0 DD 0,0 $L$SEH_info_mul_382x_body:: DB 1,0,18,0 DB 000h,0f4h,011h,000h DB 000h,0e4h,012h,000h DB 000h,0d4h,013h,000h DB 000h,0c4h,014h,000h DB 000h,034h,015h,000h DB 000h,054h,016h,000h DB 000h,074h,018h,000h DB 000h,064h,019h,000h DB 000h,001h,017h,000h DB 000h,000h,000h,000h DB 000h,000h,000h,000h $L$SEH_info_mul_382x_epilogue:: DB 1,0,4,0 
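; Each $L$SEH_info_* record is a Windows x64 UNWIND_INFO structure, roughly:
;
;     DB version/flags, size-of-prologue, count-of-codes, frame-register
;     DB code-offset, (op-info SHL 4) OR unwind-op, ...
;
; e.g. the byte pair 074h,001h below encodes UWOP_SAVE_NONVOL of rdi at
; [8+rsp], and 064h,002h the matching save of rsi at [16+rsp], mirroring
; the ";WIN64 prologue" stores at the top of every public routine.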
DB 000h,074h,001h,000h DB 000h,064h,002h,000h DB 000h,000h,000h,000h $L$SEH_info_sqr_382x_prologue:: DB 1,0,5,00bh DB 0,074h,1,0 DB 0,064h,2,0 DB 0,0b3h DB 0,0 DD 0,0 $L$SEH_info_sqr_382x_body:: DB 1,0,17,0 DB 000h,0f4h,001h,000h DB 000h,0e4h,002h,000h DB 000h,0d4h,003h,000h DB 000h,0c4h,004h,000h DB 000h,034h,005h,000h DB 000h,054h,006h,000h DB 000h,074h,008h,000h DB 000h,064h,009h,000h DB 000h,062h DB 000h,000h,000h,000h,000h,000h DB 000h,000h,000h,000h $L$SEH_info_sqr_382x_epilogue:: DB 1,0,4,0 DB 000h,074h,001h,000h DB 000h,064h,002h,000h DB 000h,000h,000h,000h $L$SEH_info_mul_384_prologue:: DB 1,0,5,00bh DB 0,074h,1,0 DB 0,064h,2,0 DB 0,0b3h DB 0,0 DD 0,0 $L$SEH_info_mul_384_body:: DB 1,0,11,0 DB 000h,0c4h,000h,000h DB 000h,034h,001h,000h DB 000h,054h,002h,000h DB 000h,074h,004h,000h DB 000h,064h,005h,000h DB 000h,022h DB 000h,000h,000h,000h,000h,000h $L$SEH_info_mul_384_epilogue:: DB 1,0,4,0 DB 000h,074h,001h,000h DB 000h,064h,002h,000h DB 000h,000h,000h,000h $L$SEH_info_sqr_384_prologue:: DB 1,0,5,00bh DB 0,074h,1,0 DB 0,064h,2,0 DB 0,0b3h DB 0,0 DD 0,0 $L$SEH_info_sqr_384_body:: DB 1,0,17,0 DB 000h,0f4h,001h,000h DB 000h,0e4h,002h,000h DB 000h,0d4h,003h,000h DB 000h,0c4h,004h,000h DB 000h,034h,005h,000h DB 000h,054h,006h,000h DB 000h,074h,008h,000h DB 000h,064h,009h,000h DB 000h,062h DB 000h,000h,000h,000h,000h,000h DB 000h,000h,000h,000h $L$SEH_info_sqr_384_epilogue:: DB 1,0,4,0 DB 000h,074h,001h,000h DB 000h,064h,002h,000h DB 000h,000h,000h,000h $L$SEH_info_sqr_mont_384_prologue:: DB 1,0,5,00bh DB 0,074h,1,0 DB 0,064h,2,0 DB 0,0b3h DB 0,0 DD 0,0 $L$SEH_info_sqr_mont_384_body:: DB 1,0,18,0 DB 000h,0f4h,00fh,000h DB 000h,0e4h,010h,000h DB 000h,0d4h,011h,000h DB 000h,0c4h,012h,000h DB 000h,034h,013h,000h DB 000h,054h,014h,000h DB 000h,074h,016h,000h DB 000h,064h,017h,000h DB 000h,001h,015h,000h DB 000h,000h,000h,000h DB 000h,000h,000h,000h $L$SEH_info_sqr_mont_384_epilogue:: DB 1,0,4,0 DB 000h,074h,001h,000h DB 000h,064h,002h,000h DB 000h,000h,000h,000h $L$SEH_info_redc_mont_384_prologue:: DB 1,0,5,00bh DB 0,074h,1,0 DB 0,064h,2,0 DB 0,0b3h DB 0,0 DD 0,0 $L$SEH_info_redc_mont_384_body:: DB 1,0,17,0 DB 000h,0f4h,001h,000h DB 000h,0e4h,002h,000h DB 000h,0d4h,003h,000h DB 000h,0c4h,004h,000h DB 000h,034h,005h,000h DB 000h,054h,006h,000h DB 000h,074h,008h,000h DB 000h,064h,009h,000h DB 000h,062h DB 000h,000h,000h,000h,000h,000h DB 000h,000h,000h,000h $L$SEH_info_redc_mont_384_epilogue:: DB 1,0,4,0 DB 000h,074h,001h,000h DB 000h,064h,002h,000h DB 000h,000h,000h,000h $L$SEH_info_from_mont_384_prologue:: DB 1,0,5,00bh DB 0,074h,1,0 DB 0,064h,2,0 DB 0,0b3h DB 0,0 DD 0,0 $L$SEH_info_from_mont_384_body:: DB 1,0,17,0 DB 000h,0f4h,001h,000h DB 000h,0e4h,002h,000h DB 000h,0d4h,003h,000h DB 000h,0c4h,004h,000h DB 000h,034h,005h,000h DB 000h,054h,006h,000h DB 000h,074h,008h,000h DB 000h,064h,009h,000h DB 000h,062h DB 000h,000h,000h,000h,000h,000h DB 000h,000h,000h,000h $L$SEH_info_from_mont_384_epilogue:: DB 1,0,4,0 DB 000h,074h,001h,000h DB 000h,064h,002h,000h DB 000h,000h,000h,000h $L$SEH_info_sgn0_pty_mont_384_prologue:: DB 1,0,5,00bh DB 0,074h,1,0 DB 0,064h,2,0 DB 0,0b3h DB 0,0 DD 0,0 $L$SEH_info_sgn0_pty_mont_384_body:: DB 1,0,17,0 DB 000h,0f4h,001h,000h DB 000h,0e4h,002h,000h DB 000h,0d4h,003h,000h DB 000h,0c4h,004h,000h DB 000h,034h,005h,000h DB 000h,054h,006h,000h DB 000h,074h,008h,000h DB 000h,064h,009h,000h DB 000h,062h DB 000h,000h,000h,000h,000h,000h DB 000h,000h,000h,000h $L$SEH_info_sgn0_pty_mont_384_epilogue:: DB 1,0,4,0 DB 000h,074h,001h,000h DB 000h,064h,002h,000h DB 
000h,000h,000h,000h
$L$SEH_info_sgn0_pty_mont_384x_prologue:: DB 1,0,5,00bh DB 0,074h,1,0 DB 0,064h,2,0 DB 0,0b3h DB 0,0 DD 0,0
$L$SEH_info_sgn0_pty_mont_384x_body:: DB 1,0,17,0 DB 000h,0f4h,001h,000h DB 000h,0e4h,002h,000h DB 000h,0d4h,003h,000h DB 000h,0c4h,004h,000h DB 000h,034h,005h,000h DB 000h,054h,006h,000h DB 000h,074h,008h,000h DB 000h,064h,009h,000h DB 000h,062h DB 000h,000h,000h,000h,000h,000h DB 000h,000h,000h,000h
$L$SEH_info_sgn0_pty_mont_384x_epilogue:: DB 1,0,4,0 DB 000h,074h,001h,000h DB 000h,064h,002h,000h DB 000h,000h,000h,000h
$L$SEH_info_mul_mont_384_prologue:: DB 1,0,5,00bh DB 0,074h,1,0 DB 0,064h,2,0 DB 0,0b3h DB 0,0 DD 0,0
$L$SEH_info_mul_mont_384_body:: DB 1,0,17,0 DB 000h,0f4h,003h,000h DB 000h,0e4h,004h,000h DB 000h,0d4h,005h,000h DB 000h,0c4h,006h,000h DB 000h,034h,007h,000h DB 000h,054h,008h,000h DB 000h,074h,00ah,000h DB 000h,064h,00bh,000h DB 000h,082h DB 000h,000h,000h,000h,000h,000h DB 000h,000h,000h,000h
$L$SEH_info_mul_mont_384_epilogue:: DB 1,0,4,0 DB 000h,074h,001h,000h DB 000h,064h,002h,000h DB 000h,000h,000h,000h
$L$SEH_info_sqr_n_mul_mont_384_prologue:: DB 1,0,5,00bh DB 0,074h,1,0 DB 0,064h,2,0 DB 0,0b3h DB 0,0 DD 0,0
$L$SEH_info_sqr_n_mul_mont_384_body:: DB 1,0,18,0 DB 000h,0f4h,011h,000h DB 000h,0e4h,012h,000h DB 000h,0d4h,013h,000h DB 000h,0c4h,014h,000h DB 000h,034h,015h,000h DB 000h,054h,016h,000h DB 000h,074h,018h,000h DB 000h,064h,019h,000h DB 000h,001h,017h,000h DB 000h,000h,000h,000h DB 000h,000h,000h,000h
$L$SEH_info_sqr_n_mul_mont_384_epilogue:: DB 1,0,4,0 DB 000h,074h,001h,000h DB 000h,064h,002h,000h DB 000h,000h,000h,000h
$L$SEH_info_sqr_n_mul_mont_383_prologue:: DB 1,0,5,00bh DB 0,074h,1,0 DB 0,064h,2,0 DB 0,0b3h DB 0,0 DD 0,0
$L$SEH_info_sqr_n_mul_mont_383_body:: DB 1,0,18,0 DB 000h,0f4h,011h,000h DB 000h,0e4h,012h,000h DB 000h,0d4h,013h,000h DB 000h,0c4h,014h,000h DB 000h,034h,015h,000h DB 000h,054h,016h,000h DB 000h,074h,018h,000h DB 000h,064h,019h,000h DB 000h,001h,017h,000h DB 000h,000h,000h,000h DB 000h,000h,000h,000h
$L$SEH_info_sqr_n_mul_mont_383_epilogue:: DB 1,0,4,0 DB 000h,074h,001h,000h DB 000h,064h,002h,000h DB 000h,000h,000h,000h
$L$SEH_info_sqr_mont_382x_prologue:: DB 1,0,5,00bh DB 0,074h,1,0 DB 0,064h,2,0 DB 0,0b3h DB 0,0 DD 0,0
$L$SEH_info_sqr_mont_382x_body:: DB 1,0,18,0 DB 000h,0f4h,011h,000h DB 000h,0e4h,012h,000h DB 000h,0d4h,013h,000h DB 000h,0c4h,014h,000h DB 000h,034h,015h,000h DB 000h,054h,016h,000h DB 000h,074h,018h,000h DB 000h,064h,019h,000h DB 000h,001h,017h,000h DB 000h,000h,000h,000h DB 000h,000h,000h,000h
$L$SEH_info_sqr_mont_382x_epilogue:: DB 1,0,4,0 DB 000h,074h,001h,000h DB 000h,064h,002h,000h DB 000h,000h,000h,000h
.xdata ENDS
END

================================================
FILE: build/win64/mulx_mont_256-x86_64.asm
================================================
OPTION DOTNAME
PUBLIC mul_mont_sparse_256$1
PUBLIC sqr_mont_sparse_256$1
PUBLIC from_mont_256$1
PUBLIC redc_mont_256$1
.text$ SEGMENT ALIGN(256) 'CODE'
PUBLIC mulx_mont_sparse_256
ALIGN 32
mulx_mont_sparse_256 PROC PUBLIC
DB 243,15,30,250
mov QWORD PTR[8+rsp],rdi ;WIN64 prologue
mov QWORD PTR[16+rsp],rsi
mov r11,rsp
$L$SEH_begin_mulx_mont_sparse_256:: mov rdi,rcx mov rsi,rdx mov rdx,r8 mov rcx,r9 mov r8,QWORD PTR[40+rsp]
mul_mont_sparse_256$1:: push rbp push rbx push r12 push r13 push r14 push r15 sub rsp,8
$L$SEH_body_mulx_mont_sparse_256:: mov rbx,rdx
ifdef __SGX_LVI_HARDENING__
lfence
endif
mov rdx,QWORD PTR[rdx] mov r14,QWORD PTR[rsi] mov r15,QWORD PTR[8+rsi] mov rbp,QWORD PTR[16+rsi] mov r9,QWORD PTR[24+rsi] lea rsi,QWORD
PTR[((-128))+rsi] lea rcx,QWORD PTR[((-128))+rcx] mulx r11,rax,r14 call __mulx_mont_sparse_256 mov r15,QWORD PTR[8+rsp] mov r14,QWORD PTR[16+rsp] mov r13,QWORD PTR[24+rsp] mov r12,QWORD PTR[32+rsp] mov rbx,QWORD PTR[40+rsp] mov rbp,QWORD PTR[48+rsp] lea rsp,QWORD PTR[56+rsp] $L$SEH_epilogue_mulx_mont_sparse_256:: mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue mov rsi,QWORD PTR[16+rsp] ifdef __SGX_LVI_HARDENING__ pop rdx lfence jmp rdx ud2 else DB 0F3h,0C3h endif $L$SEH_end_mulx_mont_sparse_256:: mulx_mont_sparse_256 ENDP PUBLIC sqrx_mont_sparse_256 ALIGN 32 sqrx_mont_sparse_256 PROC PUBLIC DB 243,15,30,250 mov QWORD PTR[8+rsp],rdi ;WIN64 prologue mov QWORD PTR[16+rsp],rsi mov r11,rsp $L$SEH_begin_sqrx_mont_sparse_256:: mov rdi,rcx mov rsi,rdx mov rdx,r8 mov rcx,r9 sqr_mont_sparse_256$1:: push rbp push rbx push r12 push r13 push r14 push r15 sub rsp,8 $L$SEH_body_sqrx_mont_sparse_256:: mov rbx,rsi mov r8,rcx mov rcx,rdx ifdef __SGX_LVI_HARDENING__ lfence endif mov rdx,QWORD PTR[rsi] mov r15,QWORD PTR[8+rsi] mov rbp,QWORD PTR[16+rsi] mov r9,QWORD PTR[24+rsi] lea rsi,QWORD PTR[((-128))+rbx] lea rcx,QWORD PTR[((-128))+rcx] mulx r11,rax,rdx call __mulx_mont_sparse_256 mov r15,QWORD PTR[8+rsp] mov r14,QWORD PTR[16+rsp] mov r13,QWORD PTR[24+rsp] mov r12,QWORD PTR[32+rsp] mov rbx,QWORD PTR[40+rsp] mov rbp,QWORD PTR[48+rsp] lea rsp,QWORD PTR[56+rsp] $L$SEH_epilogue_sqrx_mont_sparse_256:: mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue mov rsi,QWORD PTR[16+rsp] ifdef __SGX_LVI_HARDENING__ pop rdx lfence jmp rdx ud2 else DB 0F3h,0C3h endif $L$SEH_end_sqrx_mont_sparse_256:: sqrx_mont_sparse_256 ENDP ALIGN 32 __mulx_mont_sparse_256 PROC PRIVATE DB 243,15,30,250 mulx r12,r15,r15 mulx r13,rbp,rbp add r11,r15 mulx r14,r9,r9 mov rdx,QWORD PTR[8+rbx] adc r12,rbp adc r13,r9 adc r14,0 mov r10,rax imul rax,r8 xor r15,r15 mulx r9,rbp,QWORD PTR[((0+128))+rsi] adox r11,rbp adcx r12,r9 mulx r9,rbp,QWORD PTR[((8+128))+rsi] adox r12,rbp adcx r13,r9 mulx r9,rbp,QWORD PTR[((16+128))+rsi] adox r13,rbp adcx r14,r9 mulx r9,rbp,QWORD PTR[((24+128))+rsi] mov rdx,rax adox r14,rbp adcx r9,r15 adox r15,r9 mulx rax,rbp,QWORD PTR[((0+128))+rcx] adcx r10,rbp adox rax,r11 mulx r9,rbp,QWORD PTR[((8+128))+rcx] adcx rax,rbp adox r12,r9 mulx r9,rbp,QWORD PTR[((16+128))+rcx] adcx r12,rbp adox r13,r9 mulx r9,rbp,QWORD PTR[((24+128))+rcx] mov rdx,QWORD PTR[16+rbx] adcx r13,rbp adox r14,r9 adcx r14,r10 adox r15,r10 adcx r15,r10 adox r10,r10 adc r10,0 mov r11,rax imul rax,r8 xor rbp,rbp mulx r9,rbp,QWORD PTR[((0+128))+rsi] adox r12,rbp adcx r13,r9 mulx r9,rbp,QWORD PTR[((8+128))+rsi] adox r13,rbp adcx r14,r9 mulx r9,rbp,QWORD PTR[((16+128))+rsi] adox r14,rbp adcx r15,r9 mulx r9,rbp,QWORD PTR[((24+128))+rsi] mov rdx,rax adox r15,rbp adcx r9,r10 adox r10,r9 mulx rax,rbp,QWORD PTR[((0+128))+rcx] adcx r11,rbp adox rax,r12 mulx r9,rbp,QWORD PTR[((8+128))+rcx] adcx rax,rbp adox r13,r9 mulx r9,rbp,QWORD PTR[((16+128))+rcx] adcx r13,rbp adox r14,r9 mulx r9,rbp,QWORD PTR[((24+128))+rcx] mov rdx,QWORD PTR[24+rbx] adcx r14,rbp adox r15,r9 adcx r15,r11 adox r10,r11 adcx r10,r11 adox r11,r11 adc r11,0 mov r12,rax imul rax,r8 xor rbp,rbp mulx r9,rbp,QWORD PTR[((0+128))+rsi] adox r13,rbp adcx r14,r9 mulx r9,rbp,QWORD PTR[((8+128))+rsi] adox r14,rbp adcx r15,r9 mulx r9,rbp,QWORD PTR[((16+128))+rsi] adox r15,rbp adcx r10,r9 mulx r9,rbp,QWORD PTR[((24+128))+rsi] mov rdx,rax adox r10,rbp adcx r9,r11 adox r11,r9 mulx rax,rbp,QWORD PTR[((0+128))+rcx] adcx r12,rbp adox rax,r13 mulx r9,rbp,QWORD PTR[((8+128))+rcx] adcx rax,rbp adox r14,r9 mulx r9,rbp,QWORD 
PTR[((16+128))+rcx] adcx r14,rbp adox r15,r9 mulx r9,rbp,QWORD PTR[((24+128))+rcx] mov rdx,rax adcx r15,rbp adox r10,r9 adcx r10,r12 adox r11,r12 adcx r11,r12 adox r12,r12 adc r12,0 imul rdx,r8 xor rbp,rbp mulx r9,r13,QWORD PTR[((0+128))+rcx] adcx r13,rax adox r14,r9 mulx r9,rbp,QWORD PTR[((8+128))+rcx] adcx r14,rbp adox r15,r9 mulx r9,rbp,QWORD PTR[((16+128))+rcx] adcx r15,rbp adox r10,r9 mulx r9,rbp,QWORD PTR[((24+128))+rcx] mov rdx,r14 lea rcx,QWORD PTR[128+rcx] adcx r10,rbp adox r11,r9 mov rax,r15 adcx r11,r13 adox r12,r13 adc r12,0 mov rbp,r10 sub r14,QWORD PTR[rcx] sbb r15,QWORD PTR[8+rcx] sbb r10,QWORD PTR[16+rcx] mov r9,r11 sbb r11,QWORD PTR[24+rcx] sbb r12,0 cmovc r14,rdx cmovc r15,rax cmovc r10,rbp mov QWORD PTR[rdi],r14 cmovc r11,r9 mov QWORD PTR[8+rdi],r15 mov QWORD PTR[16+rdi],r10 mov QWORD PTR[24+rdi],r11 ifdef __SGX_LVI_HARDENING__ pop rdx lfence jmp rdx ud2 else DB 0F3h,0C3h endif __mulx_mont_sparse_256 ENDP PUBLIC fromx_mont_256 ALIGN 32 fromx_mont_256 PROC PUBLIC DB 243,15,30,250 mov QWORD PTR[8+rsp],rdi ;WIN64 prologue mov QWORD PTR[16+rsp],rsi mov r11,rsp $L$SEH_begin_fromx_mont_256:: mov rdi,rcx mov rsi,rdx mov rdx,r8 mov rcx,r9 from_mont_256$1:: push rbp push rbx push r12 push r13 push r14 push r15 sub rsp,8 $L$SEH_body_fromx_mont_256:: mov rbx,rdx call __mulx_by_1_mont_256 mov rdx,r15 mov r12,r10 mov r13,r11 sub r14,QWORD PTR[rbx] sbb r15,QWORD PTR[8+rbx] sbb r10,QWORD PTR[16+rbx] sbb r11,QWORD PTR[24+rbx] cmovnc rax,r14 cmovnc rdx,r15 cmovnc r12,r10 mov QWORD PTR[rdi],rax cmovnc r13,r11 mov QWORD PTR[8+rdi],rdx mov QWORD PTR[16+rdi],r12 mov QWORD PTR[24+rdi],r13 mov r15,QWORD PTR[8+rsp] mov r14,QWORD PTR[16+rsp] mov r13,QWORD PTR[24+rsp] mov r12,QWORD PTR[32+rsp] mov rbx,QWORD PTR[40+rsp] mov rbp,QWORD PTR[48+rsp] lea rsp,QWORD PTR[56+rsp] $L$SEH_epilogue_fromx_mont_256:: mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue mov rsi,QWORD PTR[16+rsp] ifdef __SGX_LVI_HARDENING__ pop rdx lfence jmp rdx ud2 else DB 0F3h,0C3h endif $L$SEH_end_fromx_mont_256:: fromx_mont_256 ENDP PUBLIC redcx_mont_256 ALIGN 32 redcx_mont_256 PROC PUBLIC DB 243,15,30,250 mov QWORD PTR[8+rsp],rdi ;WIN64 prologue mov QWORD PTR[16+rsp],rsi mov r11,rsp $L$SEH_begin_redcx_mont_256:: mov rdi,rcx mov rsi,rdx mov rdx,r8 mov rcx,r9 redc_mont_256$1:: push rbp push rbx push r12 push r13 push r14 push r15 sub rsp,8 $L$SEH_body_redcx_mont_256:: mov rbx,rdx call __mulx_by_1_mont_256 add r14,QWORD PTR[32+rsi] adc r15,QWORD PTR[40+rsi] mov rax,r14 adc r10,QWORD PTR[48+rsi] mov rdx,r15 adc r11,QWORD PTR[56+rsi] sbb rsi,rsi mov r12,r10 sub r14,QWORD PTR[rbx] sbb r15,QWORD PTR[8+rbx] sbb r10,QWORD PTR[16+rbx] mov r13,r11 sbb r11,QWORD PTR[24+rbx] sbb rsi,0 cmovnc rax,r14 cmovnc rdx,r15 cmovnc r12,r10 mov QWORD PTR[rdi],rax cmovnc r13,r11 mov QWORD PTR[8+rdi],rdx mov QWORD PTR[16+rdi],r12 mov QWORD PTR[24+rdi],r13 mov r15,QWORD PTR[8+rsp] mov r14,QWORD PTR[16+rsp] mov r13,QWORD PTR[24+rsp] mov r12,QWORD PTR[32+rsp] mov rbx,QWORD PTR[40+rsp] mov rbp,QWORD PTR[48+rsp] lea rsp,QWORD PTR[56+rsp] $L$SEH_epilogue_redcx_mont_256:: mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue mov rsi,QWORD PTR[16+rsp] ifdef __SGX_LVI_HARDENING__ pop rdx lfence jmp rdx ud2 else DB 0F3h,0C3h endif $L$SEH_end_redcx_mont_256:: redcx_mont_256 ENDP ALIGN 32 __mulx_by_1_mont_256 PROC PRIVATE DB 243,15,30,250 ifdef __SGX_LVI_HARDENING__ lfence endif mov rax,QWORD PTR[rsi] mov r11,QWORD PTR[8+rsi] mov r12,QWORD PTR[16+rsi] mov r13,QWORD PTR[24+rsi] mov r14,rax imul rax,rcx mov r10,rax mul QWORD PTR[rbx] add r14,rax mov rax,r10 adc r14,rdx mul QWORD 
PTR[8+rbx] add r11,rax mov rax,r10 adc rdx,0 add r11,r14 adc rdx,0 mov r14,rdx mul QWORD PTR[16+rbx] mov r15,r11 imul r11,rcx add r12,rax mov rax,r10 adc rdx,0 add r12,r14 adc rdx,0 mov r14,rdx mul QWORD PTR[24+rbx] add r13,rax mov rax,r11 adc rdx,0 add r13,r14 adc rdx,0 mov r14,rdx mul QWORD PTR[rbx] add r15,rax mov rax,r11 adc r15,rdx mul QWORD PTR[8+rbx] add r12,rax mov rax,r11 adc rdx,0 add r12,r15 adc rdx,0 mov r15,rdx mul QWORD PTR[16+rbx] mov r10,r12 imul r12,rcx add r13,rax mov rax,r11 adc rdx,0 add r13,r15 adc rdx,0 mov r15,rdx mul QWORD PTR[24+rbx] add r14,rax mov rax,r12 adc rdx,0 add r14,r15 adc rdx,0 mov r15,rdx mul QWORD PTR[rbx] add r10,rax mov rax,r12 adc r10,rdx mul QWORD PTR[8+rbx] add r13,rax mov rax,r12 adc rdx,0 add r13,r10 adc rdx,0 mov r10,rdx mul QWORD PTR[16+rbx] mov r11,r13 imul r13,rcx add r14,rax mov rax,r12 adc rdx,0 add r14,r10 adc rdx,0 mov r10,rdx mul QWORD PTR[24+rbx] add r15,rax mov rax,r13 adc rdx,0 add r15,r10 adc rdx,0 mov r10,rdx mul QWORD PTR[rbx] add r11,rax mov rax,r13 adc r11,rdx mul QWORD PTR[8+rbx] add r14,rax mov rax,r13 adc rdx,0 add r14,r11 adc rdx,0 mov r11,rdx mul QWORD PTR[16+rbx] add r15,rax mov rax,r13 adc rdx,0 add r15,r11 adc rdx,0 mov r11,rdx mul QWORD PTR[24+rbx] add r10,rax mov rax,r14 adc rdx,0 add r10,r11 adc rdx,0 mov r11,rdx ifdef __SGX_LVI_HARDENING__ pop rdx lfence jmp rdx ud2 else DB 0F3h,0C3h endif __mulx_by_1_mont_256 ENDP .text$ ENDS .pdata SEGMENT READONLY ALIGN(4) ALIGN 4 DD imagerel $L$SEH_begin_mulx_mont_sparse_256 DD imagerel $L$SEH_body_mulx_mont_sparse_256 DD imagerel $L$SEH_info_mulx_mont_sparse_256_prologue DD imagerel $L$SEH_body_mulx_mont_sparse_256 DD imagerel $L$SEH_epilogue_mulx_mont_sparse_256 DD imagerel $L$SEH_info_mulx_mont_sparse_256_body DD imagerel $L$SEH_epilogue_mulx_mont_sparse_256 DD imagerel $L$SEH_end_mulx_mont_sparse_256 DD imagerel $L$SEH_info_mulx_mont_sparse_256_epilogue DD imagerel $L$SEH_begin_sqrx_mont_sparse_256 DD imagerel $L$SEH_body_sqrx_mont_sparse_256 DD imagerel $L$SEH_info_sqrx_mont_sparse_256_prologue DD imagerel $L$SEH_body_sqrx_mont_sparse_256 DD imagerel $L$SEH_epilogue_sqrx_mont_sparse_256 DD imagerel $L$SEH_info_sqrx_mont_sparse_256_body DD imagerel $L$SEH_epilogue_sqrx_mont_sparse_256 DD imagerel $L$SEH_end_sqrx_mont_sparse_256 DD imagerel $L$SEH_info_sqrx_mont_sparse_256_epilogue DD imagerel $L$SEH_begin_fromx_mont_256 DD imagerel $L$SEH_body_fromx_mont_256 DD imagerel $L$SEH_info_fromx_mont_256_prologue DD imagerel $L$SEH_body_fromx_mont_256 DD imagerel $L$SEH_epilogue_fromx_mont_256 DD imagerel $L$SEH_info_fromx_mont_256_body DD imagerel $L$SEH_epilogue_fromx_mont_256 DD imagerel $L$SEH_end_fromx_mont_256 DD imagerel $L$SEH_info_fromx_mont_256_epilogue DD imagerel $L$SEH_begin_redcx_mont_256 DD imagerel $L$SEH_body_redcx_mont_256 DD imagerel $L$SEH_info_redcx_mont_256_prologue DD imagerel $L$SEH_body_redcx_mont_256 DD imagerel $L$SEH_epilogue_redcx_mont_256 DD imagerel $L$SEH_info_redcx_mont_256_body DD imagerel $L$SEH_epilogue_redcx_mont_256 DD imagerel $L$SEH_end_redcx_mont_256 DD imagerel $L$SEH_info_redcx_mont_256_epilogue .pdata ENDS .xdata SEGMENT READONLY ALIGN(8) ALIGN 8 $L$SEH_info_mulx_mont_sparse_256_prologue:: DB 1,0,5,00bh DB 0,074h,1,0 DB 0,064h,2,0 DB 0,0b3h DB 0,0 DD 0,0 $L$SEH_info_mulx_mont_sparse_256_body:: DB 1,0,17,0 DB 000h,0f4h,001h,000h DB 000h,0e4h,002h,000h DB 000h,0d4h,003h,000h DB 000h,0c4h,004h,000h DB 000h,034h,005h,000h DB 000h,054h,006h,000h DB 000h,074h,008h,000h DB 000h,064h,009h,000h DB 000h,062h DB 000h,000h,000h,000h,000h,000h DB 
000h,000h,000h,000h
$L$SEH_info_mulx_mont_sparse_256_epilogue:: DB 1,0,4,0 DB 000h,074h,001h,000h DB 000h,064h,002h,000h DB 000h,000h,000h,000h
$L$SEH_info_sqrx_mont_sparse_256_prologue:: DB 1,0,5,00bh DB 0,074h,1,0 DB 0,064h,2,0 DB 0,0b3h DB 0,0 DD 0,0
$L$SEH_info_sqrx_mont_sparse_256_body:: DB 1,0,17,0 DB 000h,0f4h,001h,000h DB 000h,0e4h,002h,000h DB 000h,0d4h,003h,000h DB 000h,0c4h,004h,000h DB 000h,034h,005h,000h DB 000h,054h,006h,000h DB 000h,074h,008h,000h DB 000h,064h,009h,000h DB 000h,062h DB 000h,000h,000h,000h,000h,000h DB 000h,000h,000h,000h
$L$SEH_info_sqrx_mont_sparse_256_epilogue:: DB 1,0,4,0 DB 000h,074h,001h,000h DB 000h,064h,002h,000h DB 000h,000h,000h,000h
$L$SEH_info_fromx_mont_256_prologue:: DB 1,0,5,00bh DB 0,074h,1,0 DB 0,064h,2,0 DB 0,0b3h DB 0,0 DD 0,0
$L$SEH_info_fromx_mont_256_body:: DB 1,0,17,0 DB 000h,0f4h,001h,000h DB 000h,0e4h,002h,000h DB 000h,0d4h,003h,000h DB 000h,0c4h,004h,000h DB 000h,034h,005h,000h DB 000h,054h,006h,000h DB 000h,074h,008h,000h DB 000h,064h,009h,000h DB 000h,062h DB 000h,000h,000h,000h,000h,000h DB 000h,000h,000h,000h
$L$SEH_info_fromx_mont_256_epilogue:: DB 1,0,4,0 DB 000h,074h,001h,000h DB 000h,064h,002h,000h DB 000h,000h,000h,000h
$L$SEH_info_redcx_mont_256_prologue:: DB 1,0,5,00bh DB 0,074h,1,0 DB 0,064h,2,0 DB 0,0b3h DB 0,0 DD 0,0
$L$SEH_info_redcx_mont_256_body:: DB 1,0,17,0 DB 000h,0f4h,001h,000h DB 000h,0e4h,002h,000h DB 000h,0d4h,003h,000h DB 000h,0c4h,004h,000h DB 000h,034h,005h,000h DB 000h,054h,006h,000h DB 000h,074h,008h,000h DB 000h,064h,009h,000h DB 000h,062h DB 000h,000h,000h,000h,000h,000h DB 000h,000h,000h,000h
$L$SEH_info_redcx_mont_256_epilogue:: DB 1,0,4,0 DB 000h,074h,001h,000h DB 000h,064h,002h,000h DB 000h,000h,000h,000h
.xdata ENDS
END

================================================
FILE: build/win64/mulx_mont_384-x86_64.asm
================================================
OPTION DOTNAME
PUBLIC mul_mont_384x$1
PUBLIC sqr_mont_384x$1
PUBLIC mul_382x$1
PUBLIC sqr_382x$1
PUBLIC mul_384$1
PUBLIC sqr_384$1
PUBLIC redc_mont_384$1
PUBLIC from_mont_384$1
PUBLIC sgn0_pty_mont_384$1
PUBLIC sgn0_pty_mont_384x$1
PUBLIC mul_mont_384$1
PUBLIC sqr_mont_384$1
PUBLIC sqr_n_mul_mont_384$1
PUBLIC sqr_n_mul_mont_383$1
PUBLIC sqr_mont_382x$1
.text$ SEGMENT ALIGN(256) 'CODE'
ALIGN 32
__subx_mod_384x384 PROC PRIVATE
DB 243,15,30,250
mov r8,QWORD PTR[rsi] mov r9,QWORD PTR[8+rsi] mov r10,QWORD PTR[16+rsi] mov r11,QWORD PTR[24+rsi] mov r12,QWORD PTR[32+rsi] mov r13,QWORD PTR[40+rsi] mov r14,QWORD PTR[48+rsi] sub r8,QWORD PTR[rdx] mov r15,QWORD PTR[56+rsi] sbb r9,QWORD PTR[8+rdx] mov rax,QWORD PTR[64+rsi] sbb r10,QWORD PTR[16+rdx] mov rbx,QWORD PTR[72+rsi] sbb r11,QWORD PTR[24+rdx] mov rbp,QWORD PTR[80+rsi] sbb r12,QWORD PTR[32+rdx] mov rsi,QWORD PTR[88+rsi] sbb r13,QWORD PTR[40+rdx] mov QWORD PTR[rdi],r8 sbb r14,QWORD PTR[48+rdx] mov r8,QWORD PTR[rcx] mov QWORD PTR[8+rdi],r9 sbb r15,QWORD PTR[56+rdx] mov r9,QWORD PTR[8+rcx] mov QWORD PTR[16+rdi],r10 sbb rax,QWORD PTR[64+rdx] mov r10,QWORD PTR[16+rcx] mov QWORD PTR[24+rdi],r11 sbb rbx,QWORD PTR[72+rdx] mov r11,QWORD PTR[24+rcx] mov QWORD PTR[32+rdi],r12 sbb rbp,QWORD PTR[80+rdx] mov r12,QWORD PTR[32+rcx] mov QWORD PTR[40+rdi],r13 sbb rsi,QWORD PTR[88+rdx] mov r13,QWORD PTR[40+rcx] sbb rdx,rdx and r8,rdx and r9,rdx and r10,rdx and r11,rdx and r12,rdx and r13,rdx add r14,r8 adc r15,r9 mov QWORD PTR[48+rdi],r14 adc rax,r10 mov QWORD PTR[56+rdi],r15 adc rbx,r11 mov QWORD PTR[64+rdi],rax adc rbp,r12 mov QWORD PTR[72+rdi],rbx adc rsi,r13 mov QWORD PTR[80+rdi],rbp mov QWORD PTR[88+rdi],rsi ifdef
__SGX_LVI_HARDENING__ pop rdx lfence jmp rdx ud2 else DB 0F3h,0C3h endif __subx_mod_384x384 ENDP ALIGN 32 __addx_mod_384 PROC PRIVATE DB 243,15,30,250 ifdef __SGX_LVI_HARDENING__ lfence endif mov r8,QWORD PTR[rsi] mov r9,QWORD PTR[8+rsi] mov r10,QWORD PTR[16+rsi] mov r11,QWORD PTR[24+rsi] mov r12,QWORD PTR[32+rsi] mov r13,QWORD PTR[40+rsi] add r8,QWORD PTR[rdx] adc r9,QWORD PTR[8+rdx] adc r10,QWORD PTR[16+rdx] mov r14,r8 adc r11,QWORD PTR[24+rdx] mov r15,r9 adc r12,QWORD PTR[32+rdx] mov rax,r10 adc r13,QWORD PTR[40+rdx] mov rbx,r11 sbb rdx,rdx sub r8,QWORD PTR[rcx] sbb r9,QWORD PTR[8+rcx] mov rbp,r12 sbb r10,QWORD PTR[16+rcx] sbb r11,QWORD PTR[24+rcx] sbb r12,QWORD PTR[32+rcx] mov rsi,r13 sbb r13,QWORD PTR[40+rcx] sbb rdx,0 cmovc r8,r14 cmovc r9,r15 cmovc r10,rax mov QWORD PTR[rdi],r8 cmovc r11,rbx mov QWORD PTR[8+rdi],r9 cmovc r12,rbp mov QWORD PTR[16+rdi],r10 cmovc r13,rsi mov QWORD PTR[24+rdi],r11 mov QWORD PTR[32+rdi],r12 mov QWORD PTR[40+rdi],r13 ifdef __SGX_LVI_HARDENING__ pop rdx lfence jmp rdx ud2 else DB 0F3h,0C3h endif __addx_mod_384 ENDP ALIGN 32 __subx_mod_384 PROC PRIVATE DB 243,15,30,250 ifdef __SGX_LVI_HARDENING__ lfence endif mov r8,QWORD PTR[rsi] mov r9,QWORD PTR[8+rsi] mov r10,QWORD PTR[16+rsi] mov r11,QWORD PTR[24+rsi] mov r12,QWORD PTR[32+rsi] mov r13,QWORD PTR[40+rsi] __subx_mod_384_a_is_loaded:: sub r8,QWORD PTR[rdx] mov r14,QWORD PTR[rcx] sbb r9,QWORD PTR[8+rdx] mov r15,QWORD PTR[8+rcx] sbb r10,QWORD PTR[16+rdx] mov rax,QWORD PTR[16+rcx] sbb r11,QWORD PTR[24+rdx] mov rbx,QWORD PTR[24+rcx] sbb r12,QWORD PTR[32+rdx] mov rbp,QWORD PTR[32+rcx] sbb r13,QWORD PTR[40+rdx] mov rsi,QWORD PTR[40+rcx] sbb rdx,rdx and r14,rdx and r15,rdx and rax,rdx and rbx,rdx and rbp,rdx and rsi,rdx add r8,r14 adc r9,r15 mov QWORD PTR[rdi],r8 adc r10,rax mov QWORD PTR[8+rdi],r9 adc r11,rbx mov QWORD PTR[16+rdi],r10 adc r12,rbp mov QWORD PTR[24+rdi],r11 adc r13,rsi mov QWORD PTR[32+rdi],r12 mov QWORD PTR[40+rdi],r13 ifdef __SGX_LVI_HARDENING__ pop rdx lfence jmp rdx ud2 else DB 0F3h,0C3h endif __subx_mod_384 ENDP PUBLIC mulx_mont_384x ALIGN 32 mulx_mont_384x PROC PUBLIC DB 243,15,30,250 mov QWORD PTR[8+rsp],rdi ;WIN64 prologue mov QWORD PTR[16+rsp],rsi mov r11,rsp $L$SEH_begin_mulx_mont_384x:: mov rdi,rcx mov rsi,rdx mov rdx,r8 mov rcx,r9 mov r8,QWORD PTR[40+rsp] mul_mont_384x$1:: push rbp push rbx push r12 push r13 push r14 push r15 sub rsp,328 $L$SEH_body_mulx_mont_384x:: mov rbx,rdx mov QWORD PTR[32+rsp],rdi mov QWORD PTR[24+rsp],rsi mov QWORD PTR[16+rsp],rdx mov QWORD PTR[8+rsp],rcx mov QWORD PTR[rsp],r8 lea rdi,QWORD PTR[40+rsp] ifdef __SGX_LVI_HARDENING__ lfence endif call __mulx_384 lea rbx,QWORD PTR[48+rbx] lea rsi,QWORD PTR[((128+48))+rsi] lea rdi,QWORD PTR[96+rdi] call __mulx_384 mov rcx,QWORD PTR[8+rsp] lea rsi,QWORD PTR[rbx] lea rdx,QWORD PTR[((-48))+rbx] lea rdi,QWORD PTR[((40+192+48))+rsp] ifdef __SGX_LVI_HARDENING__ lfence endif call __addx_mod_384 mov rsi,QWORD PTR[24+rsp] lea rdx,QWORD PTR[48+rsi] lea rdi,QWORD PTR[((-48))+rdi] ifdef __SGX_LVI_HARDENING__ lfence endif call __addx_mod_384 lea rbx,QWORD PTR[rdi] lea rsi,QWORD PTR[48+rdi] call __mulx_384 lea rsi,QWORD PTR[rdi] lea rdx,QWORD PTR[40+rsp] mov rcx,QWORD PTR[8+rsp] ifdef __SGX_LVI_HARDENING__ lfence endif call __subx_mod_384x384 lea rsi,QWORD PTR[rdi] lea rdx,QWORD PTR[((-96))+rdi] call __subx_mod_384x384 lea rsi,QWORD PTR[40+rsp] lea rdx,QWORD PTR[((40+96))+rsp] lea rdi,QWORD PTR[40+rsp] call __subx_mod_384x384 lea rbx,QWORD PTR[rcx] lea rsi,QWORD PTR[40+rsp] mov rcx,QWORD PTR[rsp] mov rdi,QWORD PTR[32+rsp] call 
__mulx_by_1_mont_384 call __redx_tail_mont_384 lea rsi,QWORD PTR[((40+192))+rsp] mov rcx,QWORD PTR[rsp] lea rdi,QWORD PTR[48+rdi] call __mulx_by_1_mont_384 call __redx_tail_mont_384 lea r8,QWORD PTR[328+rsp] mov r15,QWORD PTR[r8] mov r14,QWORD PTR[8+r8] mov r13,QWORD PTR[16+r8] mov r12,QWORD PTR[24+r8] mov rbx,QWORD PTR[32+r8] mov rbp,QWORD PTR[40+r8] lea rsp,QWORD PTR[48+r8] $L$SEH_epilogue_mulx_mont_384x:: mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue mov rsi,QWORD PTR[16+rsp] ifdef __SGX_LVI_HARDENING__ pop rdx lfence jmp rdx ud2 else DB 0F3h,0C3h endif $L$SEH_end_mulx_mont_384x:: mulx_mont_384x ENDP PUBLIC sqrx_mont_384x ALIGN 32 sqrx_mont_384x PROC PUBLIC DB 243,15,30,250 mov QWORD PTR[8+rsp],rdi ;WIN64 prologue mov QWORD PTR[16+rsp],rsi mov r11,rsp $L$SEH_begin_sqrx_mont_384x:: mov rdi,rcx mov rsi,rdx mov rdx,r8 mov rcx,r9 sqr_mont_384x$1:: push rbp push rbx push r12 push r13 push r14 push r15 sub rsp,136 $L$SEH_body_sqrx_mont_384x:: mov QWORD PTR[rsp],rcx mov rcx,rdx mov QWORD PTR[16+rsp],rdi mov QWORD PTR[24+rsp],rsi lea rdx,QWORD PTR[48+rsi] lea rdi,QWORD PTR[32+rsp] call __addx_mod_384 mov rsi,QWORD PTR[24+rsp] lea rdx,QWORD PTR[48+rsi] lea rdi,QWORD PTR[((32+48))+rsp] call __subx_mod_384 mov rsi,QWORD PTR[24+rsp] lea rbx,QWORD PTR[48+rsi] ifdef __SGX_LVI_HARDENING__ lfence endif mov rdx,QWORD PTR[48+rsi] mov r14,QWORD PTR[rsi] mov r15,QWORD PTR[8+rsi] mov rax,QWORD PTR[16+rsi] mov r12,QWORD PTR[24+rsi] mov rdi,QWORD PTR[32+rsi] mov rbp,QWORD PTR[40+rsi] lea rsi,QWORD PTR[((-128))+rsi] lea rcx,QWORD PTR[((-128))+rcx] mulx r9,r8,r14 call __mulx_mont_384 add rdx,rdx adc r15,r15 adc rax,rax mov r8,rdx adc r12,r12 mov r9,r15 adc rdi,rdi mov r10,rax adc rbp,rbp mov r11,r12 sbb rsi,rsi sub rdx,QWORD PTR[rcx] sbb r15,QWORD PTR[8+rcx] mov r13,rdi sbb rax,QWORD PTR[16+rcx] sbb r12,QWORD PTR[24+rcx] sbb rdi,QWORD PTR[32+rcx] mov r14,rbp sbb rbp,QWORD PTR[40+rcx] sbb rsi,0 cmovc rdx,r8 cmovc r15,r9 cmovc rax,r10 mov QWORD PTR[48+rbx],rdx cmovc r12,r11 mov QWORD PTR[56+rbx],r15 cmovc rdi,r13 mov QWORD PTR[64+rbx],rax cmovc rbp,r14 mov QWORD PTR[72+rbx],r12 mov QWORD PTR[80+rbx],rdi mov QWORD PTR[88+rbx],rbp lea rsi,QWORD PTR[32+rsp] lea rbx,QWORD PTR[((32+48))+rsp] mov rdx,QWORD PTR[((32+48))+rsp] mov r14,QWORD PTR[((32+0))+rsp] mov r15,QWORD PTR[((32+8))+rsp] mov rax,QWORD PTR[((32+16))+rsp] mov r12,QWORD PTR[((32+24))+rsp] mov rdi,QWORD PTR[((32+32))+rsp] mov rbp,QWORD PTR[((32+40))+rsp] lea rsi,QWORD PTR[((-128))+rsi] lea rcx,QWORD PTR[((-128))+rcx] mulx r9,r8,r14 call __mulx_mont_384 lea r8,QWORD PTR[136+rsp] mov r15,QWORD PTR[r8] mov r14,QWORD PTR[8+r8] mov r13,QWORD PTR[16+r8] mov r12,QWORD PTR[24+r8] mov rbx,QWORD PTR[32+r8] mov rbp,QWORD PTR[40+r8] lea rsp,QWORD PTR[48+r8] $L$SEH_epilogue_sqrx_mont_384x:: mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue mov rsi,QWORD PTR[16+rsp] ifdef __SGX_LVI_HARDENING__ pop rdx lfence jmp rdx ud2 else DB 0F3h,0C3h endif $L$SEH_end_sqrx_mont_384x:: sqrx_mont_384x ENDP PUBLIC mulx_382x ALIGN 32 mulx_382x PROC PUBLIC DB 243,15,30,250 mov QWORD PTR[8+rsp],rdi ;WIN64 prologue mov QWORD PTR[16+rsp],rsi mov r11,rsp $L$SEH_begin_mulx_382x:: mov rdi,rcx mov rsi,rdx mov rdx,r8 mov rcx,r9 mul_382x$1:: push rbp push rbx push r12 push r13 push r14 push r15 sub rsp,136 $L$SEH_body_mulx_382x:: lea rdi,QWORD PTR[96+rdi] mov QWORD PTR[rsp],rsi mov QWORD PTR[8+rsp],rdx mov QWORD PTR[16+rsp],rdi mov QWORD PTR[24+rsp],rcx ifdef __SGX_LVI_HARDENING__ lfence endif mov r8,QWORD PTR[rsi] mov r9,QWORD PTR[8+rsi] mov r10,QWORD PTR[16+rsi] mov r11,QWORD PTR[24+rsi] mov r12,QWORD 
PTR[32+rsi] mov r13,QWORD PTR[40+rsi] add r8,QWORD PTR[48+rsi] adc r9,QWORD PTR[56+rsi] adc r10,QWORD PTR[64+rsi] adc r11,QWORD PTR[72+rsi] adc r12,QWORD PTR[80+rsi] adc r13,QWORD PTR[88+rsi] mov QWORD PTR[((32+0))+rsp],r8 mov QWORD PTR[((32+8))+rsp],r9 mov QWORD PTR[((32+16))+rsp],r10 mov QWORD PTR[((32+24))+rsp],r11 mov QWORD PTR[((32+32))+rsp],r12 mov QWORD PTR[((32+40))+rsp],r13 mov r8,QWORD PTR[rdx] mov r9,QWORD PTR[8+rdx] mov r10,QWORD PTR[16+rdx] mov r11,QWORD PTR[24+rdx] mov r12,QWORD PTR[32+rdx] mov r13,QWORD PTR[40+rdx] add r8,QWORD PTR[48+rdx] adc r9,QWORD PTR[56+rdx] adc r10,QWORD PTR[64+rdx] adc r11,QWORD PTR[72+rdx] adc r12,QWORD PTR[80+rdx] adc r13,QWORD PTR[88+rdx] mov QWORD PTR[((32+48))+rsp],r8 mov QWORD PTR[((32+56))+rsp],r9 mov QWORD PTR[((32+64))+rsp],r10 mov QWORD PTR[((32+72))+rsp],r11 mov QWORD PTR[((32+80))+rsp],r12 mov QWORD PTR[((32+88))+rsp],r13 lea rsi,QWORD PTR[((32+0))+rsp] lea rbx,QWORD PTR[((32+48))+rsp] call __mulx_384 mov rsi,QWORD PTR[rsp] mov rbx,QWORD PTR[8+rsp] lea rdi,QWORD PTR[((-96))+rdi] ifdef __SGX_LVI_HARDENING__ lfence endif call __mulx_384 lea rsi,QWORD PTR[((48+128))+rsi] lea rbx,QWORD PTR[48+rbx] lea rdi,QWORD PTR[32+rsp] call __mulx_384 mov rsi,QWORD PTR[16+rsp] lea rdx,QWORD PTR[32+rsp] mov rcx,QWORD PTR[24+rsp] mov rdi,rsi ifdef __SGX_LVI_HARDENING__ lfence endif call __subx_mod_384x384 lea rsi,QWORD PTR[rdi] lea rdx,QWORD PTR[((-96))+rdi] call __subx_mod_384x384 lea rsi,QWORD PTR[((-96))+rdi] lea rdx,QWORD PTR[32+rsp] lea rdi,QWORD PTR[((-96))+rdi] call __subx_mod_384x384 lea r8,QWORD PTR[136+rsp] mov r15,QWORD PTR[r8] mov r14,QWORD PTR[8+r8] mov r13,QWORD PTR[16+r8] mov r12,QWORD PTR[24+r8] mov rbx,QWORD PTR[32+r8] mov rbp,QWORD PTR[40+r8] lea rsp,QWORD PTR[48+r8] $L$SEH_epilogue_mulx_382x:: mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue mov rsi,QWORD PTR[16+rsp] ifdef __SGX_LVI_HARDENING__ pop rdx lfence jmp rdx ud2 else DB 0F3h,0C3h endif $L$SEH_end_mulx_382x:: mulx_382x ENDP PUBLIC sqrx_382x ALIGN 32 sqrx_382x PROC PUBLIC DB 243,15,30,250 mov QWORD PTR[8+rsp],rdi ;WIN64 prologue mov QWORD PTR[16+rsp],rsi mov r11,rsp $L$SEH_begin_sqrx_382x:: mov rdi,rcx mov rsi,rdx mov rdx,r8 sqr_382x$1:: push rbp push rbx push r12 push r13 push r14 push r15 push rsi $L$SEH_body_sqrx_382x:: mov rcx,rdx ifdef __SGX_LVI_HARDENING__ lfence endif mov r14,QWORD PTR[rsi] mov r15,QWORD PTR[8+rsi] mov rax,QWORD PTR[16+rsi] mov rbx,QWORD PTR[24+rsi] mov rbp,QWORD PTR[32+rsi] mov rdx,QWORD PTR[40+rsi] mov r8,r14 add r14,QWORD PTR[48+rsi] mov r9,r15 adc r15,QWORD PTR[56+rsi] mov r10,rax adc rax,QWORD PTR[64+rsi] mov r11,rbx adc rbx,QWORD PTR[72+rsi] mov r12,rbp adc rbp,QWORD PTR[80+rsi] mov r13,rdx adc rdx,QWORD PTR[88+rsi] mov QWORD PTR[rdi],r14 mov QWORD PTR[8+rdi],r15 mov QWORD PTR[16+rdi],rax mov QWORD PTR[24+rdi],rbx mov QWORD PTR[32+rdi],rbp mov QWORD PTR[40+rdi],rdx lea rdx,QWORD PTR[48+rsi] lea rdi,QWORD PTR[48+rdi] call __subx_mod_384_a_is_loaded lea rsi,QWORD PTR[rdi] lea rbx,QWORD PTR[((-48))+rdi] lea rdi,QWORD PTR[((-48))+rdi] call __mulx_384 mov rsi,QWORD PTR[rsp] lea rbx,QWORD PTR[48+rsi] lea rdi,QWORD PTR[96+rdi] ifdef __SGX_LVI_HARDENING__ lfence endif call __mulx_384 mov r8,QWORD PTR[rdi] mov r9,QWORD PTR[8+rdi] mov r10,QWORD PTR[16+rdi] mov r11,QWORD PTR[24+rdi] mov r12,QWORD PTR[32+rdi] mov r13,QWORD PTR[40+rdi] mov r14,QWORD PTR[48+rdi] mov r15,QWORD PTR[56+rdi] mov rax,QWORD PTR[64+rdi] mov rbx,QWORD PTR[72+rdi] mov rbp,QWORD PTR[80+rdi] add r8,r8 mov rdx,QWORD PTR[88+rdi] adc r9,r9 mov QWORD PTR[rdi],r8 adc r10,r10 mov QWORD PTR[8+rdi],r9 adc 
r11,r11 mov QWORD PTR[16+rdi],r10 adc r12,r12 mov QWORD PTR[24+rdi],r11 adc r13,r13 mov QWORD PTR[32+rdi],r12 adc r14,r14 mov QWORD PTR[40+rdi],r13 adc r15,r15 mov QWORD PTR[48+rdi],r14 adc rax,rax mov QWORD PTR[56+rdi],r15 adc rbx,rbx mov QWORD PTR[64+rdi],rax adc rbp,rbp mov QWORD PTR[72+rdi],rbx adc rdx,rdx mov QWORD PTR[80+rdi],rbp mov QWORD PTR[88+rdi],rdx mov r15,QWORD PTR[8+rsp] mov r14,QWORD PTR[16+rsp] mov r13,QWORD PTR[24+rsp] mov r12,QWORD PTR[32+rsp] mov rbx,QWORD PTR[40+rsp] mov rbp,QWORD PTR[48+rsp] lea rsp,QWORD PTR[56+rsp] $L$SEH_epilogue_sqrx_382x:: mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue mov rsi,QWORD PTR[16+rsp] ifdef __SGX_LVI_HARDENING__ pop rdx lfence jmp rdx ud2 else DB 0F3h,0C3h endif $L$SEH_end_sqrx_382x:: sqrx_382x ENDP PUBLIC mulx_384 ALIGN 32 mulx_384 PROC PUBLIC DB 243,15,30,250 mov QWORD PTR[8+rsp],rdi ;WIN64 prologue mov QWORD PTR[16+rsp],rsi mov r11,rsp $L$SEH_begin_mulx_384:: mov rdi,rcx mov rsi,rdx mov rdx,r8 mul_384$1:: push rbp push rbx push r12 push r13 push r14 push r15 $L$SEH_body_mulx_384:: mov rbx,rdx ifdef __SGX_LVI_HARDENING__ lfence endif call __mulx_384 mov r15,QWORD PTR[rsp] mov r14,QWORD PTR[8+rsp] mov r13,QWORD PTR[16+rsp] mov r12,QWORD PTR[24+rsp] mov rbx,QWORD PTR[32+rsp] mov rbp,QWORD PTR[40+rsp] lea rsp,QWORD PTR[48+rsp] $L$SEH_epilogue_mulx_384:: mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue mov rsi,QWORD PTR[16+rsp] ifdef __SGX_LVI_HARDENING__ pop rdx lfence jmp rdx ud2 else DB 0F3h,0C3h endif $L$SEH_end_mulx_384:: mulx_384 ENDP ALIGN 32 __mulx_384 PROC PRIVATE DB 243,15,30,250 mov rdx,QWORD PTR[rbx] mov r14,QWORD PTR[rsi] mov r15,QWORD PTR[8+rsi] mov r10,QWORD PTR[16+rsi] mov r11,QWORD PTR[24+rsi] mov r12,QWORD PTR[32+rsi] mov r13,QWORD PTR[40+rsi] lea rsi,QWORD PTR[((-128))+rsi] mulx rcx,r9,r14 xor rbp,rbp mulx rax,r8,r15 adcx r8,rcx mov QWORD PTR[rdi],r9 mulx rcx,r9,r10 adcx r9,rax mulx rax,r10,r11 adcx r10,rcx mulx rcx,r11,r12 adcx r11,rax mulx r13,r12,r13 mov rdx,QWORD PTR[8+rbx] adcx r12,rcx adcx r13,rbp mulx rcx,rax,r14 adcx rax,r8 adox r9,rcx mov QWORD PTR[8+rdi],rax mulx rcx,r8,r15 adcx r8,r9 adox r10,rcx mulx rax,r9,QWORD PTR[((128+16))+rsi] adcx r9,r10 adox r11,rax mulx rcx,r10,QWORD PTR[((128+24))+rsi] adcx r10,r11 adox r12,rcx mulx rax,r11,QWORD PTR[((128+32))+rsi] adcx r11,r12 adox rax,r13 mulx r13,r12,QWORD PTR[((128+40))+rsi] mov rdx,QWORD PTR[16+rbx] adcx r12,rax adox r13,rbp adcx r13,rbp mulx rcx,rax,r14 adcx rax,r8 adox r9,rcx mov QWORD PTR[16+rdi],rax mulx rcx,r8,r15 adcx r8,r9 adox r10,rcx mulx rax,r9,QWORD PTR[((128+16))+rsi] adcx r9,r10 adox r11,rax mulx rcx,r10,QWORD PTR[((128+24))+rsi] adcx r10,r11 adox r12,rcx mulx rax,r11,QWORD PTR[((128+32))+rsi] adcx r11,r12 adox rax,r13 mulx r13,r12,QWORD PTR[((128+40))+rsi] mov rdx,QWORD PTR[24+rbx] adcx r12,rax adox r13,rbp adcx r13,rbp mulx rcx,rax,r14 adcx rax,r8 adox r9,rcx mov QWORD PTR[24+rdi],rax mulx rcx,r8,r15 adcx r8,r9 adox r10,rcx mulx rax,r9,QWORD PTR[((128+16))+rsi] adcx r9,r10 adox r11,rax mulx rcx,r10,QWORD PTR[((128+24))+rsi] adcx r10,r11 adox r12,rcx mulx rax,r11,QWORD PTR[((128+32))+rsi] adcx r11,r12 adox rax,r13 mulx r13,r12,QWORD PTR[((128+40))+rsi] mov rdx,QWORD PTR[32+rbx] adcx r12,rax adox r13,rbp adcx r13,rbp mulx rcx,rax,r14 adcx rax,r8 adox r9,rcx mov QWORD PTR[32+rdi],rax mulx rcx,r8,r15 adcx r8,r9 adox r10,rcx mulx rax,r9,QWORD PTR[((128+16))+rsi] adcx r9,r10 adox r11,rax mulx rcx,r10,QWORD PTR[((128+24))+rsi] adcx r10,r11 adox r12,rcx mulx rax,r11,QWORD PTR[((128+32))+rsi] adcx r11,r12 adox rax,r13 mulx r13,r12,QWORD PTR[((128+40))+rsi] mov 
rdx,QWORD PTR[40+rbx] adcx r12,rax adox r13,rbp adcx r13,rbp mulx rcx,rax,r14 adcx rax,r8 adox r9,rcx mov QWORD PTR[40+rdi],rax mulx rcx,r8,r15 adcx r8,r9 adox r10,rcx mulx rax,r9,QWORD PTR[((128+16))+rsi] adcx r9,r10 adox r11,rax mulx rcx,r10,QWORD PTR[((128+24))+rsi] adcx r10,r11 adox r12,rcx mulx rax,r11,QWORD PTR[((128+32))+rsi] adcx r11,r12 adox rax,r13 mulx r13,r12,QWORD PTR[((128+40))+rsi] mov rdx,rax adcx r12,rax adox r13,rbp adcx r13,rbp mov QWORD PTR[48+rdi],r8 mov QWORD PTR[56+rdi],r9 mov QWORD PTR[64+rdi],r10 mov QWORD PTR[72+rdi],r11 mov QWORD PTR[80+rdi],r12 mov QWORD PTR[88+rdi],r13 ifdef __SGX_LVI_HARDENING__ pop rdx lfence jmp rdx ud2 else DB 0F3h,0C3h endif __mulx_384 ENDP PUBLIC sqrx_384 ALIGN 32 sqrx_384 PROC PUBLIC DB 243,15,30,250 mov QWORD PTR[8+rsp],rdi ;WIN64 prologue mov QWORD PTR[16+rsp],rsi mov r11,rsp $L$SEH_begin_sqrx_384:: mov rdi,rcx mov rsi,rdx sqr_384$1:: push rbp push rbx push r12 push r13 push r14 push r15 push rdi $L$SEH_body_sqrx_384:: ifdef __SGX_LVI_HARDENING__ lfence endif call __sqrx_384 mov r15,QWORD PTR[8+rsp] mov r14,QWORD PTR[16+rsp] mov r13,QWORD PTR[24+rsp] mov r12,QWORD PTR[32+rsp] mov rbx,QWORD PTR[40+rsp] mov rbp,QWORD PTR[48+rsp] lea rsp,QWORD PTR[56+rsp] $L$SEH_epilogue_sqrx_384:: mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue mov rsi,QWORD PTR[16+rsp] ifdef __SGX_LVI_HARDENING__ pop rdx lfence jmp rdx ud2 else DB 0F3h,0C3h endif $L$SEH_end_sqrx_384:: sqrx_384 ENDP ALIGN 32 __sqrx_384 PROC PRIVATE DB 243,15,30,250 mov rdx,QWORD PTR[rsi] mov r14,QWORD PTR[8+rsi] mov r15,QWORD PTR[16+rsi] mov rcx,QWORD PTR[24+rsi] mov rbx,QWORD PTR[32+rsi] mulx rdi,r8,r14 mov rbp,QWORD PTR[40+rsi] mulx rax,r9,r15 add r9,rdi mulx rdi,r10,rcx adc r10,rax mulx rax,r11,rbx adc r11,rdi mulx r13,r12,rbp mov rdx,r14 adc r12,rax adc r13,0 xor r14,r14 mulx rax,rdi,r15 adcx r10,rdi adox r11,rax mulx rax,rdi,rcx adcx r11,rdi adox r12,rax mulx rax,rdi,rbx adcx r12,rdi adox r13,rax mulx rax,rdi,rbp mov rdx,r15 adcx r13,rdi adox rax,r14 adcx r14,rax xor r15,r15 mulx rax,rdi,rcx adcx r12,rdi adox r13,rax mulx rax,rdi,rbx adcx r13,rdi adox r14,rax mulx rax,rdi,rbp mov rdx,rcx adcx r14,rdi adox rax,r15 adcx r15,rax xor rcx,rcx mulx rax,rdi,rbx adcx r14,rdi adox r15,rax mulx rax,rdi,rbp mov rdx,rbx adcx r15,rdi adox rax,rcx adcx rcx,rax mulx rbx,rdi,rbp mov rdx,QWORD PTR[rsi] add rcx,rdi mov rdi,QWORD PTR[8+rsp] adc rbx,0 xor rbp,rbp adcx r8,r8 adcx r9,r9 adcx r10,r10 adcx r11,r11 adcx r12,r12 mulx rax,rdx,rdx mov QWORD PTR[rdi],rdx mov rdx,QWORD PTR[8+rsi] adox r8,rax mov QWORD PTR[8+rdi],r8 mulx rax,r8,rdx mov rdx,QWORD PTR[16+rsi] adox r9,r8 adox r10,rax mov QWORD PTR[16+rdi],r9 mov QWORD PTR[24+rdi],r10 mulx r9,r8,rdx mov rdx,QWORD PTR[24+rsi] adox r11,r8 adox r12,r9 adcx r13,r13 adcx r14,r14 mov QWORD PTR[32+rdi],r11 mov QWORD PTR[40+rdi],r12 mulx r9,r8,rdx mov rdx,QWORD PTR[32+rsi] adox r13,r8 adox r14,r9 adcx r15,r15 adcx rcx,rcx mov QWORD PTR[48+rdi],r13 mov QWORD PTR[56+rdi],r14 mulx r9,r8,rdx mov rdx,QWORD PTR[40+rsi] adox r15,r8 adox rcx,r9 adcx rbx,rbx adcx rbp,rbp mov QWORD PTR[64+rdi],r15 mov QWORD PTR[72+rdi],rcx mulx r9,r8,rdx adox rbx,r8 adox rbp,r9 mov QWORD PTR[80+rdi],rbx mov QWORD PTR[88+rdi],rbp ifdef __SGX_LVI_HARDENING__ pop rdx lfence jmp rdx ud2 else DB 0F3h,0C3h endif __sqrx_384 ENDP PUBLIC redcx_mont_384 ALIGN 32 redcx_mont_384 PROC PUBLIC DB 243,15,30,250 mov QWORD PTR[8+rsp],rdi ;WIN64 prologue mov QWORD PTR[16+rsp],rsi mov r11,rsp $L$SEH_begin_redcx_mont_384:: mov rdi,rcx mov rsi,rdx mov rdx,r8 mov rcx,r9 redc_mont_384$1:: push rbp push rbx push r12 
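; Common WIN64 prologue shape: rdi/rsi are non-volatile on Windows, so they
; are stashed in the caller-provided home space at [8+rsp]/[16+rsp]; the
; Windows arguments rcx/rdx/r8/r9 are shuffled into the System V registers
; rdi/rsi/rdx/rcx expected by the shared function bodies; and the remaining
; non-volatile registers rbp/rbx/r12-r15 are pushed so the SEH records can
; describe one uniform frame layout.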
push r13 push r14 push r15 sub rsp,8 $L$SEH_body_redcx_mont_384:: mov rbx,rdx ifdef __SGX_LVI_HARDENING__ lfence endif call __mulx_by_1_mont_384 call __redx_tail_mont_384 mov r15,QWORD PTR[8+rsp] mov r14,QWORD PTR[16+rsp] mov r13,QWORD PTR[24+rsp] mov r12,QWORD PTR[32+rsp] mov rbx,QWORD PTR[40+rsp] mov rbp,QWORD PTR[48+rsp] lea rsp,QWORD PTR[56+rsp] $L$SEH_epilogue_redcx_mont_384:: mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue mov rsi,QWORD PTR[16+rsp] ifdef __SGX_LVI_HARDENING__ pop rdx lfence jmp rdx ud2 else DB 0F3h,0C3h endif $L$SEH_end_redcx_mont_384:: redcx_mont_384 ENDP PUBLIC fromx_mont_384 ALIGN 32 fromx_mont_384 PROC PUBLIC DB 243,15,30,250 mov QWORD PTR[8+rsp],rdi ;WIN64 prologue mov QWORD PTR[16+rsp],rsi mov r11,rsp $L$SEH_begin_fromx_mont_384:: mov rdi,rcx mov rsi,rdx mov rdx,r8 mov rcx,r9 from_mont_384$1:: push rbp push rbx push r12 push r13 push r14 push r15 sub rsp,8 $L$SEH_body_fromx_mont_384:: mov rbx,rdx ifdef __SGX_LVI_HARDENING__ lfence endif call __mulx_by_1_mont_384 mov rax,r14 mov rcx,r15 mov rdx,r8 mov rbp,r9 sub r14,QWORD PTR[rbx] sbb r15,QWORD PTR[8+rbx] mov r13,r10 sbb r8,QWORD PTR[16+rbx] sbb r9,QWORD PTR[24+rbx] sbb r10,QWORD PTR[32+rbx] mov rsi,r11 sbb r11,QWORD PTR[40+rbx] cmovc r14,rax cmovc r15,rcx cmovc r8,rdx mov QWORD PTR[rdi],r14 cmovc r9,rbp mov QWORD PTR[8+rdi],r15 cmovc r10,r13 mov QWORD PTR[16+rdi],r8 cmovc r11,rsi mov QWORD PTR[24+rdi],r9 mov QWORD PTR[32+rdi],r10 mov QWORD PTR[40+rdi],r11 mov r15,QWORD PTR[8+rsp] mov r14,QWORD PTR[16+rsp] mov r13,QWORD PTR[24+rsp] mov r12,QWORD PTR[32+rsp] mov rbx,QWORD PTR[40+rsp] mov rbp,QWORD PTR[48+rsp] lea rsp,QWORD PTR[56+rsp] $L$SEH_epilogue_fromx_mont_384:: mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue mov rsi,QWORD PTR[16+rsp] ifdef __SGX_LVI_HARDENING__ pop rdx lfence jmp rdx ud2 else DB 0F3h,0C3h endif $L$SEH_end_fromx_mont_384:: fromx_mont_384 ENDP ALIGN 32 __mulx_by_1_mont_384 PROC PRIVATE DB 243,15,30,250 mov r8,QWORD PTR[rsi] mov rdx,rcx mov r9,QWORD PTR[8+rsi] mov r10,QWORD PTR[16+rsi] mov r11,QWORD PTR[24+rsi] mov r12,QWORD PTR[32+rsi] mov r13,QWORD PTR[40+rsi] imul rdx,r8 xor r14,r14 mulx rbp,rax,QWORD PTR[rbx] adcx r8,rax adox r9,rbp mulx rbp,rax,QWORD PTR[8+rbx] adcx r9,rax adox r10,rbp mulx rbp,rax,QWORD PTR[16+rbx] adcx r10,rax adox r11,rbp mulx rbp,rax,QWORD PTR[24+rbx] adcx r11,rax adox r12,rbp mulx rbp,rax,QWORD PTR[32+rbx] adcx r12,rax adox r13,rbp mulx rbp,rax,QWORD PTR[40+rbx] mov rdx,rcx adcx r13,rax adox rbp,r14 adcx r14,rbp imul rdx,r9 xor r15,r15 mulx rbp,rax,QWORD PTR[rbx] adcx r9,rax adox r10,rbp mulx rbp,rax,QWORD PTR[8+rbx] adcx r10,rax adox r11,rbp mulx rbp,rax,QWORD PTR[16+rbx] adcx r11,rax adox r12,rbp mulx rbp,rax,QWORD PTR[24+rbx] adcx r12,rax adox r13,rbp mulx rbp,rax,QWORD PTR[32+rbx] adcx r13,rax adox r14,rbp mulx rbp,rax,QWORD PTR[40+rbx] mov rdx,rcx adcx r14,rax adox rbp,r15 adcx r15,rbp imul rdx,r10 xor r8,r8 mulx rbp,rax,QWORD PTR[rbx] adcx r10,rax adox r11,rbp mulx rbp,rax,QWORD PTR[8+rbx] adcx r11,rax adox r12,rbp mulx rbp,rax,QWORD PTR[16+rbx] adcx r12,rax adox r13,rbp mulx rbp,rax,QWORD PTR[24+rbx] adcx r13,rax adox r14,rbp mulx rbp,rax,QWORD PTR[32+rbx] adcx r14,rax adox r15,rbp mulx rbp,rax,QWORD PTR[40+rbx] mov rdx,rcx adcx r15,rax adox rbp,r8 adcx r8,rbp imul rdx,r11 xor r9,r9 mulx rbp,rax,QWORD PTR[rbx] adcx r11,rax adox r12,rbp mulx rbp,rax,QWORD PTR[8+rbx] adcx r12,rax adox r13,rbp mulx rbp,rax,QWORD PTR[16+rbx] adcx r13,rax adox r14,rbp mulx rbp,rax,QWORD PTR[24+rbx] adcx r14,rax adox r15,rbp mulx rbp,rax,QWORD PTR[32+rbx] adcx r15,rax adox r8,rbp mulx 
rbp,rax,QWORD PTR[40+rbx] mov rdx,rcx adcx r8,rax adox rbp,r9 adcx r9,rbp imul rdx,r12 xor r10,r10 mulx rbp,rax,QWORD PTR[rbx] adcx r12,rax adox r13,rbp mulx rbp,rax,QWORD PTR[8+rbx] adcx r13,rax adox r14,rbp mulx rbp,rax,QWORD PTR[16+rbx] adcx r14,rax adox r15,rbp mulx rbp,rax,QWORD PTR[24+rbx] adcx r15,rax adox r8,rbp mulx rbp,rax,QWORD PTR[32+rbx] adcx r8,rax adox r9,rbp mulx rbp,rax,QWORD PTR[40+rbx] mov rdx,rcx adcx r9,rax adox rbp,r10 adcx r10,rbp imul rdx,r13 xor r11,r11 mulx rbp,rax,QWORD PTR[rbx] adcx r13,rax adox r14,rbp mulx rbp,rax,QWORD PTR[8+rbx] adcx r14,rax adox r15,rbp mulx rbp,rax,QWORD PTR[16+rbx] adcx r15,rax adox r8,rbp mulx rbp,rax,QWORD PTR[24+rbx] adcx r8,rax adox r9,rbp mulx rbp,rax,QWORD PTR[32+rbx] adcx r9,rax adox r10,rbp mulx rbp,rax,QWORD PTR[40+rbx] mov rdx,rcx adcx r10,rax adox rbp,r11 adcx r11,rbp ifdef __SGX_LVI_HARDENING__ pop rdx lfence jmp rdx ud2 else DB 0F3h,0C3h endif __mulx_by_1_mont_384 ENDP ALIGN 32 __redx_tail_mont_384 PROC PRIVATE DB 243,15,30,250 add r14,QWORD PTR[48+rsi] mov rax,r14 adc r15,QWORD PTR[56+rsi] adc r8,QWORD PTR[64+rsi] adc r9,QWORD PTR[72+rsi] mov rcx,r15 adc r10,QWORD PTR[80+rsi] adc r11,QWORD PTR[88+rsi] sbb r12,r12 mov rdx,r8 mov rbp,r9 sub r14,QWORD PTR[rbx] sbb r15,QWORD PTR[8+rbx] mov r13,r10 sbb r8,QWORD PTR[16+rbx] sbb r9,QWORD PTR[24+rbx] sbb r10,QWORD PTR[32+rbx] mov rsi,r11 sbb r11,QWORD PTR[40+rbx] sbb r12,0 cmovc r14,rax cmovc r15,rcx cmovc r8,rdx mov QWORD PTR[rdi],r14 cmovc r9,rbp mov QWORD PTR[8+rdi],r15 cmovc r10,r13 mov QWORD PTR[16+rdi],r8 cmovc r11,rsi mov QWORD PTR[24+rdi],r9 mov QWORD PTR[32+rdi],r10 mov QWORD PTR[40+rdi],r11 ifdef __SGX_LVI_HARDENING__ pop rdx lfence jmp rdx ud2 else DB 0F3h,0C3h endif __redx_tail_mont_384 ENDP PUBLIC sgn0x_pty_mont_384 ALIGN 32 sgn0x_pty_mont_384 PROC PUBLIC DB 243,15,30,250 mov QWORD PTR[8+rsp],rdi ;WIN64 prologue mov QWORD PTR[16+rsp],rsi mov r11,rsp $L$SEH_begin_sgn0x_pty_mont_384:: mov rdi,rcx mov rsi,rdx mov rdx,r8 sgn0_pty_mont_384$1:: push rbp push rbx push r12 push r13 push r14 push r15 sub rsp,8 $L$SEH_body_sgn0x_pty_mont_384:: mov rbx,rsi lea rsi,QWORD PTR[rdi] mov rcx,rdx ifdef __SGX_LVI_HARDENING__ lfence endif call __mulx_by_1_mont_384 xor rax,rax mov r13,r14 add r14,r14 adc r15,r15 adc r8,r8 adc r9,r9 adc r10,r10 adc r11,r11 adc rax,0 sub r14,QWORD PTR[rbx] sbb r15,QWORD PTR[8+rbx] sbb r8,QWORD PTR[16+rbx] sbb r9,QWORD PTR[24+rbx] sbb r10,QWORD PTR[32+rbx] sbb r11,QWORD PTR[40+rbx] sbb rax,0 not rax and r13,1 and rax,2 or rax,r13 mov r15,QWORD PTR[8+rsp] mov r14,QWORD PTR[16+rsp] mov r13,QWORD PTR[24+rsp] mov r12,QWORD PTR[32+rsp] mov rbx,QWORD PTR[40+rsp] mov rbp,QWORD PTR[48+rsp] lea rsp,QWORD PTR[56+rsp] $L$SEH_epilogue_sgn0x_pty_mont_384:: mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue mov rsi,QWORD PTR[16+rsp] ifdef __SGX_LVI_HARDENING__ pop rdx lfence jmp rdx ud2 else DB 0F3h,0C3h endif $L$SEH_end_sgn0x_pty_mont_384:: sgn0x_pty_mont_384 ENDP PUBLIC sgn0x_pty_mont_384x ALIGN 32 sgn0x_pty_mont_384x PROC PUBLIC DB 243,15,30,250 mov QWORD PTR[8+rsp],rdi ;WIN64 prologue mov QWORD PTR[16+rsp],rsi mov r11,rsp $L$SEH_begin_sgn0x_pty_mont_384x:: mov rdi,rcx mov rsi,rdx mov rdx,r8 sgn0_pty_mont_384x$1:: push rbp push rbx push r12 push r13 push r14 push r15 sub rsp,8 $L$SEH_body_sgn0x_pty_mont_384x:: mov rbx,rsi lea rsi,QWORD PTR[48+rdi] mov rcx,rdx ifdef __SGX_LVI_HARDENING__ lfence endif call __mulx_by_1_mont_384 mov r12,r14 or r14,r15 or r14,r8 or r14,r9 or r14,r10 or r14,r11 lea rsi,QWORD PTR[rdi] xor rdi,rdi mov r13,r12 add r12,r12 adc r15,r15 adc r8,r8 adc r9,r9 
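; sgn0/parity, in outline: once __mulx_by_1_mont_384 has taken the input out
; of Montgomery form, bit 0 of the least significant limb is the parity, and
; the double-then-subtract-p chain here decides the "sign": no final borrow
; means 2*x >= p, i.e. x > (p-1)/2, which the not/and-2 sequence turns into
; bit 1. For this _384x variant the trailing cmovz/cmovnz pair merges the
; flags of the two field components.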
adc r10,r10 adc r11,r11 adc rdi,0 sub r12,QWORD PTR[rbx] sbb r15,QWORD PTR[8+rbx] sbb r8,QWORD PTR[16+rbx] sbb r9,QWORD PTR[24+rbx] sbb r10,QWORD PTR[32+rbx] sbb r11,QWORD PTR[40+rbx] sbb rdi,0 mov QWORD PTR[rsp],r14 not rdi and r13,1 and rdi,2 or rdi,r13 call __mulx_by_1_mont_384 mov r12,r14 or r14,r15 or r14,r8 or r14,r9 or r14,r10 or r14,r11 xor rax,rax mov r13,r12 add r12,r12 adc r15,r15 adc r8,r8 adc r9,r9 adc r10,r10 adc r11,r11 adc rax,0 sub r12,QWORD PTR[rbx] sbb r15,QWORD PTR[8+rbx] sbb r8,QWORD PTR[16+rbx] sbb r9,QWORD PTR[24+rbx] sbb r10,QWORD PTR[32+rbx] sbb r11,QWORD PTR[40+rbx] sbb rax,0 mov r12,QWORD PTR[rsp] not rax test r14,r14 cmovz r13,rdi test r12,r12 cmovnz rax,rdi and r13,1 and rax,2 or rax,r13 mov r15,QWORD PTR[8+rsp] mov r14,QWORD PTR[16+rsp] mov r13,QWORD PTR[24+rsp] mov r12,QWORD PTR[32+rsp] mov rbx,QWORD PTR[40+rsp] mov rbp,QWORD PTR[48+rsp] lea rsp,QWORD PTR[56+rsp] $L$SEH_epilogue_sgn0x_pty_mont_384x:: mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue mov rsi,QWORD PTR[16+rsp] ifdef __SGX_LVI_HARDENING__ pop rdx lfence jmp rdx ud2 else DB 0F3h,0C3h endif $L$SEH_end_sgn0x_pty_mont_384x:: sgn0x_pty_mont_384x ENDP PUBLIC mulx_mont_384 ALIGN 32 mulx_mont_384 PROC PUBLIC DB 243,15,30,250 mov QWORD PTR[8+rsp],rdi ;WIN64 prologue mov QWORD PTR[16+rsp],rsi mov r11,rsp $L$SEH_begin_mulx_mont_384:: mov rdi,rcx mov rsi,rdx mov rdx,r8 mov rcx,r9 mov r8,QWORD PTR[40+rsp] mul_mont_384$1:: push rbp push rbx push r12 push r13 push r14 push r15 lea rsp,QWORD PTR[((-24))+rsp] $L$SEH_body_mulx_mont_384:: mov rbx,rdx ifdef __SGX_LVI_HARDENING__ lfence endif mov rdx,QWORD PTR[rdx] mov r14,QWORD PTR[rsi] mov r15,QWORD PTR[8+rsi] mov rax,QWORD PTR[16+rsi] mov r12,QWORD PTR[24+rsi] mov QWORD PTR[16+rsp],rdi mov rdi,QWORD PTR[32+rsi] mov rbp,QWORD PTR[40+rsi] lea rsi,QWORD PTR[((-128))+rsi] lea rcx,QWORD PTR[((-128))+rcx] mov QWORD PTR[rsp],r8 mulx r9,r8,r14 call __mulx_mont_384 mov r15,QWORD PTR[24+rsp] mov r14,QWORD PTR[32+rsp] mov r13,QWORD PTR[40+rsp] mov r12,QWORD PTR[48+rsp] mov rbx,QWORD PTR[56+rsp] mov rbp,QWORD PTR[64+rsp] lea rsp,QWORD PTR[72+rsp] $L$SEH_epilogue_mulx_mont_384:: mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue mov rsi,QWORD PTR[16+rsp] ifdef __SGX_LVI_HARDENING__ pop rdx lfence jmp rdx ud2 else DB 0F3h,0C3h endif $L$SEH_end_mulx_mont_384:: mulx_mont_384 ENDP ALIGN 32 __mulx_mont_384 PROC PRIVATE DB 243,15,30,250 mulx r10,r14,r15 mulx r11,r15,rax add r9,r14 mulx r12,rax,r12 adc r10,r15 mulx r13,rdi,rdi adc r11,rax mulx r14,rbp,rbp mov rdx,QWORD PTR[8+rbx] adc r12,rdi adc r13,rbp adc r14,0 xor r15,r15 mov QWORD PTR[16+rsp],r8 imul r8,QWORD PTR[8+rsp] xor rax,rax mulx rbp,rdi,QWORD PTR[((0+128))+rsi] adox r9,rdi adcx r10,rbp mulx rbp,rdi,QWORD PTR[((8+128))+rsi] adox r10,rdi adcx r11,rbp mulx rbp,rdi,QWORD PTR[((16+128))+rsi] adox r11,rdi adcx r12,rbp mulx rbp,rdi,QWORD PTR[((24+128))+rsi] adox r12,rdi adcx r13,rbp mulx rbp,rdi,QWORD PTR[((32+128))+rsi] adox r13,rdi adcx r14,rbp mulx rbp,rdi,QWORD PTR[((40+128))+rsi] mov rdx,r8 adox r14,rdi adcx r15,rbp adox r15,rax adox rax,rax xor r8,r8 mulx rbp,rdi,QWORD PTR[((0+128))+rcx] adcx rdi,QWORD PTR[16+rsp] adox r9,rbp mulx rbp,rdi,QWORD PTR[((8+128))+rcx] adcx r9,rdi adox r10,rbp mulx rbp,rdi,QWORD PTR[((16+128))+rcx] adcx r10,rdi adox r11,rbp mulx rbp,rdi,QWORD PTR[((24+128))+rcx] adcx r11,rdi adox r12,rbp mulx rbp,rdi,QWORD PTR[((32+128))+rcx] adcx r12,rdi adox r13,rbp mulx rbp,rdi,QWORD PTR[((40+128))+rcx] mov rdx,QWORD PTR[16+rbx] adcx r13,rdi adox r14,rbp adcx r14,r8 adox r15,r8 adcx r15,r8 adox rax,r8 adcx rax,r8 mov QWORD 
PTR[16+rsp],r9 imul r9,QWORD PTR[8+rsp] xor r8,r8 mulx rbp,rdi,QWORD PTR[((0+128))+rsi] adox r10,rdi adcx r11,rbp mulx rbp,rdi,QWORD PTR[((8+128))+rsi] adox r11,rdi adcx r12,rbp mulx rbp,rdi,QWORD PTR[((16+128))+rsi] adox r12,rdi adcx r13,rbp mulx rbp,rdi,QWORD PTR[((24+128))+rsi] adox r13,rdi adcx r14,rbp mulx rbp,rdi,QWORD PTR[((32+128))+rsi] adox r14,rdi adcx r15,rbp mulx rbp,rdi,QWORD PTR[((40+128))+rsi] mov rdx,r9 adox r15,rdi adcx rax,rbp adox rax,r8 adox r8,r8 xor r9,r9 mulx rbp,rdi,QWORD PTR[((0+128))+rcx] adcx rdi,QWORD PTR[16+rsp] adox r10,rbp mulx rbp,rdi,QWORD PTR[((8+128))+rcx] adcx r10,rdi adox r11,rbp mulx rbp,rdi,QWORD PTR[((16+128))+rcx] adcx r11,rdi adox r12,rbp mulx rbp,rdi,QWORD PTR[((24+128))+rcx] adcx r12,rdi adox r13,rbp mulx rbp,rdi,QWORD PTR[((32+128))+rcx] adcx r13,rdi adox r14,rbp mulx rbp,rdi,QWORD PTR[((40+128))+rcx] mov rdx,QWORD PTR[24+rbx] adcx r14,rdi adox r15,rbp adcx r15,r9 adox rax,r9 adcx rax,r9 adox r8,r9 adcx r8,r9 mov QWORD PTR[16+rsp],r10 imul r10,QWORD PTR[8+rsp] xor r9,r9 mulx rbp,rdi,QWORD PTR[((0+128))+rsi] adox r11,rdi adcx r12,rbp mulx rbp,rdi,QWORD PTR[((8+128))+rsi] adox r12,rdi adcx r13,rbp mulx rbp,rdi,QWORD PTR[((16+128))+rsi] adox r13,rdi adcx r14,rbp mulx rbp,rdi,QWORD PTR[((24+128))+rsi] adox r14,rdi adcx r15,rbp mulx rbp,rdi,QWORD PTR[((32+128))+rsi] adox r15,rdi adcx rax,rbp mulx rbp,rdi,QWORD PTR[((40+128))+rsi] mov rdx,r10 adox rax,rdi adcx r8,rbp adox r8,r9 adox r9,r9 xor r10,r10 mulx rbp,rdi,QWORD PTR[((0+128))+rcx] adcx rdi,QWORD PTR[16+rsp] adox r11,rbp mulx rbp,rdi,QWORD PTR[((8+128))+rcx] adcx r11,rdi adox r12,rbp mulx rbp,rdi,QWORD PTR[((16+128))+rcx] adcx r12,rdi adox r13,rbp mulx rbp,rdi,QWORD PTR[((24+128))+rcx] adcx r13,rdi adox r14,rbp mulx rbp,rdi,QWORD PTR[((32+128))+rcx] adcx r14,rdi adox r15,rbp mulx rbp,rdi,QWORD PTR[((40+128))+rcx] mov rdx,QWORD PTR[32+rbx] adcx r15,rdi adox rax,rbp adcx rax,r10 adox r8,r10 adcx r8,r10 adox r9,r10 adcx r9,r10 mov QWORD PTR[16+rsp],r11 imul r11,QWORD PTR[8+rsp] xor r10,r10 mulx rbp,rdi,QWORD PTR[((0+128))+rsi] adox r12,rdi adcx r13,rbp mulx rbp,rdi,QWORD PTR[((8+128))+rsi] adox r13,rdi adcx r14,rbp mulx rbp,rdi,QWORD PTR[((16+128))+rsi] adox r14,rdi adcx r15,rbp mulx rbp,rdi,QWORD PTR[((24+128))+rsi] adox r15,rdi adcx rax,rbp mulx rbp,rdi,QWORD PTR[((32+128))+rsi] adox rax,rdi adcx r8,rbp mulx rbp,rdi,QWORD PTR[((40+128))+rsi] mov rdx,r11 adox r8,rdi adcx r9,rbp adox r9,r10 adox r10,r10 xor r11,r11 mulx rbp,rdi,QWORD PTR[((0+128))+rcx] adcx rdi,QWORD PTR[16+rsp] adox r12,rbp mulx rbp,rdi,QWORD PTR[((8+128))+rcx] adcx r12,rdi adox r13,rbp mulx rbp,rdi,QWORD PTR[((16+128))+rcx] adcx r13,rdi adox r14,rbp mulx rbp,rdi,QWORD PTR[((24+128))+rcx] adcx r14,rdi adox r15,rbp mulx rbp,rdi,QWORD PTR[((32+128))+rcx] adcx r15,rdi adox rax,rbp mulx rbp,rdi,QWORD PTR[((40+128))+rcx] mov rdx,QWORD PTR[40+rbx] adcx rax,rdi adox r8,rbp adcx r8,r11 adox r9,r11 adcx r9,r11 adox r10,r11 adcx r10,r11 mov QWORD PTR[16+rsp],r12 imul r12,QWORD PTR[8+rsp] xor r11,r11 mulx rbp,rdi,QWORD PTR[((0+128))+rsi] adox r13,rdi adcx r14,rbp mulx rbp,rdi,QWORD PTR[((8+128))+rsi] adox r14,rdi adcx r15,rbp mulx rbp,rdi,QWORD PTR[((16+128))+rsi] adox r15,rdi adcx rax,rbp mulx rbp,rdi,QWORD PTR[((24+128))+rsi] adox rax,rdi adcx r8,rbp mulx rbp,rdi,QWORD PTR[((32+128))+rsi] adox r8,rdi adcx r9,rbp mulx rbp,rdi,QWORD PTR[((40+128))+rsi] mov rdx,r12 adox r9,rdi adcx r10,rbp adox r10,r11 adox r11,r11 xor r12,r12 mulx rbp,rdi,QWORD PTR[((0+128))+rcx] adcx rdi,QWORD PTR[16+rsp] adox r13,rbp mulx rbp,rdi,QWORD PTR[((8+128))+rcx] 
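; main Montgomery multiplication loop: for each word of b (addressed via rbx)
; the a*b[i] partial products are accumulated with mulx/adcx/adox dual carry
; chains, then m*p is folded in, where m = acc[0]*n0 (n0 is stashed at 8(rsp)
; inside this subroutine) and the modulus pointer in rcx is pre-biased by -128
; by the caller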
adcx r13,rdi adox r14,rbp mulx rbp,rdi,QWORD PTR[((16+128))+rcx] adcx r14,rdi adox r15,rbp mulx rbp,rdi,QWORD PTR[((24+128))+rcx] adcx r15,rdi adox rax,rbp mulx rbp,rdi,QWORD PTR[((32+128))+rcx] adcx rax,rdi adox r8,rbp mulx rbp,rdi,QWORD PTR[((40+128))+rcx] mov rdx,r13 adcx r8,rdi adox r9,rbp adcx r9,r12 adox r10,r12 adcx r10,r12 adox r11,r12 adcx r11,r12 imul rdx,QWORD PTR[8+rsp] mov rbx,QWORD PTR[24+rsp] xor r12,r12 mulx rbp,rdi,QWORD PTR[((0+128))+rcx] adcx r13,rdi adox r14,rbp mulx rbp,rdi,QWORD PTR[((8+128))+rcx] adcx r14,rdi adox r15,rbp mulx rbp,rdi,QWORD PTR[((16+128))+rcx] adcx r15,rdi adox rax,rbp mulx rbp,rdi,QWORD PTR[((24+128))+rcx] adcx rax,rdi adox r8,rbp mov r13,r15 mulx rbp,rdi,QWORD PTR[((32+128))+rcx] adcx r8,rdi adox r9,rbp mov rsi,rax mulx rbp,rdi,QWORD PTR[((40+128))+rcx] adcx r9,rdi adox r10,rbp mov rdx,r14 adcx r10,r12 adox r11,r12 lea rcx,QWORD PTR[128+rcx] mov r12,r8 adc r11,0 sub r14,QWORD PTR[rcx] sbb r15,QWORD PTR[8+rcx] mov rdi,r9 sbb rax,QWORD PTR[16+rcx] sbb r8,QWORD PTR[24+rcx] sbb r9,QWORD PTR[32+rcx] mov rbp,r10 sbb r10,QWORD PTR[40+rcx] sbb r11,0 cmovnc rdx,r14 cmovc r15,r13 cmovc rax,rsi cmovnc r12,r8 mov QWORD PTR[rbx],rdx cmovnc rdi,r9 mov QWORD PTR[8+rbx],r15 cmovnc rbp,r10 mov QWORD PTR[16+rbx],rax mov QWORD PTR[24+rbx],r12 mov QWORD PTR[32+rbx],rdi mov QWORD PTR[40+rbx],rbp ifdef __SGX_LVI_HARDENING__ pop rsi lfence jmp rsi ud2 else DB 0F3h,0C3h endif __mulx_mont_384 ENDP PUBLIC sqrx_mont_384 ALIGN 32 sqrx_mont_384 PROC PUBLIC DB 243,15,30,250 mov QWORD PTR[8+rsp],rdi ;WIN64 prologue mov QWORD PTR[16+rsp],rsi mov r11,rsp $L$SEH_begin_sqrx_mont_384:: mov rdi,rcx mov rsi,rdx mov rdx,r8 mov rcx,r9 sqr_mont_384$1:: push rbp push rbx push r12 push r13 push r14 push r15 lea rsp,QWORD PTR[((-24))+rsp] $L$SEH_body_sqrx_mont_384:: mov r8,rcx lea rcx,QWORD PTR[((-128))+rdx] ifdef __SGX_LVI_HARDENING__ lfence endif mov rdx,QWORD PTR[rsi] mov r15,QWORD PTR[8+rsi] mov rax,QWORD PTR[16+rsi] mov r12,QWORD PTR[24+rsi] mov QWORD PTR[16+rsp],rdi mov rdi,QWORD PTR[32+rsi] mov rbp,QWORD PTR[40+rsi] lea rbx,QWORD PTR[rsi] mov QWORD PTR[rsp],r8 lea rsi,QWORD PTR[((-128))+rsi] mulx r9,r8,rdx call __mulx_mont_384 mov r15,QWORD PTR[24+rsp] mov r14,QWORD PTR[32+rsp] mov r13,QWORD PTR[40+rsp] mov r12,QWORD PTR[48+rsp] mov rbx,QWORD PTR[56+rsp] mov rbp,QWORD PTR[64+rsp] lea rsp,QWORD PTR[72+rsp] $L$SEH_epilogue_sqrx_mont_384:: mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue mov rsi,QWORD PTR[16+rsp] ifdef __SGX_LVI_HARDENING__ pop rdx lfence jmp rdx ud2 else DB 0F3h,0C3h endif $L$SEH_end_sqrx_mont_384:: sqrx_mont_384 ENDP PUBLIC sqrx_n_mul_mont_384 ALIGN 32 sqrx_n_mul_mont_384 PROC PUBLIC DB 243,15,30,250 mov QWORD PTR[8+rsp],rdi ;WIN64 prologue mov QWORD PTR[16+rsp],rsi mov r11,rsp $L$SEH_begin_sqrx_n_mul_mont_384:: mov rdi,rcx mov rsi,rdx mov rdx,r8 mov rcx,r9 mov r8,QWORD PTR[40+rsp] mov r9,QWORD PTR[48+rsp] sqr_n_mul_mont_384$1:: push rbp push rbx push r12 push r13 push r14 push r15 lea rsp,QWORD PTR[((-40))+rsp] $L$SEH_body_sqrx_n_mul_mont_384:: mov r10,rdx ifdef __SGX_LVI_HARDENING__ lfence endif mov rdx,QWORD PTR[rsi] mov r15,QWORD PTR[8+rsi] mov rax,QWORD PTR[16+rsi] mov rbx,rsi mov r12,QWORD PTR[24+rsi] mov QWORD PTR[16+rsp],rdi mov rdi,QWORD PTR[32+rsi] mov rbp,QWORD PTR[40+rsi] mov QWORD PTR[rsp],r8 mov QWORD PTR[24+rsp],r9 movq xmm2,QWORD PTR[r9] $L$oop_sqrx_384:: movd xmm1,r10d lea rsi,QWORD PTR[((-128))+rbx] lea rcx,QWORD PTR[((-128))+rcx] mulx r9,r8,rdx call __mulx_mont_384 movd r10d,xmm1 dec r10d jnz $L$oop_sqrx_384 mov r14,rdx DB 102,72,15,126,210 lea rsi,QWORD 
PTR[((-128))+rbx] mov rbx,QWORD PTR[24+rsp] lea rcx,QWORD PTR[((-128))+rcx] mulx r9,r8,r14 call __mulx_mont_384 mov r15,QWORD PTR[40+rsp] mov r14,QWORD PTR[48+rsp] mov r13,QWORD PTR[56+rsp] mov r12,QWORD PTR[64+rsp] mov rbx,QWORD PTR[72+rsp] mov rbp,QWORD PTR[80+rsp] lea rsp,QWORD PTR[88+rsp] $L$SEH_epilogue_sqrx_n_mul_mont_384:: mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue mov rsi,QWORD PTR[16+rsp] ifdef __SGX_LVI_HARDENING__ pop rdx lfence jmp rdx ud2 else DB 0F3h,0C3h endif $L$SEH_end_sqrx_n_mul_mont_384:: sqrx_n_mul_mont_384 ENDP PUBLIC sqrx_n_mul_mont_383 ALIGN 32 sqrx_n_mul_mont_383 PROC PUBLIC DB 243,15,30,250 mov QWORD PTR[8+rsp],rdi ;WIN64 prologue mov QWORD PTR[16+rsp],rsi mov r11,rsp $L$SEH_begin_sqrx_n_mul_mont_383:: mov rdi,rcx mov rsi,rdx mov rdx,r8 mov rcx,r9 mov r8,QWORD PTR[40+rsp] mov r9,QWORD PTR[48+rsp] sqr_n_mul_mont_383$1:: push rbp push rbx push r12 push r13 push r14 push r15 lea rsp,QWORD PTR[((-40))+rsp] $L$SEH_body_sqrx_n_mul_mont_383:: mov r10,rdx ifdef __SGX_LVI_HARDENING__ lfence endif mov rdx,QWORD PTR[rsi] mov r15,QWORD PTR[8+rsi] mov rax,QWORD PTR[16+rsi] mov rbx,rsi mov r12,QWORD PTR[24+rsi] mov QWORD PTR[16+rsp],rdi mov rdi,QWORD PTR[32+rsi] mov rbp,QWORD PTR[40+rsi] mov QWORD PTR[rsp],r8 mov QWORD PTR[24+rsp],r9 movq xmm2,QWORD PTR[r9] lea rcx,QWORD PTR[((-128))+rcx] $L$oop_sqrx_383:: movd xmm1,r10d lea rsi,QWORD PTR[((-128))+rbx] mulx r9,r8,rdx call __mulx_mont_383_nonred movd r10d,xmm1 dec r10d jnz $L$oop_sqrx_383 mov r14,rdx DB 102,72,15,126,210 lea rsi,QWORD PTR[((-128))+rbx] mov rbx,QWORD PTR[24+rsp] mulx r9,r8,r14 call __mulx_mont_384 mov r15,QWORD PTR[40+rsp] mov r14,QWORD PTR[48+rsp] mov r13,QWORD PTR[56+rsp] mov r12,QWORD PTR[64+rsp] mov rbx,QWORD PTR[72+rsp] mov rbp,QWORD PTR[80+rsp] lea rsp,QWORD PTR[88+rsp] $L$SEH_epilogue_sqrx_n_mul_mont_383:: mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue mov rsi,QWORD PTR[16+rsp] ifdef __SGX_LVI_HARDENING__ pop rdx lfence jmp rdx ud2 else DB 0F3h,0C3h endif $L$SEH_end_sqrx_n_mul_mont_383:: sqrx_n_mul_mont_383 ENDP ALIGN 32 __mulx_mont_383_nonred PROC PRIVATE DB 243,15,30,250 mulx r10,r14,r15 mulx r11,r15,rax add r9,r14 mulx r12,rax,r12 adc r10,r15 mulx r13,rdi,rdi adc r11,rax mulx r14,rbp,rbp mov rdx,QWORD PTR[8+rbx] adc r12,rdi adc r13,rbp adc r14,0 mov rax,r8 imul r8,QWORD PTR[8+rsp] xor r15,r15 mulx rbp,rdi,QWORD PTR[((0+128))+rsi] adox r9,rdi adcx r10,rbp mulx rbp,rdi,QWORD PTR[((8+128))+rsi] adox r10,rdi adcx r11,rbp mulx rbp,rdi,QWORD PTR[((16+128))+rsi] adox r11,rdi adcx r12,rbp mulx rbp,rdi,QWORD PTR[((24+128))+rsi] adox r12,rdi adcx r13,rbp mulx rbp,rdi,QWORD PTR[((32+128))+rsi] adox r13,rdi adcx r14,rbp mulx rbp,rdi,QWORD PTR[((40+128))+rsi] mov rdx,r8 adox r14,rdi adcx rbp,r15 adox r15,rbp xor r8,r8 mulx rbp,rdi,QWORD PTR[((0+128))+rcx] adcx rax,rdi adox r9,rbp mulx rbp,rdi,QWORD PTR[((8+128))+rcx] adcx r9,rdi adox r10,rbp mulx rbp,rdi,QWORD PTR[((16+128))+rcx] adcx r10,rdi adox r11,rbp mulx rbp,rdi,QWORD PTR[((24+128))+rcx] adcx r11,rdi adox r12,rbp mulx rbp,rdi,QWORD PTR[((32+128))+rcx] adcx r12,rdi adox r13,rbp mulx rbp,rdi,QWORD PTR[((40+128))+rcx] mov rdx,QWORD PTR[16+rbx] adcx r13,rdi adox r14,rbp adcx r14,rax adox r15,rax adcx r15,rax mov r8,r9 imul r9,QWORD PTR[8+rsp] xor rax,rax mulx rbp,rdi,QWORD PTR[((0+128))+rsi] adox r10,rdi adcx r11,rbp mulx rbp,rdi,QWORD PTR[((8+128))+rsi] adox r11,rdi adcx r12,rbp mulx rbp,rdi,QWORD PTR[((16+128))+rsi] adox r12,rdi adcx r13,rbp mulx rbp,rdi,QWORD PTR[((24+128))+rsi] adox r13,rdi adcx r14,rbp mulx rbp,rdi,QWORD PTR[((32+128))+rsi] adox r14,rdi adcx r15,rbp 
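; same interleaved multiply/reduce structure as __mulx_mont_384, but for
; 383-bit moduli the extra bit of headroom lets the final conditional
; subtraction be omitted, leaving the result only partially reduced (note the
; plain stores at the end of this routine, with no sbb/cmov correction)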
mulx rbp,rdi,QWORD PTR[((40+128))+rsi] mov rdx,r9 adox r15,rdi adcx rbp,rax adox rax,rbp xor r9,r9 mulx rbp,rdi,QWORD PTR[((0+128))+rcx] adcx r8,rdi adox r10,rbp mulx rbp,rdi,QWORD PTR[((8+128))+rcx] adcx r10,rdi adox r11,rbp mulx rbp,rdi,QWORD PTR[((16+128))+rcx] adcx r11,rdi adox r12,rbp mulx rbp,rdi,QWORD PTR[((24+128))+rcx] adcx r12,rdi adox r13,rbp mulx rbp,rdi,QWORD PTR[((32+128))+rcx] adcx r13,rdi adox r14,rbp mulx rbp,rdi,QWORD PTR[((40+128))+rcx] mov rdx,QWORD PTR[24+rbx] adcx r14,rdi adox r15,rbp adcx r15,r8 adox rax,r8 adcx rax,r8 mov r9,r10 imul r10,QWORD PTR[8+rsp] xor r8,r8 mulx rbp,rdi,QWORD PTR[((0+128))+rsi] adox r11,rdi adcx r12,rbp mulx rbp,rdi,QWORD PTR[((8+128))+rsi] adox r12,rdi adcx r13,rbp mulx rbp,rdi,QWORD PTR[((16+128))+rsi] adox r13,rdi adcx r14,rbp mulx rbp,rdi,QWORD PTR[((24+128))+rsi] adox r14,rdi adcx r15,rbp mulx rbp,rdi,QWORD PTR[((32+128))+rsi] adox r15,rdi adcx rax,rbp mulx rbp,rdi,QWORD PTR[((40+128))+rsi] mov rdx,r10 adox rax,rdi adcx rbp,r8 adox r8,rbp xor r10,r10 mulx rbp,rdi,QWORD PTR[((0+128))+rcx] adcx r9,rdi adox r11,rbp mulx rbp,rdi,QWORD PTR[((8+128))+rcx] adcx r11,rdi adox r12,rbp mulx rbp,rdi,QWORD PTR[((16+128))+rcx] adcx r12,rdi adox r13,rbp mulx rbp,rdi,QWORD PTR[((24+128))+rcx] adcx r13,rdi adox r14,rbp mulx rbp,rdi,QWORD PTR[((32+128))+rcx] adcx r14,rdi adox r15,rbp mulx rbp,rdi,QWORD PTR[((40+128))+rcx] mov rdx,QWORD PTR[32+rbx] adcx r15,rdi adox rax,rbp adcx rax,r9 adox r8,r9 adcx r8,r9 mov r10,r11 imul r11,QWORD PTR[8+rsp] xor r9,r9 mulx rbp,rdi,QWORD PTR[((0+128))+rsi] adox r12,rdi adcx r13,rbp mulx rbp,rdi,QWORD PTR[((8+128))+rsi] adox r13,rdi adcx r14,rbp mulx rbp,rdi,QWORD PTR[((16+128))+rsi] adox r14,rdi adcx r15,rbp mulx rbp,rdi,QWORD PTR[((24+128))+rsi] adox r15,rdi adcx rax,rbp mulx rbp,rdi,QWORD PTR[((32+128))+rsi] adox rax,rdi adcx r8,rbp mulx rbp,rdi,QWORD PTR[((40+128))+rsi] mov rdx,r11 adox r8,rdi adcx rbp,r9 adox r9,rbp xor r11,r11 mulx rbp,rdi,QWORD PTR[((0+128))+rcx] adcx r10,rdi adox r12,rbp mulx rbp,rdi,QWORD PTR[((8+128))+rcx] adcx r12,rdi adox r13,rbp mulx rbp,rdi,QWORD PTR[((16+128))+rcx] adcx r13,rdi adox r14,rbp mulx rbp,rdi,QWORD PTR[((24+128))+rcx] adcx r14,rdi adox r15,rbp mulx rbp,rdi,QWORD PTR[((32+128))+rcx] adcx r15,rdi adox rax,rbp mulx rbp,rdi,QWORD PTR[((40+128))+rcx] mov rdx,QWORD PTR[40+rbx] adcx rax,rdi adox r8,rbp adcx r8,r10 adox r9,r10 adcx r9,r10 mov r11,r12 imul r12,QWORD PTR[8+rsp] xor r10,r10 mulx rbp,rdi,QWORD PTR[((0+128))+rsi] adox r13,rdi adcx r14,rbp mulx rbp,rdi,QWORD PTR[((8+128))+rsi] adox r14,rdi adcx r15,rbp mulx rbp,rdi,QWORD PTR[((16+128))+rsi] adox r15,rdi adcx rax,rbp mulx rbp,rdi,QWORD PTR[((24+128))+rsi] adox rax,rdi adcx r8,rbp mulx rbp,rdi,QWORD PTR[((32+128))+rsi] adox r8,rdi adcx r9,rbp mulx rbp,rdi,QWORD PTR[((40+128))+rsi] mov rdx,r12 adox r9,rdi adcx rbp,r10 adox r10,rbp xor r12,r12 mulx rbp,rdi,QWORD PTR[((0+128))+rcx] adcx r11,rdi adox r13,rbp mulx rbp,rdi,QWORD PTR[((8+128))+rcx] adcx r13,rdi adox r14,rbp mulx rbp,rdi,QWORD PTR[((16+128))+rcx] adcx r14,rdi adox r15,rbp mulx rbp,rdi,QWORD PTR[((24+128))+rcx] adcx r15,rdi adox rax,rbp mulx rbp,rdi,QWORD PTR[((32+128))+rcx] adcx rax,rdi adox r8,rbp mulx rbp,rdi,QWORD PTR[((40+128))+rcx] mov rdx,r13 adcx r8,rdi adox r9,rbp adcx r9,r11 adox r10,r11 adcx r10,r11 imul rdx,QWORD PTR[8+rsp] mov rbx,QWORD PTR[24+rsp] xor r12,r12 mulx rbp,rdi,QWORD PTR[((0+128))+rcx] adcx r13,rdi adox r14,rbp mulx rbp,rdi,QWORD PTR[((8+128))+rcx] adcx r14,rdi adox r15,rbp mulx rbp,rdi,QWORD PTR[((16+128))+rcx] adcx r15,rdi adox rax,rbp mulx 
rbp,rdi,QWORD PTR[((24+128))+rcx] adcx rax,rdi adox r8,rbp mulx rbp,rdi,QWORD PTR[((32+128))+rcx] adcx r8,rdi adox r9,rbp mulx rbp,rdi,QWORD PTR[((40+128))+rcx] mov rdx,r14 adcx r9,rdi adox r10,rbp adc r10,0 mov r12,r8 mov QWORD PTR[rbx],r14 mov QWORD PTR[8+rbx],r15 mov QWORD PTR[16+rbx],rax mov rdi,r9 mov QWORD PTR[24+rbx],r8 mov QWORD PTR[32+rbx],r9 mov QWORD PTR[40+rbx],r10 mov rbp,r10 ifdef __SGX_LVI_HARDENING__ pop rsi lfence jmp rsi ud2 else DB 0F3h,0C3h endif __mulx_mont_383_nonred ENDP PUBLIC sqrx_mont_382x ALIGN 32 sqrx_mont_382x PROC PUBLIC DB 243,15,30,250 mov QWORD PTR[8+rsp],rdi ;WIN64 prologue mov QWORD PTR[16+rsp],rsi mov r11,rsp $L$SEH_begin_sqrx_mont_382x:: mov rdi,rcx mov rsi,rdx mov rdx,r8 mov rcx,r9 sqr_mont_382x$1:: push rbp push rbx push r12 push r13 push r14 push r15 sub rsp,136 $L$SEH_body_sqrx_mont_382x:: mov QWORD PTR[rsp],rcx mov rcx,rdx mov QWORD PTR[16+rsp],rdi mov QWORD PTR[24+rsp],rsi ifdef __SGX_LVI_HARDENING__ lfence endif mov r8,QWORD PTR[rsi] mov r9,QWORD PTR[8+rsi] mov r10,QWORD PTR[16+rsi] mov r11,QWORD PTR[24+rsi] mov r12,QWORD PTR[32+rsi] mov r13,QWORD PTR[40+rsi] mov r14,r8 add r8,QWORD PTR[48+rsi] mov r15,r9 adc r9,QWORD PTR[56+rsi] mov rax,r10 adc r10,QWORD PTR[64+rsi] mov rdx,r11 adc r11,QWORD PTR[72+rsi] mov rbx,r12 adc r12,QWORD PTR[80+rsi] mov rbp,r13 adc r13,QWORD PTR[88+rsi] sub r14,QWORD PTR[48+rsi] sbb r15,QWORD PTR[56+rsi] sbb rax,QWORD PTR[64+rsi] sbb rdx,QWORD PTR[72+rsi] sbb rbx,QWORD PTR[80+rsi] sbb rbp,QWORD PTR[88+rsi] sbb rdi,rdi mov QWORD PTR[((32+0))+rsp],r8 mov QWORD PTR[((32+8))+rsp],r9 mov QWORD PTR[((32+16))+rsp],r10 mov QWORD PTR[((32+24))+rsp],r11 mov QWORD PTR[((32+32))+rsp],r12 mov QWORD PTR[((32+40))+rsp],r13 mov QWORD PTR[((32+48))+rsp],r14 mov QWORD PTR[((32+56))+rsp],r15 mov QWORD PTR[((32+64))+rsp],rax mov QWORD PTR[((32+72))+rsp],rdx mov QWORD PTR[((32+80))+rsp],rbx mov QWORD PTR[((32+88))+rsp],rbp mov QWORD PTR[((32+96))+rsp],rdi lea rbx,QWORD PTR[48+rsi] mov rdx,QWORD PTR[48+rsi] mov r14,QWORD PTR[rsi] mov r15,QWORD PTR[8+rsi] mov rax,QWORD PTR[16+rsi] mov r12,QWORD PTR[24+rsi] mov rdi,QWORD PTR[32+rsi] mov rbp,QWORD PTR[40+rsi] lea rsi,QWORD PTR[((-128))+rsi] lea rcx,QWORD PTR[((-128))+rcx] mulx r9,r8,r14 call __mulx_mont_383_nonred add rdx,rdx adc r15,r15 adc rax,rax adc r12,r12 adc rdi,rdi adc rbp,rbp mov QWORD PTR[48+rbx],rdx mov QWORD PTR[56+rbx],r15 mov QWORD PTR[64+rbx],rax mov QWORD PTR[72+rbx],r12 mov QWORD PTR[80+rbx],rdi mov QWORD PTR[88+rbx],rbp lea rsi,QWORD PTR[((32-128))+rsp] lea rbx,QWORD PTR[((32+48))+rsp] mov rdx,QWORD PTR[((32+48))+rsp] mov r14,QWORD PTR[((32+0))+rsp] mov r15,QWORD PTR[((32+8))+rsp] mov rax,QWORD PTR[((32+16))+rsp] mov r12,QWORD PTR[((32+24))+rsp] mov rdi,QWORD PTR[((32+32))+rsp] mov rbp,QWORD PTR[((32+40))+rsp] mulx r9,r8,r14 call __mulx_mont_383_nonred mov r14,QWORD PTR[((32+96))+rsp] lea rcx,QWORD PTR[128+rcx] mov r8,QWORD PTR[((32+0))+rsp] and r8,r14 mov r9,QWORD PTR[((32+8))+rsp] and r9,r14 mov r10,QWORD PTR[((32+16))+rsp] and r10,r14 mov r11,QWORD PTR[((32+24))+rsp] and r11,r14 mov r13,QWORD PTR[((32+32))+rsp] and r13,r14 and r14,QWORD PTR[((32+40))+rsp] sub rdx,r8 mov r8,QWORD PTR[rcx] sbb r15,r9 mov r9,QWORD PTR[8+rcx] sbb rax,r10 mov r10,QWORD PTR[16+rcx] sbb r12,r11 mov r11,QWORD PTR[24+rcx] sbb rdi,r13 mov r13,QWORD PTR[32+rcx] sbb rbp,r14 sbb r14,r14 and r8,r14 and r9,r14 and r10,r14 and r11,r14 and r13,r14 and r14,QWORD PTR[40+rcx] add rdx,r8 adc r15,r9 adc rax,r10 adc r12,r11 adc rdi,r13 adc rbp,r14 mov QWORD PTR[rbx],rdx mov QWORD PTR[8+rbx],r15 mov QWORD 
PTR[16+rbx],rax mov QWORD PTR[24+rbx],r12 mov QWORD PTR[32+rbx],rdi mov QWORD PTR[40+rbx],rbp lea r8,QWORD PTR[136+rsp] mov r15,QWORD PTR[r8] mov r14,QWORD PTR[8+r8] mov r13,QWORD PTR[16+r8] mov r12,QWORD PTR[24+r8] mov rbx,QWORD PTR[32+r8] mov rbp,QWORD PTR[40+r8] lea rsp,QWORD PTR[48+r8] $L$SEH_epilogue_sqrx_mont_382x:: mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue mov rsi,QWORD PTR[16+rsp] ifdef __SGX_LVI_HARDENING__ pop rdx lfence jmp rdx ud2 else DB 0F3h,0C3h endif $L$SEH_end_sqrx_mont_382x:: sqrx_mont_382x ENDP .text$ ENDS .pdata SEGMENT READONLY ALIGN(4) ALIGN 4 DD imagerel $L$SEH_begin_mulx_mont_384x DD imagerel $L$SEH_body_mulx_mont_384x DD imagerel $L$SEH_info_mulx_mont_384x_prologue DD imagerel $L$SEH_body_mulx_mont_384x DD imagerel $L$SEH_epilogue_mulx_mont_384x DD imagerel $L$SEH_info_mulx_mont_384x_body DD imagerel $L$SEH_epilogue_mulx_mont_384x DD imagerel $L$SEH_end_mulx_mont_384x DD imagerel $L$SEH_info_mulx_mont_384x_epilogue DD imagerel $L$SEH_begin_sqrx_mont_384x DD imagerel $L$SEH_body_sqrx_mont_384x DD imagerel $L$SEH_info_sqrx_mont_384x_prologue DD imagerel $L$SEH_body_sqrx_mont_384x DD imagerel $L$SEH_epilogue_sqrx_mont_384x DD imagerel $L$SEH_info_sqrx_mont_384x_body DD imagerel $L$SEH_epilogue_sqrx_mont_384x DD imagerel $L$SEH_end_sqrx_mont_384x DD imagerel $L$SEH_info_sqrx_mont_384x_epilogue DD imagerel $L$SEH_begin_mulx_382x DD imagerel $L$SEH_body_mulx_382x DD imagerel $L$SEH_info_mulx_382x_prologue DD imagerel $L$SEH_body_mulx_382x DD imagerel $L$SEH_epilogue_mulx_382x DD imagerel $L$SEH_info_mulx_382x_body DD imagerel $L$SEH_epilogue_mulx_382x DD imagerel $L$SEH_end_mulx_382x DD imagerel $L$SEH_info_mulx_382x_epilogue DD imagerel $L$SEH_begin_sqrx_382x DD imagerel $L$SEH_body_sqrx_382x DD imagerel $L$SEH_info_sqrx_382x_prologue DD imagerel $L$SEH_body_sqrx_382x DD imagerel $L$SEH_epilogue_sqrx_382x DD imagerel $L$SEH_info_sqrx_382x_body DD imagerel $L$SEH_epilogue_sqrx_382x DD imagerel $L$SEH_end_sqrx_382x DD imagerel $L$SEH_info_sqrx_382x_epilogue DD imagerel $L$SEH_begin_mulx_384 DD imagerel $L$SEH_body_mulx_384 DD imagerel $L$SEH_info_mulx_384_prologue DD imagerel $L$SEH_body_mulx_384 DD imagerel $L$SEH_epilogue_mulx_384 DD imagerel $L$SEH_info_mulx_384_body DD imagerel $L$SEH_epilogue_mulx_384 DD imagerel $L$SEH_end_mulx_384 DD imagerel $L$SEH_info_mulx_384_epilogue DD imagerel $L$SEH_begin_sqrx_384 DD imagerel $L$SEH_body_sqrx_384 DD imagerel $L$SEH_info_sqrx_384_prologue DD imagerel $L$SEH_body_sqrx_384 DD imagerel $L$SEH_epilogue_sqrx_384 DD imagerel $L$SEH_info_sqrx_384_body DD imagerel $L$SEH_epilogue_sqrx_384 DD imagerel $L$SEH_end_sqrx_384 DD imagerel $L$SEH_info_sqrx_384_epilogue DD imagerel $L$SEH_begin_redcx_mont_384 DD imagerel $L$SEH_body_redcx_mont_384 DD imagerel $L$SEH_info_redcx_mont_384_prologue DD imagerel $L$SEH_body_redcx_mont_384 DD imagerel $L$SEH_epilogue_redcx_mont_384 DD imagerel $L$SEH_info_redcx_mont_384_body DD imagerel $L$SEH_epilogue_redcx_mont_384 DD imagerel $L$SEH_end_redcx_mont_384 DD imagerel $L$SEH_info_redcx_mont_384_epilogue DD imagerel $L$SEH_begin_fromx_mont_384 DD imagerel $L$SEH_body_fromx_mont_384 DD imagerel $L$SEH_info_fromx_mont_384_prologue DD imagerel $L$SEH_body_fromx_mont_384 DD imagerel $L$SEH_epilogue_fromx_mont_384 DD imagerel $L$SEH_info_fromx_mont_384_body DD imagerel $L$SEH_epilogue_fromx_mont_384 DD imagerel $L$SEH_end_fromx_mont_384 DD imagerel $L$SEH_info_fromx_mont_384_epilogue DD imagerel $L$SEH_begin_sgn0x_pty_mont_384 DD imagerel $L$SEH_body_sgn0x_pty_mont_384 DD imagerel 
$L$SEH_info_sgn0x_pty_mont_384_prologue DD imagerel $L$SEH_body_sgn0x_pty_mont_384 DD imagerel $L$SEH_epilogue_sgn0x_pty_mont_384 DD imagerel $L$SEH_info_sgn0x_pty_mont_384_body DD imagerel $L$SEH_epilogue_sgn0x_pty_mont_384 DD imagerel $L$SEH_end_sgn0x_pty_mont_384 DD imagerel $L$SEH_info_sgn0x_pty_mont_384_epilogue DD imagerel $L$SEH_begin_sgn0x_pty_mont_384x DD imagerel $L$SEH_body_sgn0x_pty_mont_384x DD imagerel $L$SEH_info_sgn0x_pty_mont_384x_prologue DD imagerel $L$SEH_body_sgn0x_pty_mont_384x DD imagerel $L$SEH_epilogue_sgn0x_pty_mont_384x DD imagerel $L$SEH_info_sgn0x_pty_mont_384x_body DD imagerel $L$SEH_epilogue_sgn0x_pty_mont_384x DD imagerel $L$SEH_end_sgn0x_pty_mont_384x DD imagerel $L$SEH_info_sgn0x_pty_mont_384x_epilogue DD imagerel $L$SEH_begin_mulx_mont_384 DD imagerel $L$SEH_body_mulx_mont_384 DD imagerel $L$SEH_info_mulx_mont_384_prologue DD imagerel $L$SEH_body_mulx_mont_384 DD imagerel $L$SEH_epilogue_mulx_mont_384 DD imagerel $L$SEH_info_mulx_mont_384_body DD imagerel $L$SEH_epilogue_mulx_mont_384 DD imagerel $L$SEH_end_mulx_mont_384 DD imagerel $L$SEH_info_mulx_mont_384_epilogue DD imagerel $L$SEH_begin_sqrx_mont_384 DD imagerel $L$SEH_body_sqrx_mont_384 DD imagerel $L$SEH_info_sqrx_mont_384_prologue DD imagerel $L$SEH_body_sqrx_mont_384 DD imagerel $L$SEH_epilogue_sqrx_mont_384 DD imagerel $L$SEH_info_sqrx_mont_384_body DD imagerel $L$SEH_epilogue_sqrx_mont_384 DD imagerel $L$SEH_end_sqrx_mont_384 DD imagerel $L$SEH_info_sqrx_mont_384_epilogue DD imagerel $L$SEH_begin_sqrx_n_mul_mont_384 DD imagerel $L$SEH_body_sqrx_n_mul_mont_384 DD imagerel $L$SEH_info_sqrx_n_mul_mont_384_prologue DD imagerel $L$SEH_body_sqrx_n_mul_mont_384 DD imagerel $L$SEH_epilogue_sqrx_n_mul_mont_384 DD imagerel $L$SEH_info_sqrx_n_mul_mont_384_body DD imagerel $L$SEH_epilogue_sqrx_n_mul_mont_384 DD imagerel $L$SEH_end_sqrx_n_mul_mont_384 DD imagerel $L$SEH_info_sqrx_n_mul_mont_384_epilogue DD imagerel $L$SEH_begin_sqrx_n_mul_mont_383 DD imagerel $L$SEH_body_sqrx_n_mul_mont_383 DD imagerel $L$SEH_info_sqrx_n_mul_mont_383_prologue DD imagerel $L$SEH_body_sqrx_n_mul_mont_383 DD imagerel $L$SEH_epilogue_sqrx_n_mul_mont_383 DD imagerel $L$SEH_info_sqrx_n_mul_mont_383_body DD imagerel $L$SEH_epilogue_sqrx_n_mul_mont_383 DD imagerel $L$SEH_end_sqrx_n_mul_mont_383 DD imagerel $L$SEH_info_sqrx_n_mul_mont_383_epilogue DD imagerel $L$SEH_begin_sqrx_mont_382x DD imagerel $L$SEH_body_sqrx_mont_382x DD imagerel $L$SEH_info_sqrx_mont_382x_prologue DD imagerel $L$SEH_body_sqrx_mont_382x DD imagerel $L$SEH_epilogue_sqrx_mont_382x DD imagerel $L$SEH_info_sqrx_mont_382x_body DD imagerel $L$SEH_epilogue_sqrx_mont_382x DD imagerel $L$SEH_end_sqrx_mont_382x DD imagerel $L$SEH_info_sqrx_mont_382x_epilogue .pdata ENDS .xdata SEGMENT READONLY ALIGN(8) ALIGN 8 $L$SEH_info_mulx_mont_384x_prologue:: DB 1,0,5,00bh DB 0,074h,1,0 DB 0,064h,2,0 DB 0,0b3h DB 0,0 DD 0,0 $L$SEH_info_mulx_mont_384x_body:: DB 1,0,18,0 DB 000h,0f4h,029h,000h DB 000h,0e4h,02ah,000h DB 000h,0d4h,02bh,000h DB 000h,0c4h,02ch,000h DB 000h,034h,02dh,000h DB 000h,054h,02eh,000h DB 000h,074h,030h,000h DB 000h,064h,031h,000h DB 000h,001h,02fh,000h DB 000h,000h,000h,000h DB 000h,000h,000h,000h $L$SEH_info_mulx_mont_384x_epilogue:: DB 1,0,4,0 DB 000h,074h,001h,000h DB 000h,064h,002h,000h DB 000h,000h,000h,000h $L$SEH_info_sqrx_mont_384x_prologue:: DB 1,0,5,00bh DB 0,074h,1,0 DB 0,064h,2,0 DB 0,0b3h DB 0,0 DD 0,0 $L$SEH_info_sqrx_mont_384x_body:: DB 1,0,18,0 DB 000h,0f4h,011h,000h DB 000h,0e4h,012h,000h DB 000h,0d4h,013h,000h DB 000h,0c4h,014h,000h DB 
000h,034h,015h,000h DB 000h,054h,016h,000h DB 000h,074h,018h,000h DB 000h,064h,019h,000h DB 000h,001h,017h,000h DB 000h,000h,000h,000h DB 000h,000h,000h,000h $L$SEH_info_sqrx_mont_384x_epilogue:: DB 1,0,4,0 DB 000h,074h,001h,000h DB 000h,064h,002h,000h DB 000h,000h,000h,000h $L$SEH_info_mulx_382x_prologue:: DB 1,0,5,00bh DB 0,074h,1,0 DB 0,064h,2,0 DB 0,0b3h DB 0,0 DD 0,0 $L$SEH_info_mulx_382x_body:: DB 1,0,18,0 DB 000h,0f4h,011h,000h DB 000h,0e4h,012h,000h DB 000h,0d4h,013h,000h DB 000h,0c4h,014h,000h DB 000h,034h,015h,000h DB 000h,054h,016h,000h DB 000h,074h,018h,000h DB 000h,064h,019h,000h DB 000h,001h,017h,000h DB 000h,000h,000h,000h DB 000h,000h,000h,000h $L$SEH_info_mulx_382x_epilogue:: DB 1,0,4,0 DB 000h,074h,001h,000h DB 000h,064h,002h,000h DB 000h,000h,000h,000h $L$SEH_info_sqrx_382x_prologue:: DB 1,0,5,00bh DB 0,074h,1,0 DB 0,064h,2,0 DB 0,0b3h DB 0,0 DD 0,0 $L$SEH_info_sqrx_382x_body:: DB 1,0,17,0 DB 000h,0f4h,001h,000h DB 000h,0e4h,002h,000h DB 000h,0d4h,003h,000h DB 000h,0c4h,004h,000h DB 000h,034h,005h,000h DB 000h,054h,006h,000h DB 000h,074h,008h,000h DB 000h,064h,009h,000h DB 000h,062h DB 000h,000h,000h,000h,000h,000h DB 000h,000h,000h,000h $L$SEH_info_sqrx_382x_epilogue:: DB 1,0,4,0 DB 000h,074h,001h,000h DB 000h,064h,002h,000h DB 000h,000h,000h,000h $L$SEH_info_mulx_384_prologue:: DB 1,0,5,00bh DB 0,074h,1,0 DB 0,064h,2,0 DB 0,0b3h DB 0,0 DD 0,0 $L$SEH_info_mulx_384_body:: DB 1,0,17,0 DB 000h,0f4h,000h,000h DB 000h,0e4h,001h,000h DB 000h,0d4h,002h,000h DB 000h,0c4h,003h,000h DB 000h,034h,004h,000h DB 000h,054h,005h,000h DB 000h,074h,007h,000h DB 000h,064h,008h,000h DB 000h,052h DB 000h,000h,000h,000h,000h,000h DB 000h,000h,000h,000h $L$SEH_info_mulx_384_epilogue:: DB 1,0,4,0 DB 000h,074h,001h,000h DB 000h,064h,002h,000h DB 000h,000h,000h,000h $L$SEH_info_sqrx_384_prologue:: DB 1,0,5,00bh DB 0,074h,1,0 DB 0,064h,2,0 DB 0,0b3h DB 0,0 DD 0,0 $L$SEH_info_sqrx_384_body:: DB 1,0,17,0 DB 000h,0f4h,001h,000h DB 000h,0e4h,002h,000h DB 000h,0d4h,003h,000h DB 000h,0c4h,004h,000h DB 000h,034h,005h,000h DB 000h,054h,006h,000h DB 000h,074h,008h,000h DB 000h,064h,009h,000h DB 000h,062h DB 000h,000h,000h,000h,000h,000h DB 000h,000h,000h,000h $L$SEH_info_sqrx_384_epilogue:: DB 1,0,4,0 DB 000h,074h,001h,000h DB 000h,064h,002h,000h DB 000h,000h,000h,000h $L$SEH_info_redcx_mont_384_prologue:: DB 1,0,5,00bh DB 0,074h,1,0 DB 0,064h,2,0 DB 0,0b3h DB 0,0 DD 0,0 $L$SEH_info_redcx_mont_384_body:: DB 1,0,17,0 DB 000h,0f4h,001h,000h DB 000h,0e4h,002h,000h DB 000h,0d4h,003h,000h DB 000h,0c4h,004h,000h DB 000h,034h,005h,000h DB 000h,054h,006h,000h DB 000h,074h,008h,000h DB 000h,064h,009h,000h DB 000h,062h DB 000h,000h,000h,000h,000h,000h DB 000h,000h,000h,000h $L$SEH_info_redcx_mont_384_epilogue:: DB 1,0,4,0 DB 000h,074h,001h,000h DB 000h,064h,002h,000h DB 000h,000h,000h,000h $L$SEH_info_fromx_mont_384_prologue:: DB 1,0,5,00bh DB 0,074h,1,0 DB 0,064h,2,0 DB 0,0b3h DB 0,0 DD 0,0 $L$SEH_info_fromx_mont_384_body:: DB 1,0,17,0 DB 000h,0f4h,001h,000h DB 000h,0e4h,002h,000h DB 000h,0d4h,003h,000h DB 000h,0c4h,004h,000h DB 000h,034h,005h,000h DB 000h,054h,006h,000h DB 000h,074h,008h,000h DB 000h,064h,009h,000h DB 000h,062h DB 000h,000h,000h,000h,000h,000h DB 000h,000h,000h,000h $L$SEH_info_fromx_mont_384_epilogue:: DB 1,0,4,0 DB 000h,074h,001h,000h DB 000h,064h,002h,000h DB 000h,000h,000h,000h $L$SEH_info_sgn0x_pty_mont_384_prologue:: DB 1,0,5,00bh DB 0,074h,1,0 DB 0,064h,2,0 DB 0,0b3h DB 0,0 DD 0,0 $L$SEH_info_sgn0x_pty_mont_384_body:: DB 1,0,17,0 DB 000h,0f4h,001h,000h DB 000h,0e4h,002h,000h DB 
000h,0d4h,003h,000h DB 000h,0c4h,004h,000h DB 000h,034h,005h,000h DB 000h,054h,006h,000h DB 000h,074h,008h,000h DB 000h,064h,009h,000h DB 000h,062h DB 000h,000h,000h,000h,000h,000h DB 000h,000h,000h,000h $L$SEH_info_sgn0x_pty_mont_384_epilogue:: DB 1,0,4,0 DB 000h,074h,001h,000h DB 000h,064h,002h,000h DB 000h,000h,000h,000h $L$SEH_info_sgn0x_pty_mont_384x_prologue:: DB 1,0,5,00bh DB 0,074h,1,0 DB 0,064h,2,0 DB 0,0b3h DB 0,0 DD 0,0 $L$SEH_info_sgn0x_pty_mont_384x_body:: DB 1,0,17,0 DB 000h,0f4h,001h,000h DB 000h,0e4h,002h,000h DB 000h,0d4h,003h,000h DB 000h,0c4h,004h,000h DB 000h,034h,005h,000h DB 000h,054h,006h,000h DB 000h,074h,008h,000h DB 000h,064h,009h,000h DB 000h,062h DB 000h,000h,000h,000h,000h,000h DB 000h,000h,000h,000h $L$SEH_info_sgn0x_pty_mont_384x_epilogue:: DB 1,0,4,0 DB 000h,074h,001h,000h DB 000h,064h,002h,000h DB 000h,000h,000h,000h $L$SEH_info_mulx_mont_384_prologue:: DB 1,0,5,00bh DB 0,074h,1,0 DB 0,064h,2,0 DB 0,0b3h DB 0,0 DD 0,0 $L$SEH_info_mulx_mont_384_body:: DB 1,0,17,0 DB 000h,0f4h,003h,000h DB 000h,0e4h,004h,000h DB 000h,0d4h,005h,000h DB 000h,0c4h,006h,000h DB 000h,034h,007h,000h DB 000h,054h,008h,000h DB 000h,074h,00ah,000h DB 000h,064h,00bh,000h DB 000h,082h DB 000h,000h,000h,000h,000h,000h DB 000h,000h,000h,000h $L$SEH_info_mulx_mont_384_epilogue:: DB 1,0,4,0 DB 000h,074h,001h,000h DB 000h,064h,002h,000h DB 000h,000h,000h,000h $L$SEH_info_sqrx_mont_384_prologue:: DB 1,0,5,00bh DB 0,074h,1,0 DB 0,064h,2,0 DB 0,0b3h DB 0,0 DD 0,0 $L$SEH_info_sqrx_mont_384_body:: DB 1,0,17,0 DB 000h,0f4h,003h,000h DB 000h,0e4h,004h,000h DB 000h,0d4h,005h,000h DB 000h,0c4h,006h,000h DB 000h,034h,007h,000h DB 000h,054h,008h,000h DB 000h,074h,00ah,000h DB 000h,064h,00bh,000h DB 000h,082h DB 000h,000h,000h,000h,000h,000h DB 000h,000h,000h,000h $L$SEH_info_sqrx_mont_384_epilogue:: DB 1,0,4,0 DB 000h,074h,001h,000h DB 000h,064h,002h,000h DB 000h,000h,000h,000h $L$SEH_info_sqrx_n_mul_mont_384_prologue:: DB 1,0,5,00bh DB 0,074h,1,0 DB 0,064h,2,0 DB 0,0b3h DB 0,0 DD 0,0 $L$SEH_info_sqrx_n_mul_mont_384_body:: DB 1,0,17,0 DB 000h,0f4h,005h,000h DB 000h,0e4h,006h,000h DB 000h,0d4h,007h,000h DB 000h,0c4h,008h,000h DB 000h,034h,009h,000h DB 000h,054h,00ah,000h DB 000h,074h,00ch,000h DB 000h,064h,00dh,000h DB 000h,0a2h DB 000h,000h,000h,000h,000h,000h DB 000h,000h,000h,000h $L$SEH_info_sqrx_n_mul_mont_384_epilogue:: DB 1,0,4,0 DB 000h,074h,001h,000h DB 000h,064h,002h,000h DB 000h,000h,000h,000h $L$SEH_info_sqrx_n_mul_mont_383_prologue:: DB 1,0,5,00bh DB 0,074h,1,0 DB 0,064h,2,0 DB 0,0b3h DB 0,0 DD 0,0 $L$SEH_info_sqrx_n_mul_mont_383_body:: DB 1,0,17,0 DB 000h,0f4h,005h,000h DB 000h,0e4h,006h,000h DB 000h,0d4h,007h,000h DB 000h,0c4h,008h,000h DB 000h,034h,009h,000h DB 000h,054h,00ah,000h DB 000h,074h,00ch,000h DB 000h,064h,00dh,000h DB 000h,0a2h DB 000h,000h,000h,000h,000h,000h DB 000h,000h,000h,000h $L$SEH_info_sqrx_n_mul_mont_383_epilogue:: DB 1,0,4,0 DB 000h,074h,001h,000h DB 000h,064h,002h,000h DB 000h,000h,000h,000h $L$SEH_info_sqrx_mont_382x_prologue:: DB 1,0,5,00bh DB 0,074h,1,0 DB 0,064h,2,0 DB 0,0b3h DB 0,0 DD 0,0 $L$SEH_info_sqrx_mont_382x_body:: DB 1,0,18,0 DB 000h,0f4h,011h,000h DB 000h,0e4h,012h,000h DB 000h,0d4h,013h,000h DB 000h,0c4h,014h,000h DB 000h,034h,015h,000h DB 000h,054h,016h,000h DB 000h,074h,018h,000h DB 000h,064h,019h,000h DB 000h,001h,017h,000h DB 000h,000h,000h,000h DB 000h,000h,000h,000h $L$SEH_info_sqrx_mont_382x_epilogue:: DB 1,0,4,0 DB 000h,074h,001h,000h DB 000h,064h,002h,000h DB 000h,000h,000h,000h .xdata ENDS END 
================================================ FILE: build/win64/sha256-armv8.asm ================================================ GBLA __SIZEOF_POINTER__ __SIZEOF_POINTER__ SETA 64/8 COMMON |__blst_platform_cap|,4 AREA |.text|,CODE,ALIGN=8,ARM64 ALIGN 64 |$LK256| DCDU 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 DCDU 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 DCDU 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 DCDU 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 DCDU 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc DCDU 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da DCDU 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 DCDU 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 DCDU 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 DCDU 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 DCDU 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 DCDU 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 DCDU 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 DCDU 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 DCDU 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 DCDU 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 DCDU 0 DCB "SHA256 block transform for ARMv8, CRYPTOGAMS by @dot-asm",0 ALIGN 4 ALIGN 4 EXPORT |blst_sha256_block_armv8|[FUNC] ALIGN 64 |blst_sha256_block_armv8| PROC hint #34 |$Lv8_entry| stp x29,x30,[sp,#-2*__SIZEOF_POINTER__]! add x29,sp,#0 ld1 {v0.4s,v1.4s},[x0] adr x3,|$LK256| |$Loop_hw| ld1 {v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64 sub x2,x2,#1 ld1 {v16.4s},[x3],#16 rev32 v4.16b,v4.16b rev32 v5.16b,v5.16b rev32 v6.16b,v6.16b rev32 v7.16b,v7.16b orr v18.16b,v0.16b,v0.16b orr v19.16b,v1.16b,v1.16b ld1 {v17.4s},[x3],#16 add v16.4s,v16.4s,v4.4s DCDU 0x5e2828a4 orr v2.16b,v0.16b,v0.16b DCDU 0x5e104020 DCDU 0x5e105041 DCDU 0x5e0760c4 ld1 {v16.4s},[x3],#16 add v17.4s,v17.4s,v5.4s DCDU 0x5e2828c5 orr v2.16b,v0.16b,v0.16b DCDU 0x5e114020 DCDU 0x5e115041 DCDU 0x5e0460e5 ld1 {v17.4s},[x3],#16 add v16.4s,v16.4s,v6.4s DCDU 0x5e2828e6 orr v2.16b,v0.16b,v0.16b DCDU 0x5e104020 DCDU 0x5e105041 DCDU 0x5e056086 ld1 {v16.4s},[x3],#16 add v17.4s,v17.4s,v7.4s DCDU 0x5e282887 orr v2.16b,v0.16b,v0.16b DCDU 0x5e114020 DCDU 0x5e115041 DCDU 0x5e0660a7 ld1 {v17.4s},[x3],#16 add v16.4s,v16.4s,v4.4s DCDU 0x5e2828a4 orr v2.16b,v0.16b,v0.16b DCDU 0x5e104020 DCDU 0x5e105041 DCDU 0x5e0760c4 ld1 {v16.4s},[x3],#16 add v17.4s,v17.4s,v5.4s DCDU 0x5e2828c5 orr v2.16b,v0.16b,v0.16b DCDU 0x5e114020 DCDU 0x5e115041 DCDU 0x5e0460e5 ld1 {v17.4s},[x3],#16 add v16.4s,v16.4s,v6.4s DCDU 0x5e2828e6 orr v2.16b,v0.16b,v0.16b DCDU 0x5e104020 DCDU 0x5e105041 DCDU 0x5e056086 ld1 {v16.4s},[x3],#16 add v17.4s,v17.4s,v7.4s DCDU 0x5e282887 orr v2.16b,v0.16b,v0.16b DCDU 0x5e114020 DCDU 0x5e115041 DCDU 0x5e0660a7 ld1 {v17.4s},[x3],#16 add v16.4s,v16.4s,v4.4s DCDU 0x5e2828a4 orr v2.16b,v0.16b,v0.16b DCDU 0x5e104020 DCDU 0x5e105041 DCDU 0x5e0760c4 ld1 {v16.4s},[x3],#16 add v17.4s,v17.4s,v5.4s DCDU 0x5e2828c5 orr v2.16b,v0.16b,v0.16b DCDU 0x5e114020 DCDU 0x5e115041 DCDU 0x5e0460e5 ld1 {v17.4s},[x3],#16 add v16.4s,v16.4s,v6.4s DCDU 0x5e2828e6 orr v2.16b,v0.16b,v0.16b DCDU 0x5e104020 DCDU 0x5e105041 DCDU 0x5e056086 ld1 {v16.4s},[x3],#16 add v17.4s,v17.4s,v7.4s DCDU 0x5e282887 orr v2.16b,v0.16b,v0.16b DCDU 0x5e114020 DCDU 0x5e115041 DCDU 0x5e0660a7 ld1 {v17.4s},[x3],#16 add v16.4s,v16.4s,v4.4s orr v2.16b,v0.16b,v0.16b DCDU 0x5e104020 DCDU 0x5e105041 ld1 {v16.4s},[x3],#16 add v17.4s,v17.4s,v5.4s orr v2.16b,v0.16b,v0.16b DCDU 0x5e114020 DCDU 0x5e115041 ld1 {v17.4s},[x3] add v16.4s,v16.4s,v6.4s sub x3,x3,#64*4-16 orr v2.16b,v0.16b,v0.16b DCDU 0x5e104020 DCDU 0x5e105041 add v17.4s,v17.4s,v7.4s orr v2.16b,v0.16b,v0.16b DCDU 
0x5e114020 DCDU 0x5e115041 add v0.4s,v0.4s,v18.4s add v1.4s,v1.4s,v19.4s cbnz x2,|$Loop_hw| st1 {v0.4s,v1.4s},[x0] ldr x29,[sp],#2*__SIZEOF_POINTER__ ret ENDP EXPORT |blst_sha256_block_data_order|[FUNC] ALIGN 16 |blst_sha256_block_data_order| PROC hint #34 adrp x16,__blst_platform_cap ldr w16,[x16,__blst_platform_cap] tst w16,#1 bne |$Lv8_entry| stp x29, x30, [sp, #-2*__SIZEOF_POINTER__]! mov x29, sp sub sp,sp,#16*4 adr x16,|$LK256| add x2,x1,x2,lsl#6 ld1 {v0.16b},[x1], #16 ld1 {v1.16b},[x1], #16 ld1 {v2.16b},[x1], #16 ld1 {v3.16b},[x1], #16 ld1 {v4.4s},[x16], #16 ld1 {v5.4s},[x16], #16 ld1 {v6.4s},[x16], #16 ld1 {v7.4s},[x16], #16 rev32 v0.16b,v0.16b rev32 v1.16b,v1.16b rev32 v2.16b,v2.16b rev32 v3.16b,v3.16b mov x17,sp add v4.4s,v4.4s,v0.4s add v5.4s,v5.4s,v1.4s add v6.4s,v6.4s,v2.4s st1 {v4.4s,v5.4s},[x17], #32 add v7.4s,v7.4s,v3.4s st1 {v6.4s,v7.4s},[x17] sub x17,x17,#32 ldp w3,w4,[x0] ldp w5,w6,[x0,#8] ldp w7,w8,[x0,#16] ldp w9,w10,[x0,#24] ldr w12,[sp,#0] mov w13,wzr eor w14,w4,w5 mov w15,wzr b |$L_00_48| ALIGN 16 |$L_00_48| ext8 v4.16b,v0.16b,v1.16b,#4 add w10,w10,w12 add w3,w3,w15 and w12,w8,w7 bic w15,w9,w7 ext8 v7.16b,v2.16b,v3.16b,#4 eor w11,w7,w7,ror#5 add w3,w3,w13 mov d19,v3.d[1] orr w12,w12,w15 eor w11,w11,w7,ror#19 ushr v6.4s,v4.4s,#7 eor w15,w3,w3,ror#11 ushr v5.4s,v4.4s,#3 add w10,w10,w12 add v0.4s,v0.4s,v7.4s ror w11,w11,#6 sli v6.4s,v4.4s,#25 eor w13,w3,w4 eor w15,w15,w3,ror#20 ushr v7.4s,v4.4s,#18 add w10,w10,w11 ldr w12,[sp,#4] and w14,w14,w13 eor v5.16b,v5.16b,v6.16b ror w15,w15,#2 add w6,w6,w10 sli v7.4s,v4.4s,#14 eor w14,w14,w4 ushr v16.4s,v19.4s,#17 add w9,w9,w12 add w10,w10,w15 and w12,w7,w6 eor v5.16b,v5.16b,v7.16b bic w15,w8,w6 eor w11,w6,w6,ror#5 sli v16.4s,v19.4s,#15 add w10,w10,w14 orr w12,w12,w15 ushr v17.4s,v19.4s,#10 eor w11,w11,w6,ror#19 eor w15,w10,w10,ror#11 ushr v7.4s,v19.4s,#19 add w9,w9,w12 ror w11,w11,#6 add v0.4s,v0.4s,v5.4s eor w14,w10,w3 eor w15,w15,w10,ror#20 sli v7.4s,v19.4s,#13 add w9,w9,w11 ldr w12,[sp,#8] and w13,w13,w14 eor v17.16b,v17.16b,v16.16b ror w15,w15,#2 add w5,w5,w9 eor w13,w13,w3 eor v17.16b,v17.16b,v7.16b add w8,w8,w12 add w9,w9,w15 and w12,w6,w5 add v0.4s,v0.4s,v17.4s bic w15,w7,w5 eor w11,w5,w5,ror#5 add w9,w9,w13 ushr v18.4s,v0.4s,#17 orr w12,w12,w15 ushr v19.4s,v0.4s,#10 eor w11,w11,w5,ror#19 eor w15,w9,w9,ror#11 sli v18.4s,v0.4s,#15 add w8,w8,w12 ushr v17.4s,v0.4s,#19 ror w11,w11,#6 eor w13,w9,w10 eor v19.16b,v19.16b,v18.16b eor w15,w15,w9,ror#20 add w8,w8,w11 sli v17.4s,v0.4s,#13 ldr w12,[sp,#12] and w14,w14,w13 ror w15,w15,#2 ld1 {v4.4s},[x16], #16 add w4,w4,w8 eor v19.16b,v19.16b,v17.16b eor w14,w14,w10 eor v17.16b,v17.16b,v17.16b add w7,w7,w12 add w8,w8,w15 and w12,w5,w4 mov v17.d[1],v19.d[0] bic w15,w6,w4 eor w11,w4,w4,ror#5 add w8,w8,w14 add v0.4s,v0.4s,v17.4s orr w12,w12,w15 eor w11,w11,w4,ror#19 eor w15,w8,w8,ror#11 add v4.4s,v4.4s,v0.4s add w7,w7,w12 ror w11,w11,#6 eor w14,w8,w9 eor w15,w15,w8,ror#20 add w7,w7,w11 ldr w12,[sp,#16] and w13,w13,w14 ror w15,w15,#2 add w3,w3,w7 eor w13,w13,w9 st1 {v4.4s},[x17], #16 ext8 v4.16b,v1.16b,v2.16b,#4 add w6,w6,w12 add w7,w7,w15 and w12,w4,w3 bic w15,w5,w3 ext8 v7.16b,v3.16b,v0.16b,#4 eor w11,w3,w3,ror#5 add w7,w7,w13 mov d19,v0.d[1] orr w12,w12,w15 eor w11,w11,w3,ror#19 ushr v6.4s,v4.4s,#7 eor w15,w7,w7,ror#11 ushr v5.4s,v4.4s,#3 add w6,w6,w12 add v1.4s,v1.4s,v7.4s ror w11,w11,#6 sli v6.4s,v4.4s,#25 eor w13,w7,w8 eor w15,w15,w7,ror#20 ushr v7.4s,v4.4s,#18 add w6,w6,w11 ldr w12,[sp,#20] and w14,w14,w13 eor v5.16b,v5.16b,v6.16b ror w15,w15,#2 add w10,w10,w6 sli v7.4s,v4.4s,#14 
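; |$L_00_48|: scalar SHA-256 rounds interleaved with the NEON message-schedule
; update for the next 16 words; the ushr/sli pairs implement the sigma0/sigma1
; rotates (ror 7/18 + shr 3, ror 17/19 + shr 10) on the vector lanes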
eor w14,w14,w8 ushr v16.4s,v19.4s,#17 add w5,w5,w12 add w6,w6,w15 and w12,w3,w10 eor v5.16b,v5.16b,v7.16b bic w15,w4,w10 eor w11,w10,w10,ror#5 sli v16.4s,v19.4s,#15 add w6,w6,w14 orr w12,w12,w15 ushr v17.4s,v19.4s,#10 eor w11,w11,w10,ror#19 eor w15,w6,w6,ror#11 ushr v7.4s,v19.4s,#19 add w5,w5,w12 ror w11,w11,#6 add v1.4s,v1.4s,v5.4s eor w14,w6,w7 eor w15,w15,w6,ror#20 sli v7.4s,v19.4s,#13 add w5,w5,w11 ldr w12,[sp,#24] and w13,w13,w14 eor v17.16b,v17.16b,v16.16b ror w15,w15,#2 add w9,w9,w5 eor w13,w13,w7 eor v17.16b,v17.16b,v7.16b add w4,w4,w12 add w5,w5,w15 and w12,w10,w9 add v1.4s,v1.4s,v17.4s bic w15,w3,w9 eor w11,w9,w9,ror#5 add w5,w5,w13 ushr v18.4s,v1.4s,#17 orr w12,w12,w15 ushr v19.4s,v1.4s,#10 eor w11,w11,w9,ror#19 eor w15,w5,w5,ror#11 sli v18.4s,v1.4s,#15 add w4,w4,w12 ushr v17.4s,v1.4s,#19 ror w11,w11,#6 eor w13,w5,w6 eor v19.16b,v19.16b,v18.16b eor w15,w15,w5,ror#20 add w4,w4,w11 sli v17.4s,v1.4s,#13 ldr w12,[sp,#28] and w14,w14,w13 ror w15,w15,#2 ld1 {v4.4s},[x16], #16 add w8,w8,w4 eor v19.16b,v19.16b,v17.16b eor w14,w14,w6 eor v17.16b,v17.16b,v17.16b add w3,w3,w12 add w4,w4,w15 and w12,w9,w8 mov v17.d[1],v19.d[0] bic w15,w10,w8 eor w11,w8,w8,ror#5 add w4,w4,w14 add v1.4s,v1.4s,v17.4s orr w12,w12,w15 eor w11,w11,w8,ror#19 eor w15,w4,w4,ror#11 add v4.4s,v4.4s,v1.4s add w3,w3,w12 ror w11,w11,#6 eor w14,w4,w5 eor w15,w15,w4,ror#20 add w3,w3,w11 ldr w12,[sp,#32] and w13,w13,w14 ror w15,w15,#2 add w7,w7,w3 eor w13,w13,w5 st1 {v4.4s},[x17], #16 ext8 v4.16b,v2.16b,v3.16b,#4 add w10,w10,w12 add w3,w3,w15 and w12,w8,w7 bic w15,w9,w7 ext8 v7.16b,v0.16b,v1.16b,#4 eor w11,w7,w7,ror#5 add w3,w3,w13 mov d19,v1.d[1] orr w12,w12,w15 eor w11,w11,w7,ror#19 ushr v6.4s,v4.4s,#7 eor w15,w3,w3,ror#11 ushr v5.4s,v4.4s,#3 add w10,w10,w12 add v2.4s,v2.4s,v7.4s ror w11,w11,#6 sli v6.4s,v4.4s,#25 eor w13,w3,w4 eor w15,w15,w3,ror#20 ushr v7.4s,v4.4s,#18 add w10,w10,w11 ldr w12,[sp,#36] and w14,w14,w13 eor v5.16b,v5.16b,v6.16b ror w15,w15,#2 add w6,w6,w10 sli v7.4s,v4.4s,#14 eor w14,w14,w4 ushr v16.4s,v19.4s,#17 add w9,w9,w12 add w10,w10,w15 and w12,w7,w6 eor v5.16b,v5.16b,v7.16b bic w15,w8,w6 eor w11,w6,w6,ror#5 sli v16.4s,v19.4s,#15 add w10,w10,w14 orr w12,w12,w15 ushr v17.4s,v19.4s,#10 eor w11,w11,w6,ror#19 eor w15,w10,w10,ror#11 ushr v7.4s,v19.4s,#19 add w9,w9,w12 ror w11,w11,#6 add v2.4s,v2.4s,v5.4s eor w14,w10,w3 eor w15,w15,w10,ror#20 sli v7.4s,v19.4s,#13 add w9,w9,w11 ldr w12,[sp,#40] and w13,w13,w14 eor v17.16b,v17.16b,v16.16b ror w15,w15,#2 add w5,w5,w9 eor w13,w13,w3 eor v17.16b,v17.16b,v7.16b add w8,w8,w12 add w9,w9,w15 and w12,w6,w5 add v2.4s,v2.4s,v17.4s bic w15,w7,w5 eor w11,w5,w5,ror#5 add w9,w9,w13 ushr v18.4s,v2.4s,#17 orr w12,w12,w15 ushr v19.4s,v2.4s,#10 eor w11,w11,w5,ror#19 eor w15,w9,w9,ror#11 sli v18.4s,v2.4s,#15 add w8,w8,w12 ushr v17.4s,v2.4s,#19 ror w11,w11,#6 eor w13,w9,w10 eor v19.16b,v19.16b,v18.16b eor w15,w15,w9,ror#20 add w8,w8,w11 sli v17.4s,v2.4s,#13 ldr w12,[sp,#44] and w14,w14,w13 ror w15,w15,#2 ld1 {v4.4s},[x16], #16 add w4,w4,w8 eor v19.16b,v19.16b,v17.16b eor w14,w14,w10 eor v17.16b,v17.16b,v17.16b add w7,w7,w12 add w8,w8,w15 and w12,w5,w4 mov v17.d[1],v19.d[0] bic w15,w6,w4 eor w11,w4,w4,ror#5 add w8,w8,w14 add v2.4s,v2.4s,v17.4s orr w12,w12,w15 eor w11,w11,w4,ror#19 eor w15,w8,w8,ror#11 add v4.4s,v4.4s,v2.4s add w7,w7,w12 ror w11,w11,#6 eor w14,w8,w9 eor w15,w15,w8,ror#20 add w7,w7,w11 ldr w12,[sp,#48] and w13,w13,w14 ror w15,w15,#2 add w3,w3,w7 eor w13,w13,w9 st1 {v4.4s},[x17], #16 ext8 v4.16b,v3.16b,v0.16b,#4 add w6,w6,w12 add w7,w7,w15 and w12,w4,w3 bic 
w15,w5,w3 ext8 v7.16b,v1.16b,v2.16b,#4 eor w11,w3,w3,ror#5 add w7,w7,w13 mov d19,v2.d[1] orr w12,w12,w15 eor w11,w11,w3,ror#19 ushr v6.4s,v4.4s,#7 eor w15,w7,w7,ror#11 ushr v5.4s,v4.4s,#3 add w6,w6,w12 add v3.4s,v3.4s,v7.4s ror w11,w11,#6 sli v6.4s,v4.4s,#25 eor w13,w7,w8 eor w15,w15,w7,ror#20 ushr v7.4s,v4.4s,#18 add w6,w6,w11 ldr w12,[sp,#52] and w14,w14,w13 eor v5.16b,v5.16b,v6.16b ror w15,w15,#2 add w10,w10,w6 sli v7.4s,v4.4s,#14 eor w14,w14,w8 ushr v16.4s,v19.4s,#17 add w5,w5,w12 add w6,w6,w15 and w12,w3,w10 eor v5.16b,v5.16b,v7.16b bic w15,w4,w10 eor w11,w10,w10,ror#5 sli v16.4s,v19.4s,#15 add w6,w6,w14 orr w12,w12,w15 ushr v17.4s,v19.4s,#10 eor w11,w11,w10,ror#19 eor w15,w6,w6,ror#11 ushr v7.4s,v19.4s,#19 add w5,w5,w12 ror w11,w11,#6 add v3.4s,v3.4s,v5.4s eor w14,w6,w7 eor w15,w15,w6,ror#20 sli v7.4s,v19.4s,#13 add w5,w5,w11 ldr w12,[sp,#56] and w13,w13,w14 eor v17.16b,v17.16b,v16.16b ror w15,w15,#2 add w9,w9,w5 eor w13,w13,w7 eor v17.16b,v17.16b,v7.16b add w4,w4,w12 add w5,w5,w15 and w12,w10,w9 add v3.4s,v3.4s,v17.4s bic w15,w3,w9 eor w11,w9,w9,ror#5 add w5,w5,w13 ushr v18.4s,v3.4s,#17 orr w12,w12,w15 ushr v19.4s,v3.4s,#10 eor w11,w11,w9,ror#19 eor w15,w5,w5,ror#11 sli v18.4s,v3.4s,#15 add w4,w4,w12 ushr v17.4s,v3.4s,#19 ror w11,w11,#6 eor w13,w5,w6 eor v19.16b,v19.16b,v18.16b eor w15,w15,w5,ror#20 add w4,w4,w11 sli v17.4s,v3.4s,#13 ldr w12,[sp,#60] and w14,w14,w13 ror w15,w15,#2 ld1 {v4.4s},[x16], #16 add w8,w8,w4 eor v19.16b,v19.16b,v17.16b eor w14,w14,w6 eor v17.16b,v17.16b,v17.16b add w3,w3,w12 add w4,w4,w15 and w12,w9,w8 mov v17.d[1],v19.d[0] bic w15,w10,w8 eor w11,w8,w8,ror#5 add w4,w4,w14 add v3.4s,v3.4s,v17.4s orr w12,w12,w15 eor w11,w11,w8,ror#19 eor w15,w4,w4,ror#11 add v4.4s,v4.4s,v3.4s add w3,w3,w12 ror w11,w11,#6 eor w14,w4,w5 eor w15,w15,w4,ror#20 add w3,w3,w11 ldr w12,[x16] and w13,w13,w14 ror w15,w15,#2 add w7,w7,w3 eor w13,w13,w5 st1 {v4.4s},[x17], #16 cmp w12,#0 ldr w12,[sp,#0] sub x17,x17,#64 bne |$L_00_48| sub x16,x16,#256 cmp x1,x2 mov x17, #-64 cseleq x17,x17,xzr add x1,x1,x17 mov x17,sp add w10,w10,w12 add w3,w3,w15 and w12,w8,w7 ld1 {v0.16b},[x1],#16 bic w15,w9,w7 eor w11,w7,w7,ror#5 ld1 {v4.4s},[x16],#16 add w3,w3,w13 orr w12,w12,w15 eor w11,w11,w7,ror#19 eor w15,w3,w3,ror#11 rev32 v0.16b,v0.16b add w10,w10,w12 ror w11,w11,#6 eor w13,w3,w4 eor w15,w15,w3,ror#20 add v4.4s,v4.4s,v0.4s add w10,w10,w11 ldr w12,[sp,#4] and w14,w14,w13 ror w15,w15,#2 add w6,w6,w10 eor w14,w14,w4 add w9,w9,w12 add w10,w10,w15 and w12,w7,w6 bic w15,w8,w6 eor w11,w6,w6,ror#5 add w10,w10,w14 orr w12,w12,w15 eor w11,w11,w6,ror#19 eor w15,w10,w10,ror#11 add w9,w9,w12 ror w11,w11,#6 eor w14,w10,w3 eor w15,w15,w10,ror#20 add w9,w9,w11 ldr w12,[sp,#8] and w13,w13,w14 ror w15,w15,#2 add w5,w5,w9 eor w13,w13,w3 add w8,w8,w12 add w9,w9,w15 and w12,w6,w5 bic w15,w7,w5 eor w11,w5,w5,ror#5 add w9,w9,w13 orr w12,w12,w15 eor w11,w11,w5,ror#19 eor w15,w9,w9,ror#11 add w8,w8,w12 ror w11,w11,#6 eor w13,w9,w10 eor w15,w15,w9,ror#20 add w8,w8,w11 ldr w12,[sp,#12] and w14,w14,w13 ror w15,w15,#2 add w4,w4,w8 eor w14,w14,w10 add w7,w7,w12 add w8,w8,w15 and w12,w5,w4 bic w15,w6,w4 eor w11,w4,w4,ror#5 add w8,w8,w14 orr w12,w12,w15 eor w11,w11,w4,ror#19 eor w15,w8,w8,ror#11 add w7,w7,w12 ror w11,w11,#6 eor w14,w8,w9 eor w15,w15,w8,ror#20 add w7,w7,w11 ldr w12,[sp,#16] and w13,w13,w14 ror w15,w15,#2 add w3,w3,w7 eor w13,w13,w9 st1 {v4.4s},[x17], #16 add w6,w6,w12 add w7,w7,w15 and w12,w4,w3 ld1 {v1.16b},[x1],#16 bic w15,w5,w3 eor w11,w3,w3,ror#5 ld1 {v4.4s},[x16],#16 add w7,w7,w13 orr w12,w12,w15 eor 
w11,w11,w3,ror#19 eor w15,w7,w7,ror#11 rev32 v1.16b,v1.16b add w6,w6,w12 ror w11,w11,#6 eor w13,w7,w8 eor w15,w15,w7,ror#20 add v4.4s,v4.4s,v1.4s add w6,w6,w11 ldr w12,[sp,#20] and w14,w14,w13 ror w15,w15,#2 add w10,w10,w6 eor w14,w14,w8 add w5,w5,w12 add w6,w6,w15 and w12,w3,w10 bic w15,w4,w10 eor w11,w10,w10,ror#5 add w6,w6,w14 orr w12,w12,w15 eor w11,w11,w10,ror#19 eor w15,w6,w6,ror#11 add w5,w5,w12 ror w11,w11,#6 eor w14,w6,w7 eor w15,w15,w6,ror#20 add w5,w5,w11 ldr w12,[sp,#24] and w13,w13,w14 ror w15,w15,#2 add w9,w9,w5 eor w13,w13,w7 add w4,w4,w12 add w5,w5,w15 and w12,w10,w9 bic w15,w3,w9 eor w11,w9,w9,ror#5 add w5,w5,w13 orr w12,w12,w15 eor w11,w11,w9,ror#19 eor w15,w5,w5,ror#11 add w4,w4,w12 ror w11,w11,#6 eor w13,w5,w6 eor w15,w15,w5,ror#20 add w4,w4,w11 ldr w12,[sp,#28] and w14,w14,w13 ror w15,w15,#2 add w8,w8,w4 eor w14,w14,w6 add w3,w3,w12 add w4,w4,w15 and w12,w9,w8 bic w15,w10,w8 eor w11,w8,w8,ror#5 add w4,w4,w14 orr w12,w12,w15 eor w11,w11,w8,ror#19 eor w15,w4,w4,ror#11 add w3,w3,w12 ror w11,w11,#6 eor w14,w4,w5 eor w15,w15,w4,ror#20 add w3,w3,w11 ldr w12,[sp,#32] and w13,w13,w14 ror w15,w15,#2 add w7,w7,w3 eor w13,w13,w5 st1 {v4.4s},[x17], #16 add w10,w10,w12 add w3,w3,w15 and w12,w8,w7 ld1 {v2.16b},[x1],#16 bic w15,w9,w7 eor w11,w7,w7,ror#5 ld1 {v4.4s},[x16],#16 add w3,w3,w13 orr w12,w12,w15 eor w11,w11,w7,ror#19 eor w15,w3,w3,ror#11 rev32 v2.16b,v2.16b add w10,w10,w12 ror w11,w11,#6 eor w13,w3,w4 eor w15,w15,w3,ror#20 add v4.4s,v4.4s,v2.4s add w10,w10,w11 ldr w12,[sp,#36] and w14,w14,w13 ror w15,w15,#2 add w6,w6,w10 eor w14,w14,w4 add w9,w9,w12 add w10,w10,w15 and w12,w7,w6 bic w15,w8,w6 eor w11,w6,w6,ror#5 add w10,w10,w14 orr w12,w12,w15 eor w11,w11,w6,ror#19 eor w15,w10,w10,ror#11 add w9,w9,w12 ror w11,w11,#6 eor w14,w10,w3 eor w15,w15,w10,ror#20 add w9,w9,w11 ldr w12,[sp,#40] and w13,w13,w14 ror w15,w15,#2 add w5,w5,w9 eor w13,w13,w3 add w8,w8,w12 add w9,w9,w15 and w12,w6,w5 bic w15,w7,w5 eor w11,w5,w5,ror#5 add w9,w9,w13 orr w12,w12,w15 eor w11,w11,w5,ror#19 eor w15,w9,w9,ror#11 add w8,w8,w12 ror w11,w11,#6 eor w13,w9,w10 eor w15,w15,w9,ror#20 add w8,w8,w11 ldr w12,[sp,#44] and w14,w14,w13 ror w15,w15,#2 add w4,w4,w8 eor w14,w14,w10 add w7,w7,w12 add w8,w8,w15 and w12,w5,w4 bic w15,w6,w4 eor w11,w4,w4,ror#5 add w8,w8,w14 orr w12,w12,w15 eor w11,w11,w4,ror#19 eor w15,w8,w8,ror#11 add w7,w7,w12 ror w11,w11,#6 eor w14,w8,w9 eor w15,w15,w8,ror#20 add w7,w7,w11 ldr w12,[sp,#48] and w13,w13,w14 ror w15,w15,#2 add w3,w3,w7 eor w13,w13,w9 st1 {v4.4s},[x17], #16 add w6,w6,w12 add w7,w7,w15 and w12,w4,w3 ld1 {v3.16b},[x1],#16 bic w15,w5,w3 eor w11,w3,w3,ror#5 ld1 {v4.4s},[x16],#16 add w7,w7,w13 orr w12,w12,w15 eor w11,w11,w3,ror#19 eor w15,w7,w7,ror#11 rev32 v3.16b,v3.16b add w6,w6,w12 ror w11,w11,#6 eor w13,w7,w8 eor w15,w15,w7,ror#20 add v4.4s,v4.4s,v3.4s add w6,w6,w11 ldr w12,[sp,#52] and w14,w14,w13 ror w15,w15,#2 add w10,w10,w6 eor w14,w14,w8 add w5,w5,w12 add w6,w6,w15 and w12,w3,w10 bic w15,w4,w10 eor w11,w10,w10,ror#5 add w6,w6,w14 orr w12,w12,w15 eor w11,w11,w10,ror#19 eor w15,w6,w6,ror#11 add w5,w5,w12 ror w11,w11,#6 eor w14,w6,w7 eor w15,w15,w6,ror#20 add w5,w5,w11 ldr w12,[sp,#56] and w13,w13,w14 ror w15,w15,#2 add w9,w9,w5 eor w13,w13,w7 add w4,w4,w12 add w5,w5,w15 and w12,w10,w9 bic w15,w3,w9 eor w11,w9,w9,ror#5 add w5,w5,w13 orr w12,w12,w15 eor w11,w11,w9,ror#19 eor w15,w5,w5,ror#11 add w4,w4,w12 ror w11,w11,#6 eor w13,w5,w6 eor w15,w15,w5,ror#20 add w4,w4,w11 ldr w12,[sp,#60] and w14,w14,w13 ror w15,w15,#2 add w8,w8,w4 eor w14,w14,w6 add w3,w3,w12 add 
w4,w4,w15 and w12,w9,w8 bic w15,w10,w8 eor w11,w8,w8,ror#5 add w4,w4,w14 orr w12,w12,w15 eor w11,w11,w8,ror#19 eor w15,w4,w4,ror#11 add w3,w3,w12 ror w11,w11,#6 eor w14,w4,w5 eor w15,w15,w4,ror#20 add w3,w3,w11 and w13,w13,w14 ror w15,w15,#2 add w7,w7,w3 eor w13,w13,w5 st1 {v4.4s},[x17], #16 add w3,w3,w15 ldp w11,w12,[x0,#0] add w3,w3,w13 ldp w13,w14,[x0,#8] add w3,w3,w11 add w4,w4,w12 ldp w11,w12,[x0,#16] add w5,w5,w13 add w6,w6,w14 ldp w13,w14,[x0,#24] add w7,w7,w11 add w8,w8,w12 ldr w12,[sp,#0] stp w3,w4,[x0,#0] add w9,w9,w13 mov w13,wzr stp w5,w6,[x0,#8] add w10,w10,w14 stp w7,w8,[x0,#16] eor w14,w4,w5 stp w9,w10,[x0,#24] mov w15,wzr mov x17,sp bne |$L_00_48| ldr x29,[x29] add sp,sp,#16*4+2*__SIZEOF_POINTER__ ret ENDP EXPORT |blst_sha256_emit|[FUNC] ALIGN 16 |blst_sha256_emit| PROC hint #34 ldp x4,x5,[x1] ldp x6,x7,[x1,#16] if :lnot::def: __AARCH64EB__ rev x4,x4 rev x5,x5 rev x6,x6 rev x7,x7 endif str w4,[x0,#4] lsr x4,x4,#32 str w5,[x0,#12] lsr x5,x5,#32 str w6,[x0,#20] lsr x6,x6,#32 str w7,[x0,#28] lsr x7,x7,#32 str w4,[x0,#0] str w5,[x0,#8] str w6,[x0,#16] str w7,[x0,#24] ret ENDP EXPORT |blst_sha256_bcopy|[FUNC] ALIGN 16 |blst_sha256_bcopy| PROC hint #34 |$Loop_bcopy| ldrb w3,[x1],#1 sub x2,x2,#1 strb w3,[x0],#1 cbnz x2,|$Loop_bcopy| ret ENDP EXPORT |blst_sha256_hcopy|[FUNC] ALIGN 16 |blst_sha256_hcopy| PROC hint #34 ldp x4,x5,[x1] ldp x6,x7,[x1,#16] stp x4,x5,[x0] stp x6,x7,[x0,#16] ret ENDP END ================================================ FILE: build/win64/sha256-x86_64.asm ================================================ OPTION DOTNAME _DATA SEGMENT COMM __blst_platform_cap:DWORD:1 _DATA ENDS .rdata SEGMENT READONLY ALIGN(256) ALIGN 64 K256:: DD 0428a2f98h,071374491h,0b5c0fbcfh,0e9b5dba5h DD 03956c25bh,059f111f1h,0923f82a4h,0ab1c5ed5h DD 0d807aa98h,012835b01h,0243185beh,0550c7dc3h DD 072be5d74h,080deb1feh,09bdc06a7h,0c19bf174h DD 0e49b69c1h,0efbe4786h,00fc19dc6h,0240ca1cch DD 02de92c6fh,04a7484aah,05cb0a9dch,076f988dah DD 0983e5152h,0a831c66dh,0b00327c8h,0bf597fc7h DD 0c6e00bf3h,0d5a79147h,006ca6351h,014292967h DD 027b70a85h,02e1b2138h,04d2c6dfch,053380d13h DD 0650a7354h,0766a0abbh,081c2c92eh,092722c85h DD 0a2bfe8a1h,0a81a664bh,0c24b8b70h,0c76c51a3h DD 0d192e819h,0d6990624h,0f40e3585h,0106aa070h DD 019a4c116h,01e376c08h,02748774ch,034b0bcb5h DD 0391c0cb3h,04ed8aa4ah,05b9cca4fh,0682e6ff3h DD 0748f82eeh,078a5636fh,084c87814h,08cc70208h DD 090befffah,0a4506cebh,0bef9a3f7h,0c67178f2h DD 000010203h,004050607h,008090a0bh,00c0d0e0fh DD 003020100h,00b0a0908h,0ffffffffh,0ffffffffh DD 0ffffffffh,0ffffffffh,003020100h,00b0a0908h DB 83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97 DB 110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54 DB 52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121 DB 32,64,100,111,116,45,97,115,109,0 .rdata ENDS .text$ SEGMENT ALIGN(256) 'CODE' PUBLIC blst_sha256_block_data_order_shaext ALIGN 64 blst_sha256_block_data_order_shaext PROC PUBLIC DB 243,15,30,250 mov QWORD PTR[8+rsp],rdi ;WIN64 prologue mov QWORD PTR[16+rsp],rsi mov r11,rsp $L$SEH_begin_blst_sha256_block_data_order_shaext:: push rbp mov rbp,rsp mov rdi,rcx mov rsi,rdx mov rdx,r8 $L$blst_sha256_block_data_order$2:: sub rsp,050h movaps XMMWORD PTR[(-80)+rbp],xmm6 movaps XMMWORD PTR[(-64)+rbp],xmm7 movaps XMMWORD PTR[(-48)+rbp],xmm8 movaps XMMWORD PTR[(-32)+rbp],xmm9 movaps XMMWORD PTR[(-16)+rbp],xmm10 $L$SEH_body_blst_sha256_block_data_order_shaext:: ifdef __SGX_LVI_HARDENING__ lfence endif lea rcx,QWORD PTR[((K256+128))] movdqu xmm1,XMMWORD PTR[rdi] movdqu xmm2,XMMWORD PTR[16+rdi] movdqa 
xmm7,XMMWORD PTR[((256-128))+rcx] pshufd xmm0,xmm1,01bh pshufd xmm1,xmm1,0b1h pshufd xmm2,xmm2,01bh movdqa xmm8,xmm7 DB 102,15,58,15,202,8 punpcklqdq xmm2,xmm0 jmp $L$oop_shaext ALIGN 16 $L$oop_shaext:: movdqu xmm3,XMMWORD PTR[rsi] movdqu xmm4,XMMWORD PTR[16+rsi] movdqu xmm5,XMMWORD PTR[32+rsi] DB 102,15,56,0,223 movdqu xmm6,XMMWORD PTR[48+rsi] movdqa xmm0,XMMWORD PTR[((0-128))+rcx] paddd xmm0,xmm3 DB 102,15,56,0,231 movdqa xmm10,xmm2 DB 15,56,203,209 pshufd xmm0,xmm0,00eh nop movdqa xmm9,xmm1 DB 15,56,203,202 movdqa xmm0,XMMWORD PTR[((16-128))+rcx] paddd xmm0,xmm4 DB 102,15,56,0,239 DB 15,56,203,209 pshufd xmm0,xmm0,00eh lea rsi,QWORD PTR[64+rsi] DB 15,56,204,220 DB 15,56,203,202 movdqa xmm0,XMMWORD PTR[((32-128))+rcx] paddd xmm0,xmm5 DB 102,15,56,0,247 DB 15,56,203,209 pshufd xmm0,xmm0,00eh movdqa xmm7,xmm6 DB 102,15,58,15,253,4 nop paddd xmm3,xmm7 DB 15,56,204,229 DB 15,56,203,202 movdqa xmm0,XMMWORD PTR[((48-128))+rcx] paddd xmm0,xmm6 DB 15,56,205,222 DB 15,56,203,209 pshufd xmm0,xmm0,00eh movdqa xmm7,xmm3 DB 102,15,58,15,254,4 nop paddd xmm4,xmm7 DB 15,56,204,238 DB 15,56,203,202 movdqa xmm0,XMMWORD PTR[((64-128))+rcx] paddd xmm0,xmm3 DB 15,56,205,227 DB 15,56,203,209 pshufd xmm0,xmm0,00eh movdqa xmm7,xmm4 DB 102,15,58,15,251,4 nop paddd xmm5,xmm7 DB 15,56,204,243 DB 15,56,203,202 movdqa xmm0,XMMWORD PTR[((80-128))+rcx] paddd xmm0,xmm4 DB 15,56,205,236 DB 15,56,203,209 pshufd xmm0,xmm0,00eh movdqa xmm7,xmm5 DB 102,15,58,15,252,4 nop paddd xmm6,xmm7 DB 15,56,204,220 DB 15,56,203,202 movdqa xmm0,XMMWORD PTR[((96-128))+rcx] paddd xmm0,xmm5 DB 15,56,205,245 DB 15,56,203,209 pshufd xmm0,xmm0,00eh movdqa xmm7,xmm6 DB 102,15,58,15,253,4 nop paddd xmm3,xmm7 DB 15,56,204,229 DB 15,56,203,202 movdqa xmm0,XMMWORD PTR[((112-128))+rcx] paddd xmm0,xmm6 DB 15,56,205,222 DB 15,56,203,209 pshufd xmm0,xmm0,00eh movdqa xmm7,xmm3 DB 102,15,58,15,254,4 nop paddd xmm4,xmm7 DB 15,56,204,238 DB 15,56,203,202 movdqa xmm0,XMMWORD PTR[((128-128))+rcx] paddd xmm0,xmm3 DB 15,56,205,227 DB 15,56,203,209 pshufd xmm0,xmm0,00eh movdqa xmm7,xmm4 DB 102,15,58,15,251,4 nop paddd xmm5,xmm7 DB 15,56,204,243 DB 15,56,203,202 movdqa xmm0,XMMWORD PTR[((144-128))+rcx] paddd xmm0,xmm4 DB 15,56,205,236 DB 15,56,203,209 pshufd xmm0,xmm0,00eh movdqa xmm7,xmm5 DB 102,15,58,15,252,4 nop paddd xmm6,xmm7 DB 15,56,204,220 DB 15,56,203,202 movdqa xmm0,XMMWORD PTR[((160-128))+rcx] paddd xmm0,xmm5 DB 15,56,205,245 DB 15,56,203,209 pshufd xmm0,xmm0,00eh movdqa xmm7,xmm6 DB 102,15,58,15,253,4 nop paddd xmm3,xmm7 DB 15,56,204,229 DB 15,56,203,202 movdqa xmm0,XMMWORD PTR[((176-128))+rcx] paddd xmm0,xmm6 DB 15,56,205,222 DB 15,56,203,209 pshufd xmm0,xmm0,00eh movdqa xmm7,xmm3 DB 102,15,58,15,254,4 nop paddd xmm4,xmm7 DB 15,56,204,238 DB 15,56,203,202 movdqa xmm0,XMMWORD PTR[((192-128))+rcx] paddd xmm0,xmm3 DB 15,56,205,227 DB 15,56,203,209 pshufd xmm0,xmm0,00eh movdqa xmm7,xmm4 DB 102,15,58,15,251,4 nop paddd xmm5,xmm7 DB 15,56,204,243 DB 15,56,203,202 movdqa xmm0,XMMWORD PTR[((208-128))+rcx] paddd xmm0,xmm4 DB 15,56,205,236 DB 15,56,203,209 pshufd xmm0,xmm0,00eh movdqa xmm7,xmm5 DB 102,15,58,15,252,4 DB 15,56,203,202 paddd xmm6,xmm7 movdqa xmm0,XMMWORD PTR[((224-128))+rcx] paddd xmm0,xmm5 DB 15,56,203,209 pshufd xmm0,xmm0,00eh DB 15,56,205,245 movdqa xmm7,xmm8 DB 15,56,203,202 movdqa xmm0,XMMWORD PTR[((240-128))+rcx] paddd xmm0,xmm6 nop DB 15,56,203,209 pshufd xmm0,xmm0,00eh dec rdx nop DB 15,56,203,202 paddd xmm2,xmm10 paddd xmm1,xmm9 jnz $L$oop_shaext pshufd xmm2,xmm2,0b1h pshufd xmm7,xmm1,01bh pshufd xmm1,xmm1,0b1h punpckhqdq xmm1,xmm2 DB 
102,15,58,15,215,8 movdqu XMMWORD PTR[rdi],xmm1 movdqu XMMWORD PTR[16+rdi],xmm2 movaps xmm6,XMMWORD PTR[((-80))+rbp] movaps xmm7,XMMWORD PTR[((-64))+rbp] movaps xmm8,XMMWORD PTR[((-48))+rbp] movaps xmm9,XMMWORD PTR[((-32))+rbp] movaps xmm10,XMMWORD PTR[((-16))+rbp] mov rsp,rbp pop rbp $L$SEH_epilogue_blst_sha256_block_data_order_shaext:: mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue mov rsi,QWORD PTR[16+rsp] ifdef __SGX_LVI_HARDENING__ pop rdx lfence jmp rdx ud2 else DB 0F3h,0C3h endif $L$SEH_end_blst_sha256_block_data_order_shaext:: blst_sha256_block_data_order_shaext ENDP PUBLIC blst_sha256_block_data_order ALIGN 64 blst_sha256_block_data_order PROC PUBLIC DB 243,15,30,250 mov QWORD PTR[8+rsp],rdi ;WIN64 prologue mov QWORD PTR[16+rsp],rsi mov r11,rsp $L$SEH_begin_blst_sha256_block_data_order:: push rbp mov rbp,rsp mov rdi,rcx mov rsi,rdx mov rdx,r8 ifndef __SGX_LVI_HARDENING__ test DWORD PTR[__blst_platform_cap],2 jnz $L$blst_sha256_block_data_order$2 endif push rbx push r12 push r13 push r14 push r15 shl rdx,4 sub rsp,88 lea rdx,QWORD PTR[rdx*4+rsi] mov QWORD PTR[((-64))+rbp],rdi mov QWORD PTR[((-48))+rbp],rdx movaps XMMWORD PTR[(-128)+rbp],xmm6 movaps XMMWORD PTR[(-112)+rbp],xmm7 movaps XMMWORD PTR[(-96)+rbp],xmm8 movaps XMMWORD PTR[(-80)+rbp],xmm9 $L$SEH_body_blst_sha256_block_data_order:: lea rsp,QWORD PTR[((-64))+rsp] ifdef __SGX_LVI_HARDENING__ lfence endif mov eax,DWORD PTR[rdi] and rsp,-64 mov ebx,DWORD PTR[4+rdi] mov ecx,DWORD PTR[8+rdi] mov edx,DWORD PTR[12+rdi] mov r8d,DWORD PTR[16+rdi] mov r9d,DWORD PTR[20+rdi] mov r10d,DWORD PTR[24+rdi] mov r11d,DWORD PTR[28+rdi] jmp $L$loop_ssse3 ALIGN 16 $L$loop_ssse3:: movdqa xmm7,XMMWORD PTR[((K256+256))] mov QWORD PTR[((-56))+rbp],rsi movdqu xmm0,XMMWORD PTR[rsi] movdqu xmm1,XMMWORD PTR[16+rsi] movdqu xmm2,XMMWORD PTR[32+rsi] DB 102,15,56,0,199 movdqu xmm3,XMMWORD PTR[48+rsi] lea rsi,QWORD PTR[K256] DB 102,15,56,0,207 movdqa xmm4,XMMWORD PTR[rsi] movdqa xmm5,XMMWORD PTR[16+rsi] DB 102,15,56,0,215 paddd xmm4,xmm0 movdqa xmm6,XMMWORD PTR[32+rsi] DB 102,15,56,0,223 movdqa xmm7,XMMWORD PTR[48+rsi] paddd xmm5,xmm1 paddd xmm6,xmm2 paddd xmm7,xmm3 movdqa XMMWORD PTR[rsp],xmm4 mov r14d,eax movdqa XMMWORD PTR[16+rsp],xmm5 mov edi,ebx movdqa XMMWORD PTR[32+rsp],xmm6 xor edi,ecx movdqa XMMWORD PTR[48+rsp],xmm7 mov r13d,r8d jmp $L$ssse3_00_47 ALIGN 16 $L$ssse3_00_47:: sub rsi,-64 ror r13d,14 movdqa xmm4,xmm1 mov eax,r14d mov r12d,r9d movdqa xmm7,xmm3 ror r14d,9 xor r13d,r8d xor r12d,r10d ror r13d,5 xor r14d,eax DB 102,15,58,15,224,4 and r12d,r8d xor r13d,r8d DB 102,15,58,15,250,4 add r11d,DWORD PTR[rsp] mov r15d,eax xor r12d,r10d ror r14d,11 movdqa xmm5,xmm4 xor r15d,ebx add r11d,r12d movdqa xmm6,xmm4 ror r13d,6 and edi,r15d psrld xmm4,3 xor r14d,eax add r11d,r13d xor edi,ebx paddd xmm0,xmm7 ror r14d,2 add edx,r11d psrld xmm6,7 add r11d,edi mov r13d,edx pshufd xmm7,xmm3,250 add r14d,r11d ror r13d,14 pslld xmm5,14 mov r11d,r14d mov r12d,r8d pxor xmm4,xmm6 ror r14d,9 xor r13d,edx xor r12d,r9d ror r13d,5 psrld xmm6,11 xor r14d,r11d pxor xmm4,xmm5 and r12d,edx xor r13d,edx pslld xmm5,11 add r10d,DWORD PTR[4+rsp] mov edi,r11d pxor xmm4,xmm6 xor r12d,r9d ror r14d,11 movdqa xmm6,xmm7 xor edi,eax add r10d,r12d pxor xmm4,xmm5 ror r13d,6 and r15d,edi xor r14d,r11d psrld xmm7,10 add r10d,r13d xor r15d,eax paddd xmm0,xmm4 ror r14d,2 add ecx,r10d psrlq xmm6,17 add r10d,r15d mov r13d,ecx add r14d,r10d pxor xmm7,xmm6 ror r13d,14 mov r10d,r14d mov r12d,edx ror r14d,9 psrlq xmm6,2 xor r13d,ecx xor r12d,r8d pxor xmm7,xmm6 ror r13d,5 xor r14d,r10d and r12d,ecx pshufd 
xmm7,xmm7,128 xor r13d,ecx add r9d,DWORD PTR[8+rsp] mov r15d,r10d psrldq xmm7,8 xor r12d,r8d ror r14d,11 xor r15d,r11d add r9d,r12d ror r13d,6 paddd xmm0,xmm7 and edi,r15d xor r14d,r10d add r9d,r13d pshufd xmm7,xmm0,80 xor edi,r11d ror r14d,2 add ebx,r9d movdqa xmm6,xmm7 add r9d,edi mov r13d,ebx psrld xmm7,10 add r14d,r9d ror r13d,14 psrlq xmm6,17 mov r9d,r14d mov r12d,ecx pxor xmm7,xmm6 ror r14d,9 xor r13d,ebx xor r12d,edx ror r13d,5 xor r14d,r9d psrlq xmm6,2 and r12d,ebx xor r13d,ebx add r8d,DWORD PTR[12+rsp] pxor xmm7,xmm6 mov edi,r9d xor r12d,edx ror r14d,11 pshufd xmm7,xmm7,8 xor edi,r10d add r8d,r12d movdqa xmm6,XMMWORD PTR[rsi] ror r13d,6 and r15d,edi pslldq xmm7,8 xor r14d,r9d add r8d,r13d xor r15d,r10d paddd xmm0,xmm7 ror r14d,2 add eax,r8d add r8d,r15d paddd xmm6,xmm0 mov r13d,eax add r14d,r8d movdqa XMMWORD PTR[rsp],xmm6 ror r13d,14 movdqa xmm4,xmm2 mov r8d,r14d mov r12d,ebx movdqa xmm7,xmm0 ror r14d,9 xor r13d,eax xor r12d,ecx ror r13d,5 xor r14d,r8d DB 102,15,58,15,225,4 and r12d,eax xor r13d,eax DB 102,15,58,15,251,4 add edx,DWORD PTR[16+rsp] mov r15d,r8d xor r12d,ecx ror r14d,11 movdqa xmm5,xmm4 xor r15d,r9d add edx,r12d movdqa xmm6,xmm4 ror r13d,6 and edi,r15d psrld xmm4,3 xor r14d,r8d add edx,r13d xor edi,r9d paddd xmm1,xmm7 ror r14d,2 add r11d,edx psrld xmm6,7 add edx,edi mov r13d,r11d pshufd xmm7,xmm0,250 add r14d,edx ror r13d,14 pslld xmm5,14 mov edx,r14d mov r12d,eax pxor xmm4,xmm6 ror r14d,9 xor r13d,r11d xor r12d,ebx ror r13d,5 psrld xmm6,11 xor r14d,edx pxor xmm4,xmm5 and r12d,r11d xor r13d,r11d pslld xmm5,11 add ecx,DWORD PTR[20+rsp] mov edi,edx pxor xmm4,xmm6 xor r12d,ebx ror r14d,11 movdqa xmm6,xmm7 xor edi,r8d add ecx,r12d pxor xmm4,xmm5 ror r13d,6 and r15d,edi xor r14d,edx psrld xmm7,10 add ecx,r13d xor r15d,r8d paddd xmm1,xmm4 ror r14d,2 add r10d,ecx psrlq xmm6,17 add ecx,r15d mov r13d,r10d add r14d,ecx pxor xmm7,xmm6 ror r13d,14 mov ecx,r14d mov r12d,r11d ror r14d,9 psrlq xmm6,2 xor r13d,r10d xor r12d,eax pxor xmm7,xmm6 ror r13d,5 xor r14d,ecx and r12d,r10d pshufd xmm7,xmm7,128 xor r13d,r10d add ebx,DWORD PTR[24+rsp] mov r15d,ecx psrldq xmm7,8 xor r12d,eax ror r14d,11 xor r15d,edx add ebx,r12d ror r13d,6 paddd xmm1,xmm7 and edi,r15d xor r14d,ecx add ebx,r13d pshufd xmm7,xmm1,80 xor edi,edx ror r14d,2 add r9d,ebx movdqa xmm6,xmm7 add ebx,edi mov r13d,r9d psrld xmm7,10 add r14d,ebx ror r13d,14 psrlq xmm6,17 mov ebx,r14d mov r12d,r10d pxor xmm7,xmm6 ror r14d,9 xor r13d,r9d xor r12d,r11d ror r13d,5 xor r14d,ebx psrlq xmm6,2 and r12d,r9d xor r13d,r9d add eax,DWORD PTR[28+rsp] pxor xmm7,xmm6 mov edi,ebx xor r12d,r11d ror r14d,11 pshufd xmm7,xmm7,8 xor edi,ecx add eax,r12d movdqa xmm6,XMMWORD PTR[16+rsi] ror r13d,6 and r15d,edi pslldq xmm7,8 xor r14d,ebx add eax,r13d xor r15d,ecx paddd xmm1,xmm7 ror r14d,2 add r8d,eax add eax,r15d paddd xmm6,xmm1 mov r13d,r8d add r14d,eax movdqa XMMWORD PTR[16+rsp],xmm6 ror r13d,14 movdqa xmm4,xmm3 mov eax,r14d mov r12d,r9d movdqa xmm7,xmm1 ror r14d,9 xor r13d,r8d xor r12d,r10d ror r13d,5 xor r14d,eax DB 102,15,58,15,226,4 and r12d,r8d xor r13d,r8d DB 102,15,58,15,248,4 add r11d,DWORD PTR[32+rsp] mov r15d,eax xor r12d,r10d ror r14d,11 movdqa xmm5,xmm4 xor r15d,ebx add r11d,r12d movdqa xmm6,xmm4 ror r13d,6 and edi,r15d psrld xmm4,3 xor r14d,eax add r11d,r13d xor edi,ebx paddd xmm2,xmm7 ror r14d,2 add edx,r11d psrld xmm6,7 add r11d,edi mov r13d,edx pshufd xmm7,xmm1,250 add r14d,r11d ror r13d,14 pslld xmm5,14 mov r11d,r14d mov r12d,r8d pxor xmm4,xmm6 ror r14d,9 xor r13d,edx xor r12d,r9d ror r13d,5 psrld xmm6,11 xor r14d,r11d pxor 
xmm4,xmm5 and r12d,edx xor r13d,edx pslld xmm5,11 add r10d,DWORD PTR[36+rsp] mov edi,r11d pxor xmm4,xmm6 xor r12d,r9d ror r14d,11 movdqa xmm6,xmm7 xor edi,eax add r10d,r12d pxor xmm4,xmm5 ror r13d,6 and r15d,edi xor r14d,r11d psrld xmm7,10 add r10d,r13d xor r15d,eax paddd xmm2,xmm4 ror r14d,2 add ecx,r10d psrlq xmm6,17 add r10d,r15d mov r13d,ecx add r14d,r10d pxor xmm7,xmm6 ror r13d,14 mov r10d,r14d mov r12d,edx ror r14d,9 psrlq xmm6,2 xor r13d,ecx xor r12d,r8d pxor xmm7,xmm6 ror r13d,5 xor r14d,r10d and r12d,ecx pshufd xmm7,xmm7,128 xor r13d,ecx add r9d,DWORD PTR[40+rsp] mov r15d,r10d psrldq xmm7,8 xor r12d,r8d ror r14d,11 xor r15d,r11d add r9d,r12d ror r13d,6 paddd xmm2,xmm7 and edi,r15d xor r14d,r10d add r9d,r13d pshufd xmm7,xmm2,80 xor edi,r11d ror r14d,2 add ebx,r9d movdqa xmm6,xmm7 add r9d,edi mov r13d,ebx psrld xmm7,10 add r14d,r9d ror r13d,14 psrlq xmm6,17 mov r9d,r14d mov r12d,ecx pxor xmm7,xmm6 ror r14d,9 xor r13d,ebx xor r12d,edx ror r13d,5 xor r14d,r9d psrlq xmm6,2 and r12d,ebx xor r13d,ebx add r8d,DWORD PTR[44+rsp] pxor xmm7,xmm6 mov edi,r9d xor r12d,edx ror r14d,11 pshufd xmm7,xmm7,8 xor edi,r10d add r8d,r12d movdqa xmm6,XMMWORD PTR[32+rsi] ror r13d,6 and r15d,edi pslldq xmm7,8 xor r14d,r9d add r8d,r13d xor r15d,r10d paddd xmm2,xmm7 ror r14d,2 add eax,r8d add r8d,r15d paddd xmm6,xmm2 mov r13d,eax add r14d,r8d movdqa XMMWORD PTR[32+rsp],xmm6 ror r13d,14 movdqa xmm4,xmm0 mov r8d,r14d mov r12d,ebx movdqa xmm7,xmm2 ror r14d,9 xor r13d,eax xor r12d,ecx ror r13d,5 xor r14d,r8d DB 102,15,58,15,227,4 and r12d,eax xor r13d,eax DB 102,15,58,15,249,4 add edx,DWORD PTR[48+rsp] mov r15d,r8d xor r12d,ecx ror r14d,11 movdqa xmm5,xmm4 xor r15d,r9d add edx,r12d movdqa xmm6,xmm4 ror r13d,6 and edi,r15d psrld xmm4,3 xor r14d,r8d add edx,r13d xor edi,r9d paddd xmm3,xmm7 ror r14d,2 add r11d,edx psrld xmm6,7 add edx,edi mov r13d,r11d pshufd xmm7,xmm2,250 add r14d,edx ror r13d,14 pslld xmm5,14 mov edx,r14d mov r12d,eax pxor xmm4,xmm6 ror r14d,9 xor r13d,r11d xor r12d,ebx ror r13d,5 psrld xmm6,11 xor r14d,edx pxor xmm4,xmm5 and r12d,r11d xor r13d,r11d pslld xmm5,11 add ecx,DWORD PTR[52+rsp] mov edi,edx pxor xmm4,xmm6 xor r12d,ebx ror r14d,11 movdqa xmm6,xmm7 xor edi,r8d add ecx,r12d pxor xmm4,xmm5 ror r13d,6 and r15d,edi xor r14d,edx psrld xmm7,10 add ecx,r13d xor r15d,r8d paddd xmm3,xmm4 ror r14d,2 add r10d,ecx psrlq xmm6,17 add ecx,r15d mov r13d,r10d add r14d,ecx pxor xmm7,xmm6 ror r13d,14 mov ecx,r14d mov r12d,r11d ror r14d,9 psrlq xmm6,2 xor r13d,r10d xor r12d,eax pxor xmm7,xmm6 ror r13d,5 xor r14d,ecx and r12d,r10d pshufd xmm7,xmm7,128 xor r13d,r10d add ebx,DWORD PTR[56+rsp] mov r15d,ecx psrldq xmm7,8 xor r12d,eax ror r14d,11 xor r15d,edx add ebx,r12d ror r13d,6 paddd xmm3,xmm7 and edi,r15d xor r14d,ecx add ebx,r13d pshufd xmm7,xmm3,80 xor edi,edx ror r14d,2 add r9d,ebx movdqa xmm6,xmm7 add ebx,edi mov r13d,r9d psrld xmm7,10 add r14d,ebx ror r13d,14 psrlq xmm6,17 mov ebx,r14d mov r12d,r10d pxor xmm7,xmm6 ror r14d,9 xor r13d,r9d xor r12d,r11d ror r13d,5 xor r14d,ebx psrlq xmm6,2 and r12d,r9d xor r13d,r9d add eax,DWORD PTR[60+rsp] pxor xmm7,xmm6 mov edi,ebx xor r12d,r11d ror r14d,11 pshufd xmm7,xmm7,8 xor edi,ecx add eax,r12d movdqa xmm6,XMMWORD PTR[48+rsi] ror r13d,6 and r15d,edi pslldq xmm7,8 xor r14d,ebx add eax,r13d xor r15d,ecx paddd xmm3,xmm7 ror r14d,2 add r8d,eax add eax,r15d paddd xmm6,xmm3 mov r13d,r8d add r14d,eax movdqa XMMWORD PTR[48+rsp],xmm6 cmp BYTE PTR[67+rsi],0 jne $L$ssse3_00_47 ror r13d,14 mov eax,r14d mov r12d,r9d ror r14d,9 xor r13d,r8d xor r12d,r10d ror r13d,5 xor r14d,eax 
and r12d,r8d xor r13d,r8d add r11d,DWORD PTR[rsp] mov r15d,eax xor r12d,r10d ror r14d,11 xor r15d,ebx add r11d,r12d ror r13d,6 and edi,r15d xor r14d,eax add r11d,r13d xor edi,ebx ror r14d,2 add edx,r11d add r11d,edi mov r13d,edx add r14d,r11d ror r13d,14 mov r11d,r14d mov r12d,r8d ror r14d,9 xor r13d,edx xor r12d,r9d ror r13d,5 xor r14d,r11d and r12d,edx xor r13d,edx add r10d,DWORD PTR[4+rsp] mov edi,r11d xor r12d,r9d ror r14d,11 xor edi,eax add r10d,r12d ror r13d,6 and r15d,edi xor r14d,r11d add r10d,r13d xor r15d,eax ror r14d,2 add ecx,r10d add r10d,r15d mov r13d,ecx add r14d,r10d ror r13d,14 mov r10d,r14d mov r12d,edx ror r14d,9 xor r13d,ecx xor r12d,r8d ror r13d,5 xor r14d,r10d and r12d,ecx xor r13d,ecx add r9d,DWORD PTR[8+rsp] mov r15d,r10d xor r12d,r8d ror r14d,11 xor r15d,r11d add r9d,r12d ror r13d,6 and edi,r15d xor r14d,r10d add r9d,r13d xor edi,r11d ror r14d,2 add ebx,r9d add r9d,edi mov r13d,ebx add r14d,r9d ror r13d,14 mov r9d,r14d mov r12d,ecx ror r14d,9 xor r13d,ebx xor r12d,edx ror r13d,5 xor r14d,r9d and r12d,ebx xor r13d,ebx add r8d,DWORD PTR[12+rsp] mov edi,r9d xor r12d,edx ror r14d,11 xor edi,r10d add r8d,r12d ror r13d,6 and r15d,edi xor r14d,r9d add r8d,r13d xor r15d,r10d ror r14d,2 add eax,r8d add r8d,r15d mov r13d,eax add r14d,r8d ror r13d,14 mov r8d,r14d mov r12d,ebx ror r14d,9 xor r13d,eax xor r12d,ecx ror r13d,5 xor r14d,r8d and r12d,eax xor r13d,eax add edx,DWORD PTR[16+rsp] mov r15d,r8d xor r12d,ecx ror r14d,11 xor r15d,r9d add edx,r12d ror r13d,6 and edi,r15d xor r14d,r8d add edx,r13d xor edi,r9d ror r14d,2 add r11d,edx add edx,edi mov r13d,r11d add r14d,edx ror r13d,14 mov edx,r14d mov r12d,eax ror r14d,9 xor r13d,r11d xor r12d,ebx ror r13d,5 xor r14d,edx and r12d,r11d xor r13d,r11d add ecx,DWORD PTR[20+rsp] mov edi,edx xor r12d,ebx ror r14d,11 xor edi,r8d add ecx,r12d ror r13d,6 and r15d,edi xor r14d,edx add ecx,r13d xor r15d,r8d ror r14d,2 add r10d,ecx add ecx,r15d mov r13d,r10d add r14d,ecx ror r13d,14 mov ecx,r14d mov r12d,r11d ror r14d,9 xor r13d,r10d xor r12d,eax ror r13d,5 xor r14d,ecx and r12d,r10d xor r13d,r10d add ebx,DWORD PTR[24+rsp] mov r15d,ecx xor r12d,eax ror r14d,11 xor r15d,edx add ebx,r12d ror r13d,6 and edi,r15d xor r14d,ecx add ebx,r13d xor edi,edx ror r14d,2 add r9d,ebx add ebx,edi mov r13d,r9d add r14d,ebx ror r13d,14 mov ebx,r14d mov r12d,r10d ror r14d,9 xor r13d,r9d xor r12d,r11d ror r13d,5 xor r14d,ebx and r12d,r9d xor r13d,r9d add eax,DWORD PTR[28+rsp] mov edi,ebx xor r12d,r11d ror r14d,11 xor edi,ecx add eax,r12d ror r13d,6 and r15d,edi xor r14d,ebx add eax,r13d xor r15d,ecx ror r14d,2 add r8d,eax add eax,r15d mov r13d,r8d add r14d,eax ror r13d,14 mov eax,r14d mov r12d,r9d ror r14d,9 xor r13d,r8d xor r12d,r10d ror r13d,5 xor r14d,eax and r12d,r8d xor r13d,r8d add r11d,DWORD PTR[32+rsp] mov r15d,eax xor r12d,r10d ror r14d,11 xor r15d,ebx add r11d,r12d ror r13d,6 and edi,r15d xor r14d,eax add r11d,r13d xor edi,ebx ror r14d,2 add edx,r11d add r11d,edi mov r13d,edx add r14d,r11d ror r13d,14 mov r11d,r14d mov r12d,r8d ror r14d,9 xor r13d,edx xor r12d,r9d ror r13d,5 xor r14d,r11d and r12d,edx xor r13d,edx add r10d,DWORD PTR[36+rsp] mov edi,r11d xor r12d,r9d ror r14d,11 xor edi,eax add r10d,r12d ror r13d,6 and r15d,edi xor r14d,r11d add r10d,r13d xor r15d,eax ror r14d,2 add ecx,r10d add r10d,r15d mov r13d,ecx add r14d,r10d ror r13d,14 mov r10d,r14d mov r12d,edx ror r14d,9 xor r13d,ecx xor r12d,r8d ror r13d,5 xor r14d,r10d and r12d,ecx xor r13d,ecx add r9d,DWORD PTR[40+rsp] mov r15d,r10d xor r12d,r8d ror r14d,11 xor r15d,r11d add r9d,r12d 
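; (editorial note: these final rounds, 48..63, only consume the message
;  schedule precomputed at [rsp]; the SSSE3 schedule updates interleaved
;  above are no longer needed)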
ror r13d,6 and edi,r15d xor r14d,r10d add r9d,r13d xor edi,r11d ror r14d,2 add ebx,r9d add r9d,edi mov r13d,ebx add r14d,r9d ror r13d,14 mov r9d,r14d mov r12d,ecx ror r14d,9 xor r13d,ebx xor r12d,edx ror r13d,5 xor r14d,r9d and r12d,ebx xor r13d,ebx add r8d,DWORD PTR[44+rsp] mov edi,r9d xor r12d,edx ror r14d,11 xor edi,r10d add r8d,r12d ror r13d,6 and r15d,edi xor r14d,r9d add r8d,r13d xor r15d,r10d ror r14d,2 add eax,r8d add r8d,r15d mov r13d,eax add r14d,r8d ror r13d,14 mov r8d,r14d mov r12d,ebx ror r14d,9 xor r13d,eax xor r12d,ecx ror r13d,5 xor r14d,r8d and r12d,eax xor r13d,eax add edx,DWORD PTR[48+rsp] mov r15d,r8d xor r12d,ecx ror r14d,11 xor r15d,r9d add edx,r12d ror r13d,6 and edi,r15d xor r14d,r8d add edx,r13d xor edi,r9d ror r14d,2 add r11d,edx add edx,edi mov r13d,r11d add r14d,edx ror r13d,14 mov edx,r14d mov r12d,eax ror r14d,9 xor r13d,r11d xor r12d,ebx ror r13d,5 xor r14d,edx and r12d,r11d xor r13d,r11d add ecx,DWORD PTR[52+rsp] mov edi,edx xor r12d,ebx ror r14d,11 xor edi,r8d add ecx,r12d ror r13d,6 and r15d,edi xor r14d,edx add ecx,r13d xor r15d,r8d ror r14d,2 add r10d,ecx add ecx,r15d mov r13d,r10d add r14d,ecx ror r13d,14 mov ecx,r14d mov r12d,r11d ror r14d,9 xor r13d,r10d xor r12d,eax ror r13d,5 xor r14d,ecx and r12d,r10d xor r13d,r10d add ebx,DWORD PTR[56+rsp] mov r15d,ecx xor r12d,eax ror r14d,11 xor r15d,edx add ebx,r12d ror r13d,6 and edi,r15d xor r14d,ecx add ebx,r13d xor edi,edx ror r14d,2 add r9d,ebx add ebx,edi mov r13d,r9d add r14d,ebx ror r13d,14 mov ebx,r14d mov r12d,r10d ror r14d,9 xor r13d,r9d xor r12d,r11d ror r13d,5 xor r14d,ebx and r12d,r9d xor r13d,r9d add eax,DWORD PTR[60+rsp] mov edi,ebx xor r12d,r11d ror r14d,11 xor edi,ecx add eax,r12d ror r13d,6 and r15d,edi xor r14d,ebx add eax,r13d xor r15d,ecx ror r14d,2 add r8d,eax add eax,r15d mov r13d,r8d add r14d,eax mov rdi,QWORD PTR[((-64))+rbp] mov eax,r14d mov rsi,QWORD PTR[((-56))+rbp] ifdef __SGX_LVI_HARDENING__ lfence endif add eax,DWORD PTR[rdi] add ebx,DWORD PTR[4+rdi] add ecx,DWORD PTR[8+rdi] add edx,DWORD PTR[12+rdi] add r8d,DWORD PTR[16+rdi] add r9d,DWORD PTR[20+rdi] add r10d,DWORD PTR[24+rdi] add r11d,DWORD PTR[28+rdi] lea rsi,QWORD PTR[64+rsi] cmp rsi,QWORD PTR[((-48))+rbp] mov DWORD PTR[rdi],eax mov DWORD PTR[4+rdi],ebx mov DWORD PTR[8+rdi],ecx mov DWORD PTR[12+rdi],edx mov DWORD PTR[16+rdi],r8d mov DWORD PTR[20+rdi],r9d mov DWORD PTR[24+rdi],r10d mov DWORD PTR[28+rdi],r11d jb $L$loop_ssse3 xorps xmm0,xmm0 movaps XMMWORD PTR[rsp],xmm0 movaps XMMWORD PTR[16+rsp],xmm0 movaps XMMWORD PTR[32+rsp],xmm0 movaps XMMWORD PTR[48+rsp],xmm0 movaps xmm6,XMMWORD PTR[((-128))+rbp] movaps xmm7,XMMWORD PTR[((-112))+rbp] movaps xmm8,XMMWORD PTR[((-96))+rbp] movaps xmm9,XMMWORD PTR[((-80))+rbp] mov r15,QWORD PTR[((-40))+rbp] mov r14,QWORD PTR[((-32))+rbp] mov r13,QWORD PTR[((-24))+rbp] mov r12,QWORD PTR[((-16))+rbp] mov rbx,QWORD PTR[((-8))+rbp] mov rsp,rbp pop rbp $L$SEH_epilogue_blst_sha256_block_data_order:: mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue mov rsi,QWORD PTR[16+rsp] ifdef __SGX_LVI_HARDENING__ pop rdx lfence jmp rdx ud2 else DB 0F3h,0C3h endif $L$SEH_end_blst_sha256_block_data_order:: blst_sha256_block_data_order ENDP PUBLIC blst_sha256_emit ALIGN 16 blst_sha256_emit PROC PUBLIC DB 243,15,30,250 ifdef __SGX_LVI_HARDENING__ lfence endif mov r8,QWORD PTR[rdx] mov r9,QWORD PTR[8+rdx] mov r10,QWORD PTR[16+rdx] bswap r8 mov r11,QWORD PTR[24+rdx] bswap r9 mov DWORD PTR[4+rcx],r8d bswap r10 mov DWORD PTR[12+rcx],r9d bswap r11 mov DWORD PTR[20+rcx],r10d shr r8,32 mov DWORD PTR[28+rcx],r11d shr r9,32 mov 
DWORD PTR[rcx],r8d shr r10,32 mov DWORD PTR[8+rcx],r9d shr r11,32 mov DWORD PTR[16+rcx],r10d mov DWORD PTR[24+rcx],r11d ifdef __SGX_LVI_HARDENING__ pop rdx lfence jmp rdx ud2 else DB 0F3h,0C3h endif blst_sha256_emit ENDP PUBLIC blst_sha256_bcopy ALIGN 16 blst_sha256_bcopy PROC PUBLIC DB 243,15,30,250 ifdef __SGX_LVI_HARDENING__ lfence endif sub rcx,rdx $L$oop_bcopy:: movzx eax,BYTE PTR[rdx] lea rdx,QWORD PTR[1+rdx] mov BYTE PTR[((-1))+rdx*1+rcx],al dec r8 jnz $L$oop_bcopy ifdef __SGX_LVI_HARDENING__ pop rdx lfence jmp rdx ud2 else DB 0F3h,0C3h endif blst_sha256_bcopy ENDP PUBLIC blst_sha256_hcopy ALIGN 16 blst_sha256_hcopy PROC PUBLIC DB 243,15,30,250 ifdef __SGX_LVI_HARDENING__ lfence endif mov r8,QWORD PTR[rdx] mov r9,QWORD PTR[8+rdx] mov r10,QWORD PTR[16+rdx] mov r11,QWORD PTR[24+rdx] mov QWORD PTR[rcx],r8 mov QWORD PTR[8+rcx],r9 mov QWORD PTR[16+rcx],r10 mov QWORD PTR[24+rcx],r11 ifdef __SGX_LVI_HARDENING__ pop rdx lfence jmp rdx ud2 else DB 0F3h,0C3h endif blst_sha256_hcopy ENDP .text$ ENDS .pdata SEGMENT READONLY ALIGN(4) ALIGN 4 DD imagerel $L$SEH_begin_blst_sha256_block_data_order_shaext DD imagerel $L$SEH_body_blst_sha256_block_data_order_shaext DD imagerel $L$SEH_info_blst_sha256_block_data_order_shaext_prologue DD imagerel $L$SEH_body_blst_sha256_block_data_order_shaext DD imagerel $L$SEH_epilogue_blst_sha256_block_data_order_shaext DD imagerel $L$SEH_info_blst_sha256_block_data_order_shaext_body DD imagerel $L$SEH_epilogue_blst_sha256_block_data_order_shaext DD imagerel $L$SEH_end_blst_sha256_block_data_order_shaext DD imagerel $L$SEH_info_blst_sha256_block_data_order_shaext_epilogue DD imagerel $L$SEH_begin_blst_sha256_block_data_order DD imagerel $L$SEH_body_blst_sha256_block_data_order DD imagerel $L$SEH_info_blst_sha256_block_data_order_prologue DD imagerel $L$SEH_body_blst_sha256_block_data_order DD imagerel $L$SEH_epilogue_blst_sha256_block_data_order DD imagerel $L$SEH_info_blst_sha256_block_data_order_body DD imagerel $L$SEH_epilogue_blst_sha256_block_data_order DD imagerel $L$SEH_end_blst_sha256_block_data_order DD imagerel $L$SEH_info_blst_sha256_block_data_order_epilogue .pdata ENDS .xdata SEGMENT READONLY ALIGN(8) ALIGN 8 $L$SEH_info_blst_sha256_block_data_order_shaext_prologue:: DB 1,4,6,005h DB 4,074h,2,0 DB 4,064h,3,0 DB 4,053h DB 1,050h DD 0,0 $L$SEH_info_blst_sha256_block_data_order_shaext_body:: DB 1,0,17,85 DB 000h,068h,000h,000h DB 000h,078h,001h,000h DB 000h,088h,002h,000h DB 000h,098h,003h,000h DB 000h,0a8h,004h,000h DB 000h,074h,00ch,000h DB 000h,064h,00dh,000h DB 000h,053h DB 000h,092h DB 000h,050h DB 000h,000h,000h,000h,000h,000h DB 000h,000h,000h,000h $L$SEH_info_blst_sha256_block_data_order_shaext_epilogue:: DB 1,0,4,0 DB 000h,074h,001h,000h DB 000h,064h,002h,000h DB 000h,000h,000h,000h $L$SEH_info_blst_sha256_block_data_order_prologue:: DB 1,4,6,005h DB 4,074h,2,0 DB 4,064h,3,0 DB 4,053h DB 1,050h DD 0,0 $L$SEH_info_blst_sha256_block_data_order_body:: DB 1,0,25,133 DB 000h,068h,000h,000h DB 000h,078h,001h,000h DB 000h,088h,002h,000h DB 000h,098h,003h,000h DB 000h,0f4h,00bh,000h DB 000h,0e4h,00ch,000h DB 000h,0d4h,00dh,000h DB 000h,0c4h,00eh,000h DB 000h,034h,00fh,000h DB 000h,074h,012h,000h DB 000h,064h,013h,000h DB 000h,053h DB 000h,0f2h DB 000h,050h DB 000h,000h,000h,000h,000h,000h DB 000h,000h,000h,000h $L$SEH_info_blst_sha256_block_data_order_epilogue:: DB 1,0,4,0 DB 000h,074h,001h,000h DB 000h,064h,002h,000h DB 000h,000h,000h,000h .xdata ENDS END ================================================ FILE: build.bat 
================================================
@echo off
SETLOCAL

set PATH=%windir%\system32;%PATH% &:: override msys if it's on the PATH
set TOP=%~dp0
set CFLAGS=/nologo /c /O2 /Zi /Fdblst.pdb /W4

cl 2>&1 | find "for ARM64" > nul:
IF ERRORLEVEL 1 (
    set arm64=no
    FOR %%F IN (%TOP%build\win64\*-x86_64.asm) DO (
        ml64 /nologo /c /Cp /Cx /Zi %%F || EXIT /B
    )
) ELSE (
    set arm64=yes
    FOR %%F IN (%TOP%build\win64\*-armv8.asm) DO (
        armasm64 -nologo %%F || EXIT /B
    )
)

SETLOCAL ENABLEDELAYEDEXPANSION
set static=/out:blst.lib
set shared=
set arm64x=
FOR %%O IN (%*) DO (
    set opt=%%O
    IF "!opt!" == "-shared" (
        IF [!shared!] EQU [] set shared=/out:blst.dll
    ) ELSE IF "!opt!" == "-dll" (
        IF [!shared!] EQU [] set shared=/out:blst.dll
    ) ELSE IF "!opt:~0,5!" == "/out:" (
        IF "!opt:~-4!" == ".dll" (set shared=!opt!) ELSE (set static=!opt!)
    ) ELSE IF "!opt!" == "-arm64x" (
        set arm64x=%arm64%
    )
)

IF [%shared%] NEQ [] (
    cl %CFLAGS% /MD /D__BLST_DLL_MAIN__ %TOP%src\server.c || EXIT /B
    set ld=
    FOR /F "usebackq delims=" %%F IN (`where link`) DO (
        IF "!ld!" == "" (
            "%%F" 2>&1 | find "Linker" > nul:
            IF !ERRORLEVEL! EQU 0 set ld="%%F"
        )
    )
    IF [%arm64x%] NEQ [yes] (
        !ld! /nologo /debug /dll /entry:DllMain /incremental:no %shared% ^
            /def:%TOP%build\win64\blst.def *.obj kernel32.lib && del *.obj
    ) ELSE (
        lib /nologo /out:blst_arm64.lib *.obj && del *.obj || EXIT /B
        FOR %%F IN (%TOP%build\win64\*-armv8.asm) DO (
            armasm64 -nologo -machine arm64ec -nowarn %%F || EXIT /B
        )
        cl /arm64EC %CFLAGS% /MD /D__BLST_DLL_MAIN__ %TOP%src\server.c || EXIT /B
        !ld! /nologo /machine:arm64x /dll /noentry %shared% ^
            /def:%TOP%build\win64\blst.def *.obj ^
            /defArm64Native:%TOP%build\win64\blst.def blst_arm64.lib ^
            kernel32.lib && del *.obj blst_arm64.lib
    )
) ELSE (
    cl %CFLAGS% /MT /Zl %TOP%src\server.c || EXIT /B
    lib /nologo %static% *.obj && del *.obj
)
ENDLOCAL
EXIT /B

================================================
FILE: build.sh
================================================
#!/bin/sh

set -e
#
# The script allows one to override 'CC', 'CFLAGS' and 'flavour' at the
# command line, as well as to specify additional compiler flags. For
# example, to compile for x32:
#
#	/some/where/build.sh flavour=elf32 -mx32
#
# To cross-compile for mingw/Windows:
#
#	/some/where/build.sh flavour=mingw64 CC=x86_64-w64-mingw32-gcc
#
# In addition, the script recognizes the -shared flag and creates a
# shared library alongside libblst.a.
#
# To cross-compile for WebAssembly with Emscripten SDK:
#
#	/some/where/build.sh CROSS_COMPILE=em

[ -d /usr/xpg4/bin ] && PATH=/usr/xpg4/bin:$PATH # Solaris

TOP=`dirname $0`

# if -Werror stands in the way, bypass it with -Wno-error on the command
# line, or suppress a specific one with -Wno-
CFLAGS=${CFLAGS:--O2 -fno-builtin -fPIC -Wall -Wextra -Werror}
PERL=${PERL:-perl}

unset cflags shared dll
while [ "x$1" != "x" ]; do
    case $1 in
        -shared)    shared=1;;
        -dll)       shared=1; dll=".dll";;
        -m*)        CFLAGS="$CFLAGS $1";;
        -target|-arch)
                    if expr "$CFLAGS" : ".*-arch " >/dev/null; then
                        cflags="$cflags $1 $2"
                    else
                        CFLAGS="$CFLAGS $1 $2"
                    fi
                    shift;;
        -*target*)  CFLAGS="$CFLAGS $1";;
        -*)         cflags="$cflags $1";;
        *=*)        eval "$1";;
    esac
    shift
done

if [ "x$CC" = "x" ]; then
    CC=gcc
    which ${CROSS_COMPILE}cc >/dev/null 2>&1 && CC=cc
fi
if which ${CROSS_COMPILE}${CC} >/dev/null 2>&1; then
    CC=${CROSS_COMPILE}${CC}
fi
if [ "x$CROSS_COMPILE" = "x" ]; then
    CROSS_COMPILE=`echo $CC | awk '{ print substr($1,0,match($1,"-(g?cc|clang)$")) }' 2>/dev/null`
    # fix up android prefix...
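    # (e.g. CC=aarch64-linux-android21-clang gives CROSS_COMPILE=aarch64-linux-android21-,
    #  which the rewrite below turns into aarch64-linux-android-; the version
    #  number in this example is arbitrary)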
    CROSS_COMPILE=`echo $CROSS_COMPILE | awk '{ off=match($1,"-android[0-9]+-"); if (off) { printf "%sandroid-\n",substr($1,0,off) } else { print $1 } }'`
fi

predefs=`(${CC} ${CFLAGS} -dM -E -x c /dev/null || true) 2>/dev/null`

if [ -z "${CROSS_COMPILE}${AR}" ] && echo ${predefs} | grep -q clang; then
    search_dirs=`${CC} -print-search-dirs | awk -F= '/^programs:/{print$2}' | \
                 (sed -E -e 's/([a-z]):\\\/\/\1\//gi' -e 'y/\\\;/\/:/' 2>/dev/null || true)`
    if [ -n "$search_dirs" ] && \
       env PATH="$search_dirs:$PATH" which llvm-ar > /dev/null 2>&1; then
        PATH="$search_dirs:$PATH"
        AR=llvm-ar
        RANLIB=llvm-ranlib
    fi
fi
AR=${AR:-${CROSS_COMPILE}ar}
RANLIB=${RANLIB:-${CROSS_COMPILE}ranlib}

if [ -z "${flavour}" ]; then
    if echo ${predefs} | grep -q __APPLE__; then
        flavour=macosx
    elif echo ${predefs} | grep -q _WIN32; then
        flavour=mingw64
        if [ $shared ]; then
            cflags="$cflags -D__BLST_DLL_MAIN__"
        fi
    else
        flavour=elf
    fi
fi

if echo ${predefs} | grep -q x86_64; then
    case `uname -s` in
        Darwin) if [ "`sysctl -n hw.optional.adx 2>/dev/null`" = "1" ]; then
                    cflags="-D__ADX__ $cflags"
                fi;;
        *)      if (grep -q -e '^flags.*\badx\b' /proc/cpuinfo) 2>/dev/null; then
                    cflags="-D__ADX__ $cflags"
                fi;;
    esac
fi
if echo ${predefs} | grep -q __AVX__; then
    cflags="$cflags -mno-avx" # avoid costly transitions
fi
if echo ${predefs} | grep -E -q 'x86_64|aarch64'; then :; else
    cflags="$cflags -D__BLST_NO_ASM__"
fi
CFLAGS="$CFLAGS $cflags"

TMPDIR=${TMPDIR:-/tmp}

rm -f libblst.a
trap '[ $? -ne 0 ] && rm -f libblst.a; rm -f *.o ${TMPDIR}/*.blst.$$' 0

(set -x; ${CC} ${CFLAGS} -c ${TOP}/src/server.c)
(set -x; ${CC} ${CFLAGS} -c ${TOP}/build/assembly.S)
(set -x; ${AR} rc libblst.a *.o)
which ${RANLIB} > /dev/null 2>&1 && (set -x; ${RANLIB} libblst.a)

if [ $shared ]; then
    case $flavour in
        macosx) (set -x; ${CC} -dynamiclib -o libblst$dll.dylib \
                         -all_load libblst.a ${CFLAGS}); exit 0;;
        mingw*) sharedlib="blst.dll ${TOP}/build/win64/blst.def"
                CFLAGS="${CFLAGS} -Wl,--entry=DllMain -nostartfiles";;
        *)      sharedlib=libblst$dll.so
                CFLAGS="${CFLAGS} -Wl,-Bsymbolic";;
    esac
    (set -x; ${CC} -shared -o $sharedlib \
             -Wl,--whole-archive,libblst.a,--no-whole-archive ${CFLAGS})
fi

================================================
FILE: build.zig
================================================
const std = @import("std");

pub fn build(b: *std.Build) void {
    const target = b.standardTargetOptions(.{});
    const optimize = b.standardOptimizeOption(.{});

    const mod = b.addModule("blst", .{
        .root_source_file = b.path("bindings/zig/blst.zig"),
        .target = target,
        .optimize = optimize,
    });

    const lib = b.addLibrary(.{
        .name = "blst",
        .linkage = .static,
        .root_module = mod,
    });

    const cfiles = &[_][]const u8{
        "src/server.c",
        "build/assembly.S",
    };
    const cflags = &[_][]const u8{
        "-O2", "-ffreestanding", "-D__BLST_PORTABLE__", "-D__BLST_NO_ASM__",
    };
    switch (target.result.cpu.arch) {
        .aarch64, .x86_64 => lib.addCSourceFiles(.{
            .files = cfiles,
            .flags = cflags[0 .. cflags.len-1],
        }),
        else => lib.addCSourceFiles(.{
            .files = cfiles[0 .. cfiles.len-1],
            .flags = cflags,
        }),
    }

    if (target.result.os.tag == .windows) {
        lib.linkLibC();
    }

    const tests = b.addTest(.{
        .root_module = b.createModule(.{
            .root_source_file = b.path("bindings/zig/tests.zig"),
            .target = target,
            .optimize = optimize,
            .imports = &.{.{ .name = "blst", .module = mod }},
        }),
    });

    b.step("test", "Run test[s]").dependOn(&b.addRunArtifact(tests).step);
}

================================================
FILE: build.zig.zon
================================================
.{
    .name = .blst,
    .version = "0.3.16",
    .minimum_zig_version = "0.14.0",
    .paths = .{
        "build.zig",
        "build.zig.zon",
        "bindings/zig",
        "src",
        "build",
    },
    .fingerprint = 0xa2dc4dc0d564fc7e,
}

================================================
FILE: src/aggregate.c
================================================
/*
 * Copyright Supranational LLC
 * Licensed under the Apache License, Version 2.0, see LICENSE for details.
 * SPDX-License-Identifier: Apache-2.0
 */

/*
 * Usage pattern on a single-processor system is
 *
 * blst_pairing_init(ctx, hash_or_encode, DST);
 * blst_pairing_aggregate_pk_in_g1(ctx, PK[0], aggregated_signature, msg[0]);
 * blst_pairing_aggregate_pk_in_g1(ctx, PK[1], NULL, msg[1]);
 * ...
 * blst_pairing_commit(ctx);
 * blst_pairing_finalverify(ctx, NULL);
 *
 ***********************************************************************
 * Usage pattern on a multi-processor system is
 *
 * blst_pairing_init(pk[0], hash_or_encode, DST);
 * blst_pairing_init(pk[1], hash_or_encode, DST);
 * ...
 * start threads, each processing an N/nthreads slice of PKs and messages:
 * blst_pairing_aggregate_pk_in_g1(pk[i], PK[i*n+0], NULL, msg[i*n+0]);
 * blst_pairing_aggregate_pk_in_g1(pk[i], PK[i*n+1], NULL, msg[i*n+1]);
 * ...
 * blst_pairing_commit(pkx);
 * ...
 * meanwhile in the main thread
 * blst_fp12 gtsig;
 * blst_aggregated_in_g2(&gtsig, aggregated_signature);
 * join threads and merge their contexts:
 * blst_pairing_merge(pk[0], pk[1]);
 * blst_pairing_merge(pk[0], pk[2]);
 * ...
 * blst_pairing_finalverify(pk[0], gtsig);
 */

#ifndef N_MAX
# define N_MAX 8
#endif

typedef union { POINTonE1 e1; POINTonE2 e2; } AggregatedSignature;

typedef struct {
    unsigned int ctrl;
    unsigned int nelems;
    const void *DST;
    size_t DST_len;
    vec384fp12 GT;
    AggregatedSignature AggrSign;
    POINTonE2_affine Q[N_MAX];
    POINTonE1_affine P[N_MAX];
} PAIRING;

enum { AGGR_UNDEFINED = 0,
       AGGR_MIN_SIG = 1,
       AGGR_MIN_PK = 2,
       AGGR_SIGN_SET = 0x10,
       AGGR_GT_SET = 0x20,
       AGGR_HASH_OR_ENCODE = 0x40 };
#define MIN_SIG_OR_PK (AGGR_MIN_SIG | AGGR_MIN_PK)

static const size_t sizeof_pairing = (sizeof(PAIRING) + 7) & ~(size_t)7;

size_t blst_pairing_sizeof(void)
{ return sizeof_pairing; }

void blst_pairing_init(PAIRING *ctx, int hash_or_encode,
                       const void *DST, size_t DST_len)
{
    ctx->ctrl = AGGR_UNDEFINED | (hash_or_encode ? AGGR_HASH_OR_ENCODE : 0);
    ctx->nelems = 0;
    ctx->DST = (uptr_t)DST==(uptr_t)((byte *)ctx+sizeof_pairing) ? (void *)42
                                                                 : DST;
    ctx->DST_len = DST_len;
}

static const void *pairing_get_dst(const PAIRING *ctx)
{ return (uptr_t)ctx->DST==(uptr_t)42 ? (const byte *)ctx+sizeof_pairing
                                      : ctx->DST;
}

const void *blst_pairing_get_dst(const PAIRING *ctx)
{ return pairing_get_dst(ctx); }

#define FROM_AFFINE(out,in) do { \
    vec_copy((out)->X, in->X, 2*sizeof(in->X)), \
    vec_select((out)->Z, in->X, BLS12_381_Rx.p, sizeof(in->X), \
               vec_is_zero(in->X, 2*sizeof(in->X))); } while(0)

/*
 * Optional |nbits|-wide |scalar| is used to facilitate multiple aggregated
 * signature verification as discussed at
 * https://ethresear.ch/t/fast-verification-of-multiple-bls-signatures/5407.
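 *
 * An illustrative sketch of that batch pattern (an editorial example, not
 * upstream code; csprng() stands in for a caller-provided CSPRNG):
 *
 *	byte rand[8];
 *	for (i = 0; i < k; i++) {
 *	    csprng(rand, sizeof(rand));
 *	    blst_pairing_mul_n_aggregate_pk_in_g1(ctx, PK[i], sig[i],
 *	                                          rand, 64, msg[i], msg_len,
 *	                                          NULL, 0);
 *	}
 *	blst_pairing_commit(ctx);
 *	blst_pairing_finalverify(ctx, NULL);
 *
 * A distinct random |scalar| per tuple prevents maliciously related
 * (signature, message) pairs from cancelling one another in the final
 * pairing product.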
* Usage pattern is not finalized yet, because (sig != NULL) is better and * will be handled separately... */ static BLST_ERROR PAIRING_Aggregate_PK_in_G2(PAIRING *ctx, const POINTonE2_affine *PK, size_t pk_groupcheck, const POINTonE1_affine *sig, size_t sig_groupcheck, const byte *scalar, size_t nbits, const void *msg, size_t msg_len, const void *aug, size_t aug_len) { if (ctx->ctrl & AGGR_MIN_PK) return BLST_AGGR_TYPE_MISMATCH; ctx->ctrl |= AGGR_MIN_SIG; /* * Since we don't know if the signature is individual or aggregated, * the only sensible thing to do is to skip over infinite one and * count on the corresponding infinite public key to be rejected, * in case the signature is non-aggregated that is. */ if (sig != NULL && !vec_is_zero(sig, sizeof(*sig))) { POINTonE1 *S = &ctx->AggrSign.e1; POINTonE1 P[1]; FROM_AFFINE(P, sig); if (sig_groupcheck && !POINTonE1_in_G1(P)) return BLST_POINT_NOT_IN_GROUP; if (ctx->ctrl & AGGR_SIGN_SET) { if (nbits != 0 && scalar != NULL) { POINTonE1_mult_w5(P, P, scalar, nbits); POINTonE1_dadd(S, S, P, NULL); } else { POINTonE1_dadd_affine(S, S, sig); } } else { ctx->ctrl |= AGGR_SIGN_SET; if (nbits != 0 && scalar != NULL) POINTonE1_mult_w5(S, P, scalar, nbits); else vec_copy(S, P, sizeof(P)); } } if (PK != NULL) { unsigned int n; POINTonE1 H[1]; const void *DST = pairing_get_dst(ctx); /* * Reject infinite public keys. */ if (vec_is_zero(PK, sizeof(*PK))) return BLST_PK_IS_INFINITY; if (pk_groupcheck) { POINTonE2 P[1]; FROM_AFFINE(P, PK); if (!POINTonE2_in_G2(P)) return BLST_POINT_NOT_IN_GROUP; } if (ctx->ctrl & AGGR_HASH_OR_ENCODE) Hash_to_G1(H, msg, msg_len, DST, ctx->DST_len, aug, aug_len); else Encode_to_G1(H, msg, msg_len, DST, ctx->DST_len, aug, aug_len); if (nbits != 0 && scalar != NULL) POINTonE1_mult_w5(H, H, scalar, nbits); POINTonE1_from_Jacobian(H, H); n = ctx->nelems; vec_copy(ctx->Q + n, PK, sizeof(POINTonE2_affine)); vec_copy(ctx->P + n, H, sizeof(POINTonE1_affine)); if (++n == N_MAX) { if (ctx->ctrl & AGGR_GT_SET) { vec384fp12 GT; miller_loop_n(GT, ctx->Q, ctx->P, n); mul_fp12(ctx->GT, ctx->GT, GT); } else { miller_loop_n(ctx->GT, ctx->Q, ctx->P, n); ctx->ctrl |= AGGR_GT_SET; } n = 0; } ctx->nelems = n; } return BLST_SUCCESS; } BLST_ERROR blst_pairing_aggregate_pk_in_g2(PAIRING *ctx, const POINTonE2_affine *PK, const POINTonE1_affine *signature, const void *msg, size_t msg_len, const void *aug, size_t aug_len) { return PAIRING_Aggregate_PK_in_G2(ctx, PK, 0, signature, 1, NULL, 0, msg, msg_len, aug, aug_len); } BLST_ERROR blst_pairing_mul_n_aggregate_pk_in_g2(PAIRING *ctx, const POINTonE2_affine *PK, const POINTonE1_affine *sig, const byte *scalar, size_t nbits, const void *msg, size_t msg_len, const void *aug, size_t aug_len) { return PAIRING_Aggregate_PK_in_G2(ctx, PK, 0, sig, 1, scalar, nbits, msg, msg_len, aug, aug_len); } BLST_ERROR blst_pairing_chk_n_aggr_pk_in_g2(PAIRING *ctx, const POINTonE2_affine *PK, size_t pk_grpchk, const POINTonE1_affine *signature, size_t sig_grpchk, const void *msg, size_t msg_len, const void *aug, size_t aug_len) { return PAIRING_Aggregate_PK_in_G2(ctx, PK, pk_grpchk, signature, sig_grpchk, NULL, 0, msg, msg_len, aug, aug_len); } BLST_ERROR blst_pairing_chk_n_mul_n_aggr_pk_in_g2(PAIRING *ctx, const POINTonE2_affine *PK, size_t pk_grpchk, const POINTonE1_affine *sig, size_t sig_grpchk, const byte *scalar, size_t nbits, const void *msg, size_t msg_len, const void *aug, size_t aug_len) { return PAIRING_Aggregate_PK_in_G2(ctx, PK, pk_grpchk, sig, sig_grpchk, scalar, nbits, msg, msg_len, aug, aug_len); } static 
BLST_ERROR PAIRING_Aggregate_PK_in_G1(PAIRING *ctx, const POINTonE1_affine *PK, size_t pk_groupcheck, const POINTonE2_affine *sig, size_t sig_groupcheck, const byte *scalar, size_t nbits, const void *msg, size_t msg_len, const void *aug, size_t aug_len) { if (ctx->ctrl & AGGR_MIN_SIG) return BLST_AGGR_TYPE_MISMATCH; ctx->ctrl |= AGGR_MIN_PK; /* * Since we don't know if the signature is individual or aggregated, * the only sensible thing to do is to skip over infinite one and * count on the corresponding infinite public key to be rejected, * in case the signature is non-aggregated that is. */ if (sig != NULL && !vec_is_zero(sig, sizeof(*sig))) { POINTonE2 *S = &ctx->AggrSign.e2; POINTonE2 P[1]; FROM_AFFINE(P, sig); if (sig_groupcheck && !POINTonE2_in_G2(P)) return BLST_POINT_NOT_IN_GROUP; if (ctx->ctrl & AGGR_SIGN_SET) { if (nbits != 0 && scalar != NULL) { POINTonE2_mult_w5(P, P, scalar, nbits); POINTonE2_dadd(S, S, P, NULL); } else { POINTonE2_dadd_affine(S, S, sig); } } else { ctx->ctrl |= AGGR_SIGN_SET; if (nbits != 0 && scalar != NULL) POINTonE2_mult_w5(S, P, scalar, nbits); else vec_copy(S, P, sizeof(P)); } } if (PK != NULL) { unsigned int n; POINTonE2 H[1]; POINTonE1 pk[1]; const void *DST = pairing_get_dst(ctx); /* * Reject infinite public keys. */ if (vec_is_zero(PK, sizeof(*PK))) return BLST_PK_IS_INFINITY; if (pk_groupcheck) { POINTonE1 P[1]; FROM_AFFINE(P, PK); if (!POINTonE1_in_G1(P)) return BLST_POINT_NOT_IN_GROUP; } if (ctx->ctrl & AGGR_HASH_OR_ENCODE) Hash_to_G2(H, msg, msg_len, DST, ctx->DST_len, aug, aug_len); else Encode_to_G2(H, msg, msg_len, DST, ctx->DST_len, aug, aug_len); POINTonE2_from_Jacobian(H, H); if (nbits != 0 && scalar != NULL) { FROM_AFFINE(pk, PK); POINTonE1_mult_w5(pk, pk, scalar, nbits); POINTonE1_from_Jacobian(pk, pk); PK = (const POINTonE1_affine *)pk; } n = ctx->nelems; vec_copy(ctx->Q + n, H, sizeof(POINTonE2_affine)); vec_copy(ctx->P + n, PK, sizeof(POINTonE1_affine)); if (++n == N_MAX) { if (ctx->ctrl & AGGR_GT_SET) { vec384fp12 GT; miller_loop_n(GT, ctx->Q, ctx->P, n); mul_fp12(ctx->GT, ctx->GT, GT); } else { miller_loop_n(ctx->GT, ctx->Q, ctx->P, n); ctx->ctrl |= AGGR_GT_SET; } n = 0; } ctx->nelems = n; } return BLST_SUCCESS; } BLST_ERROR blst_pairing_aggregate_pk_in_g1(PAIRING *ctx, const POINTonE1_affine *PK, const POINTonE2_affine *signature, const void *msg, size_t msg_len, const void *aug, size_t aug_len) { return PAIRING_Aggregate_PK_in_G1(ctx, PK, 0, signature, 1, NULL, 0, msg, msg_len, aug, aug_len); } BLST_ERROR blst_pairing_mul_n_aggregate_pk_in_g1(PAIRING *ctx, const POINTonE1_affine *PK, const POINTonE2_affine *sig, const byte *scalar, size_t nbits, const void *msg, size_t msg_len, const void *aug, size_t aug_len) { return PAIRING_Aggregate_PK_in_G1(ctx, PK, 0, sig, 1, scalar, nbits, msg, msg_len, aug, aug_len); } BLST_ERROR blst_pairing_chk_n_aggr_pk_in_g1(PAIRING *ctx, const POINTonE1_affine *PK, size_t pk_grpchk, const POINTonE2_affine *signature, size_t sig_grpchk, const void *msg, size_t msg_len, const void *aug, size_t aug_len) { return PAIRING_Aggregate_PK_in_G1(ctx, PK, pk_grpchk, signature, sig_grpchk, NULL, 0, msg, msg_len, aug, aug_len); } BLST_ERROR blst_pairing_chk_n_mul_n_aggr_pk_in_g1(PAIRING *ctx, const POINTonE1_affine *PK, size_t pk_grpchk, const POINTonE2_affine *sig, size_t sig_grpchk, const byte *scalar, size_t nbits, const void *msg, size_t msg_len, const void *aug, size_t aug_len) { return PAIRING_Aggregate_PK_in_G1(ctx, PK, pk_grpchk, sig, sig_grpchk, scalar, nbits, msg, msg_len, aug, aug_len); } static void 
PAIRING_Commit(PAIRING *ctx) { unsigned int n; if ((n = ctx->nelems) != 0) { if (ctx->ctrl & AGGR_GT_SET) { vec384fp12 GT; miller_loop_n(GT, ctx->Q, ctx->P, n); mul_fp12(ctx->GT, ctx->GT, GT); } else { miller_loop_n(ctx->GT, ctx->Q, ctx->P, n); ctx->ctrl |= AGGR_GT_SET; } ctx->nelems = 0; } } void blst_pairing_commit(PAIRING *ctx) { PAIRING_Commit(ctx); } BLST_ERROR blst_pairing_merge(PAIRING *ctx, const PAIRING *ctx1) { if ((ctx->ctrl & MIN_SIG_OR_PK) != AGGR_UNDEFINED && (ctx1->ctrl & MIN_SIG_OR_PK) != AGGR_UNDEFINED && (ctx->ctrl & ctx1->ctrl & MIN_SIG_OR_PK) == 0) return BLST_AGGR_TYPE_MISMATCH; /* context producers are expected to have called blst_pairing_commit */ if (ctx->nelems || ctx1->nelems) return BLST_AGGR_TYPE_MISMATCH; ctx->ctrl |= ctx1->ctrl & MIN_SIG_OR_PK; switch (ctx->ctrl & MIN_SIG_OR_PK) { case AGGR_MIN_SIG: if (ctx->ctrl & ctx1->ctrl & AGGR_SIGN_SET) { POINTonE1_dadd(&ctx->AggrSign.e1, &ctx->AggrSign.e1, &ctx1->AggrSign.e1, NULL); } else if (ctx1->ctrl & AGGR_SIGN_SET) { ctx->ctrl |= AGGR_SIGN_SET; vec_copy(&ctx->AggrSign.e1, &ctx1->AggrSign.e1, sizeof(ctx->AggrSign.e1)); } break; case AGGR_MIN_PK: if (ctx->ctrl & ctx1->ctrl & AGGR_SIGN_SET) { POINTonE2_dadd(&ctx->AggrSign.e2, &ctx->AggrSign.e2, &ctx1->AggrSign.e2, NULL); } else if (ctx1->ctrl & AGGR_SIGN_SET) { ctx->ctrl |= AGGR_SIGN_SET; vec_copy(&ctx->AggrSign.e2, &ctx1->AggrSign.e2, sizeof(ctx->AggrSign.e2)); } break; case AGGR_UNDEFINED: break; default: return BLST_AGGR_TYPE_MISMATCH; } if (ctx->ctrl & ctx1->ctrl & AGGR_GT_SET) { mul_fp12(ctx->GT, ctx->GT, ctx1->GT); } else if (ctx1->ctrl & AGGR_GT_SET) { ctx->ctrl |= AGGR_GT_SET; vec_copy(ctx->GT, ctx1->GT, sizeof(ctx->GT)); } return BLST_SUCCESS; } static bool_t PAIRING_FinalVerify(const PAIRING *ctx, const vec384fp12 GTsig) { vec384fp12 GT; if (!(ctx->ctrl & AGGR_GT_SET)) return 0; if (GTsig != NULL) { vec_copy(GT, GTsig, sizeof(GT)); } else if (ctx->ctrl & AGGR_SIGN_SET) { AggregatedSignature AggrSign; switch (ctx->ctrl & MIN_SIG_OR_PK) { case AGGR_MIN_SIG: POINTonE1_from_Jacobian(&AggrSign.e1, &ctx->AggrSign.e1); miller_loop_n(GT, (const POINTonE2_affine *)&BLS12_381_G2, (const POINTonE1_affine *)&AggrSign.e1, 1); break; case AGGR_MIN_PK: POINTonE2_from_Jacobian(&AggrSign.e2, &ctx->AggrSign.e2); miller_loop_n(GT, (const POINTonE2_affine *)&AggrSign.e2, (const POINTonE1_affine *)&BLS12_381_G1, 1); break; default: return 0; } } else { /* * The aggregated signature was infinite, relation between the * hashes and the public keys has to be VERY special... 
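 * (with the identity value loaded into GT below, the check reduces to
 *  final_exp(conj(1) * ctx->GT) == final_exp(ctx->GT), so verification can
 *  succeed only if the accumulated product of e(H_i, PK_i) itself
 *  final-exponentiates to one)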
*/ vec_copy(GT, BLS12_381_Rx.p12, sizeof(GT)); } conjugate_fp12(GT); mul_fp12(GT, GT, ctx->GT); final_exp(GT, GT); /* return GT==1 */ return vec_is_equal(GT[0][0], BLS12_381_Rx.p2, sizeof(GT[0][0])) & vec_is_zero(GT[0][1], sizeof(GT) - sizeof(GT[0][0])); } int blst_pairing_finalverify(const PAIRING *ctx, const vec384fp12 GTsig) { return (int)PAIRING_FinalVerify(ctx, GTsig); } int blst_fp12_finalverify(const vec384fp12 GT1, const vec384fp12 GT2) { vec384fp12 GT; vec_copy(GT, GT1, sizeof(GT)); conjugate_fp12(GT); mul_fp12(GT, GT, GT2); final_exp(GT, GT); /* return GT==1 */ return (int)(vec_is_equal(GT[0][0], BLS12_381_Rx.p2, sizeof(GT[0][0])) & vec_is_zero(GT[0][1], sizeof(GT) - sizeof(GT[0][0]))); } void blst_pairing_raw_aggregate(PAIRING *ctx, const POINTonE2_affine *q, const POINTonE1_affine *p) { unsigned int n; if (vec_is_zero(q, sizeof(*q)) & vec_is_zero(p, sizeof(*p))) return; n = ctx->nelems; vec_copy(ctx->Q + n, q, sizeof(*q)); vec_copy(ctx->P + n, p, sizeof(*p)); if (++n == N_MAX) { if (ctx->ctrl & AGGR_GT_SET) { vec384fp12 GT; miller_loop_n(GT, ctx->Q, ctx->P, n); mul_fp12(ctx->GT, ctx->GT, GT); } else { miller_loop_n(ctx->GT, ctx->Q, ctx->P, n); ctx->ctrl |= AGGR_GT_SET; } n = 0; } ctx->nelems = n; } vec384fp12 *blst_pairing_as_fp12(PAIRING *ctx) { PAIRING_Commit(ctx); return (vec384fp12 *)ctx->GT; } /* * PAIRING context-free entry points. * * To perform FastAggregateVerify, aggregate all public keys and * signatures with corresponding blst_aggregate_in_g{12}, convert * result to affine and call suitable blst_core_verify_pk_in_g{12} * or blst_aggregated_in_g{12}... */ BLST_ERROR blst_aggregate_in_g1(POINTonE1 *out, const POINTonE1 *in, const unsigned char *zwire) { POINTonE1 P[1]; BLST_ERROR ret; ret = POINTonE1_Deserialize_Z((POINTonE1_affine *)P, zwire); if (ret != BLST_SUCCESS) return ret; if (vec_is_zero(P, sizeof(POINTonE1_affine))) { if (in == NULL) vec_zero(out, sizeof(*out)); return BLST_SUCCESS; } vec_copy(P->Z, BLS12_381_Rx.p, sizeof(P->Z)); if (!POINTonE1_in_G1(P)) return BLST_POINT_NOT_IN_GROUP; if (in == NULL) vec_copy(out, P, sizeof(P)); else POINTonE1_dadd_affine(out, in, (POINTonE1_affine *)P); return BLST_SUCCESS; } BLST_ERROR blst_aggregate_in_g2(POINTonE2 *out, const POINTonE2 *in, const unsigned char *zwire) { POINTonE2 P[1]; BLST_ERROR ret; ret = POINTonE2_Deserialize_Z((POINTonE2_affine *)P, zwire); if (ret != BLST_SUCCESS) return ret; if (vec_is_zero(P, sizeof(POINTonE2_affine))) { if (in == NULL) vec_zero(out, sizeof(*out)); return BLST_SUCCESS; } vec_copy(P->Z, BLS12_381_Rx.p, sizeof(P->Z)); if (!POINTonE2_in_G2(P)) return BLST_POINT_NOT_IN_GROUP; if (in == NULL) { vec_copy(out, P, sizeof(P)); } else { POINTonE2_dadd_affine(out, in, (POINTonE2_affine *)P); } return BLST_SUCCESS; } void blst_aggregated_in_g1(vec384fp12 ret, const POINTonE1_affine *sig) { miller_loop_n(ret, (const POINTonE2_affine *)&BLS12_381_G2, sig, 1); } void blst_aggregated_in_g2(vec384fp12 ret, const POINTonE2_affine *sig) { miller_loop_n(ret, sig, (const POINTonE1_affine *)&BLS12_381_G1, 1); } BLST_ERROR blst_core_verify_pk_in_g1(const POINTonE1_affine *pk, const POINTonE2_affine *signature, int hash_or_encode, const void *msg, size_t msg_len, const void *DST, size_t DST_len, const void *aug, size_t aug_len) { PAIRING ctx; BLST_ERROR ret; ctx.ctrl = AGGR_UNDEFINED | (hash_or_encode ? 
AGGR_HASH_OR_ENCODE : 0); ctx.nelems = 0; ctx.DST = DST; ctx.DST_len = DST_len; ret = PAIRING_Aggregate_PK_in_G1(&ctx, pk, 1, signature, 1, NULL, 0, msg, msg_len, aug, aug_len); if (ret != BLST_SUCCESS) return ret; PAIRING_Commit(&ctx); return PAIRING_FinalVerify(&ctx, NULL) ? BLST_SUCCESS : BLST_VERIFY_FAIL; } BLST_ERROR blst_core_verify_pk_in_g2(const POINTonE2_affine *pk, const POINTonE1_affine *signature, int hash_or_encode, const void *msg, size_t msg_len, const void *DST, size_t DST_len, const void *aug, size_t aug_len) { PAIRING ctx; BLST_ERROR ret; ctx.ctrl = AGGR_UNDEFINED | (hash_or_encode ? AGGR_HASH_OR_ENCODE : 0); ctx.nelems = 0; ctx.DST = DST; ctx.DST_len = DST_len; ret = PAIRING_Aggregate_PK_in_G2(&ctx, pk, 1, signature, 1, NULL, 0, msg, msg_len, aug, aug_len); if (ret != BLST_SUCCESS) return ret; PAIRING_Commit(&ctx); return PAIRING_FinalVerify(&ctx, NULL) ? BLST_SUCCESS : BLST_VERIFY_FAIL; } ================================================ FILE: src/asm/add_mod_256-armv8.pl ================================================ #!/usr/bin/env perl # # Copyright Supranational LLC # Licensed under the Apache License, Version 2.0, see LICENSE for details. # SPDX-License-Identifier: Apache-2.0 $flavour = shift; $output = shift; if ($flavour && $flavour ne "void") { $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or die "can't locate arm-xlate.pl"; open STDOUT,"| \"$^X\" $xlate $flavour $output"; } else { open STDOUT,">$output"; } ($r_ptr,$a_ptr,$b_ptr,$n_ptr) = map("x$_", 0..3); @mod=map("x$_",(4..7)); @a=map("x$_",(8..11)); @b=map("x$_",(12..15)); @t=map("x$_",(16,17,1..3)); $code.=<<___; .text .globl add_mod_256 .hidden add_mod_256 .type add_mod_256,%function .align 5 add_mod_256: hint #34 ldp @a[0],@a[1],[$a_ptr] ldp @b[0],@b[1],[$b_ptr] ldp @a[2],@a[3],[$a_ptr,#16] adds @a[0],@a[0],@b[0] ldp @b[2],@b[3],[$b_ptr,#16] adcs @a[1],@a[1],@b[1] ldp @mod[0],@mod[1],[$n_ptr] adcs @a[2],@a[2],@b[2] ldp @mod[2],@mod[3],[$n_ptr,#16] adcs @a[3],@a[3],@b[3] adc @t[4],xzr,xzr subs @t[0],@a[0],@mod[0] sbcs @t[1],@a[1],@mod[1] sbcs @t[2],@a[2],@mod[2] sbcs @t[3],@a[3],@mod[3] sbcs xzr,@t[4],xzr csel @a[0],@a[0],@t[0],lo csel @a[1],@a[1],@t[1],lo csel @a[2],@a[2],@t[2],lo stp @a[0],@a[1],[$r_ptr] csel @a[3],@a[3],@t[3],lo stp @a[2],@a[3],[$r_ptr,#16] ret .size add_mod_256,.-add_mod_256 .globl mul_by_3_mod_256 .hidden mul_by_3_mod_256 .type mul_by_3_mod_256,%function .align 5 mul_by_3_mod_256: hint #34 ldp @b[0],@b[1],[$a_ptr] ldp @b[2],@b[3],[$a_ptr,#16] adds @a[0],@b[0],@b[0] ldp @mod[0],@mod[1],[$b_ptr] adcs @a[1],@b[1],@b[1] ldp @mod[2],@mod[3],[$b_ptr,#16] adcs @a[2],@b[2],@b[2] adcs @a[3],@b[3],@b[3] adc @t[4],xzr,xzr subs @t[0],@a[0],@mod[0] sbcs @t[1],@a[1],@mod[1] sbcs @t[2],@a[2],@mod[2] sbcs @t[3],@a[3],@mod[3] sbcs xzr,@t[4],xzr csel @a[0],@a[0],@t[0],lo csel @a[1],@a[1],@t[1],lo csel @a[2],@a[2],@t[2],lo csel @a[3],@a[3],@t[3],lo adds @a[0],@a[0],@b[0] adcs @a[1],@a[1],@b[1] adcs @a[2],@a[2],@b[2] adcs @a[3],@a[3],@b[3] adc @t[4],xzr,xzr subs @t[0],@a[0],@mod[0] sbcs @t[1],@a[1],@mod[1] sbcs @t[2],@a[2],@mod[2] sbcs @t[3],@a[3],@mod[3] sbcs xzr,@t[4],xzr csel @a[0],@a[0],@t[0],lo csel @a[1],@a[1],@t[1],lo csel @a[2],@a[2],@t[2],lo stp @a[0],@a[1],[$r_ptr] csel @a[3],@a[3],@t[3],lo stp @a[2],@a[3],[$r_ptr,#16] ret .size mul_by_3_mod_256,.-mul_by_3_mod_256 .globl lshift_mod_256 .hidden lshift_mod_256 .type lshift_mod_256,%function .align 5 lshift_mod_256: hint #34 ldp @a[0],@a[1],[$a_ptr] 
ldp @a[2],@a[3],[$a_ptr,#16] ldp @mod[0],@mod[1],[$n_ptr] ldp @mod[2],@mod[3],[$n_ptr,#16] .Loop_lshift_mod_256: adds @a[0],@a[0],@a[0] sub $b_ptr,$b_ptr,#1 adcs @a[1],@a[1],@a[1] adcs @a[2],@a[2],@a[2] adcs @a[3],@a[3],@a[3] adc @t[4],xzr,xzr subs @b[0],@a[0],@mod[0] sbcs @b[1],@a[1],@mod[1] sbcs @b[2],@a[2],@mod[2] sbcs @b[3],@a[3],@mod[3] sbcs xzr,@t[4],xzr csel @a[0],@a[0],@b[0],lo csel @a[1],@a[1],@b[1],lo csel @a[2],@a[2],@b[2],lo csel @a[3],@a[3],@b[3],lo cbnz $b_ptr,.Loop_lshift_mod_256 stp @a[0],@a[1],[$r_ptr] stp @a[2],@a[3],[$r_ptr,#16] ret .size lshift_mod_256,.-lshift_mod_256 .globl rshift_mod_256 .hidden rshift_mod_256 .type rshift_mod_256,%function .align 5 rshift_mod_256: hint #34 ldp @a[0],@a[1],[$a_ptr] ldp @a[2],@a[3],[$a_ptr,#16] ldp @mod[0],@mod[1],[$n_ptr] ldp @mod[2],@mod[3],[$n_ptr,#16] .Loop_rshift: adds @b[0],@a[0],@mod[0] sub $b_ptr,$b_ptr,#1 adcs @b[1],@a[1],@mod[1] adcs @b[2],@a[2],@mod[2] adcs @b[3],@a[3],@mod[3] adc @t[4],xzr,xzr tst @a[0],#1 csel @b[0],@b[0],@a[0],ne csel @b[1],@b[1],@a[1],ne csel @b[2],@b[2],@a[2],ne csel @b[3],@b[3],@a[3],ne csel @t[4],@t[4],xzr,ne extr @a[0],@b[1],@b[0],#1 extr @a[1],@b[2],@b[1],#1 extr @a[2],@b[3],@b[2],#1 extr @a[3],@t[4],@b[3],#1 cbnz $b_ptr,.Loop_rshift stp @a[0],@a[1],[$r_ptr] stp @a[2],@a[3],[$r_ptr,#16] ret .size rshift_mod_256,.-rshift_mod_256 .globl cneg_mod_256 .hidden cneg_mod_256 .type cneg_mod_256,%function .align 5 cneg_mod_256: ldp @a[0],@a[1],[$a_ptr] ldp @mod[0],@mod[1],[$n_ptr] ldp @a[2],@a[3],[$a_ptr,#16] subs @b[0],@mod[0],@a[0] ldp @mod[2],@mod[3],[$n_ptr,#16] orr @mod[0],@a[0],@a[1] sbcs @b[1],@mod[1],@a[1] orr @mod[1],@a[2],@a[3] sbcs @b[2],@mod[2],@a[2] orr @t[4],@mod[0],@mod[1] sbc @b[3],@mod[3],@a[3] cmp @t[4],#0 csetm @t[4],ne ands $b_ptr,$b_ptr,@t[4] csel @a[0],@a[0],@b[0],eq csel @a[1],@a[1],@b[1],eq csel @a[2],@a[2],@b[2],eq stp @a[0],@a[1],[$r_ptr] csel @a[3],@a[3],@b[3],eq stp @a[2],@a[3],[$r_ptr,#16] ret .size cneg_mod_256,.-cneg_mod_256 .globl sub_mod_256 .hidden sub_mod_256 .type sub_mod_256,%function .align 5 sub_mod_256: ldp @a[0],@a[1],[$a_ptr] ldp @b[0],@b[1],[$b_ptr] ldp @a[2],@a[3],[$a_ptr,#16] subs @a[0],@a[0],@b[0] ldp @b[2],@b[3],[$b_ptr,#16] sbcs @a[1],@a[1],@b[1] ldp @mod[0],@mod[1],[$n_ptr] sbcs @a[2],@a[2],@b[2] ldp @mod[2],@mod[3],[$n_ptr,#16] sbcs @a[3],@a[3],@b[3] sbc @t[4],xzr,xzr and @mod[0],@mod[0],@t[4] and @mod[1],@mod[1],@t[4] adds @a[0],@a[0],@mod[0] and @mod[2],@mod[2],@t[4] adcs @a[1],@a[1],@mod[1] and @mod[3],@mod[3],@t[4] adcs @a[2],@a[2],@mod[2] stp @a[0],@a[1],[$r_ptr] adc @a[3],@a[3],@mod[3] stp @a[2],@a[3],[$r_ptr,#16] ret .size sub_mod_256,.-sub_mod_256 .globl check_mod_256 .hidden check_mod_256 .type check_mod_256,%function .align 5 check_mod_256: ldp @a[0],@a[1],[$r_ptr] ldp @a[2],@a[3],[$r_ptr,#16] ldp @mod[0],@mod[1],[$a_ptr] ldp @mod[2],@mod[3],[$a_ptr,#16] #ifdef __AARCH64EB__ rev @a[0],@a[0] rev @a[1],@a[1] rev @a[2],@a[2] rev @a[3],@a[3] #endif subs xzr,@a[0],@mod[0] sbcs xzr,@a[1],@mod[1] orr @a[0],@a[0],@a[1] sbcs xzr,@a[2],@mod[2] orr @a[0],@a[0],@a[2] sbcs xzr,@a[3],@mod[3] orr @a[0],@a[0],@a[3] sbc $a_ptr,xzr,xzr cmp @a[0],#0 mov x0,#1 csel x0,x0,xzr,ne and x0,x0,$a_ptr ret .size check_mod_256,.-check_mod_256 .globl add_n_check_mod_256 .hidden add_n_check_mod_256 .type add_n_check_mod_256,%function .align 5 add_n_check_mod_256: ldp @a[0],@a[1],[$a_ptr] ldp @b[0],@b[1],[$b_ptr] ldp @a[2],@a[3],[$a_ptr,#16] ldp @b[2],@b[3],[$b_ptr,#16] #ifdef __AARCH64EB__ rev @a[0],@a[0] rev @b[0],@b[0] rev @a[1],@a[1] rev @b[1],@b[1] rev @a[2],@a[2] rev 
@b[2],@b[2] rev @a[3],@a[3] rev @b[3],@b[3] #endif adds @a[0],@a[0],@b[0] ldp @mod[0],@mod[1],[$n_ptr] adcs @a[1],@a[1],@b[1] ldp @mod[2],@mod[3],[$n_ptr,#16] adcs @a[2],@a[2],@b[2] adcs @a[3],@a[3],@b[3] adc @t[4],xzr,xzr subs @t[0],@a[0],@mod[0] sbcs @t[1],@a[1],@mod[1] sbcs @t[2],@a[2],@mod[2] sbcs @t[3],@a[3],@mod[3] sbcs xzr,@t[4],xzr csel @a[0],@a[0],@t[0],lo csel @a[1],@a[1],@t[1],lo csel @a[2],@a[2],@t[2],lo csel @a[3],@a[3],@t[3],lo orr @t[0], @a[0], @a[1] orr @t[1], @a[2], @a[3] orr @t[0], @t[0], @t[1] #ifdef __AARCH64EB__ rev @a[0],@a[0] rev @a[1],@a[1] rev @a[2],@a[2] rev @a[3],@a[3] #endif stp @a[0],@a[1],[$r_ptr] stp @a[2],@a[3],[$r_ptr,#16] mov @t[1], #1 cmp @t[0], #0 csel x0, @t[1], xzr, ne ret .size add_n_check_mod_256,.-add_n_check_mod_256 .globl sub_n_check_mod_256 .hidden sub_n_check_mod_256 .type sub_n_check_mod_256,%function .align 5 sub_n_check_mod_256: ldp @a[0],@a[1],[$a_ptr] ldp @b[0],@b[1],[$b_ptr] ldp @a[2],@a[3],[$a_ptr,#16] ldp @b[2],@b[3],[$b_ptr,#16] #ifdef __AARCH64EB__ rev @a[0],@a[0] rev @b[0],@b[0] rev @a[1],@a[1] rev @b[1],@b[1] rev @a[2],@a[2] rev @b[2],@b[2] rev @a[3],@a[3] rev @b[3],@b[3] #endif subs @a[0],@a[0],@b[0] sbcs @a[1],@a[1],@b[1] ldp @mod[0],@mod[1],[$n_ptr] sbcs @a[2],@a[2],@b[2] ldp @mod[2],@mod[3],[$n_ptr,#16] sbcs @a[3],@a[3],@b[3] sbc @t[4],xzr,xzr and @mod[0],@mod[0],@t[4] and @mod[1],@mod[1],@t[4] adds @a[0],@a[0],@mod[0] and @mod[2],@mod[2],@t[4] adcs @a[1],@a[1],@mod[1] and @mod[3],@mod[3],@t[4] adcs @a[2],@a[2],@mod[2] adc @a[3],@a[3],@mod[3] orr @t[0], @a[0], @a[1] orr @t[1], @a[2], @a[3] orr @t[0], @t[0], @t[1] #ifdef __AARCH64EB__ rev @a[0],@a[0] rev @a[1],@a[1] rev @a[2],@a[2] rev @a[3],@a[3] #endif stp @a[0],@a[1],[$r_ptr] stp @a[2],@a[3],[$r_ptr,#16] mov @t[1], #1 cmp @t[0], #0 csel x0, @t[1], xzr, ne ret .size sub_n_check_mod_256,.-sub_n_check_mod_256 ___ print $code; close STDOUT; ================================================ FILE: src/asm/add_mod_256-x86_64.pl ================================================ #!/usr/bin/env perl # # Copyright Supranational LLC # Licensed under the Apache License, Version 2.0, see LICENSE for details. 
# SPDX-License-Identifier: Apache-2.0 $flavour = shift; $output = shift; if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or die "can't locate x86_64-xlate.pl"; open STDOUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"" or die "can't call $xlate: $!"; # common argument layout ($r_ptr,$a_ptr,$b_org,$n_ptr) = ("%rdi","%rsi","%rdx","%rcx"); $b_ptr = "%rbx"; { ############################################################## 256 bits add my @acc=map("%r$_",(8..11, "ax", "si", "bx", "bp", 12)); $code.=<<___; .text .globl add_mod_256 .hidden add_mod_256 .type add_mod_256,\@function,4,"unwind" .align 32 add_mod_256: .cfi_startproc push %rbp .cfi_push %rbp push %rbx .cfi_push %rbx sub \$8, %rsp .cfi_adjust_cfa_offset 8 .cfi_end_prologue #ifdef __SGX_LVI_HARDENING__ lfence #endif mov 8*0($a_ptr), @acc[0] mov 8*1($a_ptr), @acc[1] mov 8*2($a_ptr), @acc[2] mov 8*3($a_ptr), @acc[3] .Loaded_a_add_mod_256: add 8*0($b_org), @acc[0] adc 8*1($b_org), @acc[1] mov @acc[0], @acc[4] adc 8*2($b_org), @acc[2] mov @acc[1], @acc[5] adc 8*3($b_org), @acc[3] sbb $b_org, $b_org mov @acc[2], @acc[6] sub 8*0($n_ptr), @acc[0] sbb 8*1($n_ptr), @acc[1] sbb 8*2($n_ptr), @acc[2] mov @acc[3], @acc[7] sbb 8*3($n_ptr), @acc[3] sbb \$0, $b_org cmovc @acc[4], @acc[0] cmovc @acc[5], @acc[1] mov @acc[0], 8*0($r_ptr) cmovc @acc[6], @acc[2] mov @acc[1], 8*1($r_ptr) cmovc @acc[7], @acc[3] mov @acc[2], 8*2($r_ptr) mov @acc[3], 8*3($r_ptr) mov 8(%rsp),%rbx .cfi_restore %rbx mov 16(%rsp),%rbp .cfi_restore %rbp lea 24(%rsp),%rsp .cfi_adjust_cfa_offset -24 .cfi_epilogue ret .cfi_endproc .size add_mod_256,.-add_mod_256 ######################################################################## .globl mul_by_3_mod_256 .hidden mul_by_3_mod_256 .type mul_by_3_mod_256,\@function,3,"unwind" .align 32 mul_by_3_mod_256: .cfi_startproc push %rbp .cfi_push %rbp push %rbx .cfi_push %rbx push %r12 .cfi_push %r12 .cfi_end_prologue mov $b_org,$n_ptr #ifdef __SGX_LVI_HARDENING__ lfence #endif mov 8*0($a_ptr), @acc[0] mov 8*1($a_ptr), @acc[1] mov 8*2($a_ptr), @acc[2] mov $a_ptr,$b_org mov 8*3($a_ptr), @acc[3] call __lshift_mod_256 mov 0(%rsp),%r12 .cfi_restore %r12 jmp .Loaded_a_add_mod_256 mov 8(%rsp),%rbx .cfi_restore %rbx mov 16(%rsp),%rbp .cfi_restore %rbp lea 24(%rsp),%rsp .cfi_adjust_cfa_offset -24 .cfi_epilogue ret .cfi_endproc .size mul_by_3_mod_256,.-mul_by_3_mod_256 .type __lshift_mod_256,\@abi-omnipotent .align 32 __lshift_mod_256: add @acc[0], @acc[0] adc @acc[1], @acc[1] mov @acc[0], @acc[4] adc @acc[2], @acc[2] mov @acc[1], @acc[5] adc @acc[3], @acc[3] sbb @acc[8], @acc[8] mov @acc[2], @acc[6] sub 8*0($n_ptr), @acc[0] sbb 8*1($n_ptr), @acc[1] sbb 8*2($n_ptr), @acc[2] mov @acc[3], @acc[7] sbb 8*3($n_ptr), @acc[3] sbb \$0, @acc[8] cmovc @acc[4], @acc[0] cmovc @acc[5], @acc[1] cmovc @acc[6], @acc[2] cmovc @acc[7], @acc[3] ret # __SGX_LVI_HARDENING_CLOBBER__=@acc[4] .size __lshift_mod_256,.-__lshift_mod_256 ######################################################################## .globl lshift_mod_256 .hidden lshift_mod_256 .type lshift_mod_256,\@function,4,"unwind" .align 32 lshift_mod_256: .cfi_startproc push %rbp .cfi_push %rbp push %rbx .cfi_push %rbx push %r12 .cfi_push %r12 .cfi_end_prologue #ifdef __SGX_LVI_HARDENING__ lfence #endif mov 8*0($a_ptr), @acc[0] mov 8*1($a_ptr), @acc[1] mov 8*2($a_ptr), @acc[2] 
mov 8*3($a_ptr), @acc[3] .Loop_lshift_mod_256: call __lshift_mod_256 dec %edx jnz .Loop_lshift_mod_256 mov @acc[0], 8*0($r_ptr) mov @acc[1], 8*1($r_ptr) mov @acc[2], 8*2($r_ptr) mov @acc[3], 8*3($r_ptr) mov 0(%rsp),%r12 .cfi_restore %r12 mov 8(%rsp),%rbx .cfi_restore %rbx mov 16(%rsp),%rbp .cfi_restore %rbp lea 24(%rsp),%rsp .cfi_adjust_cfa_offset -24 .cfi_epilogue ret .cfi_endproc .size lshift_mod_256,.-lshift_mod_256 ######################################################################## .globl rshift_mod_256 .hidden rshift_mod_256 .type rshift_mod_256,\@function,4,"unwind" .align 32 rshift_mod_256: .cfi_startproc push %rbp .cfi_push %rbp push %rbx .cfi_push %rbx sub \$8, %rsp .cfi_adjust_cfa_offset 8 .cfi_end_prologue #ifdef __SGX_LVI_HARDENING__ lfence #endif mov 8*0($a_ptr), @acc[7] mov 8*1($a_ptr), @acc[1] mov 8*2($a_ptr), @acc[2] mov 8*3($a_ptr), @acc[3] .Loop_rshift_mod_256: mov @acc[7], @acc[0] and \$1, @acc[7] mov 8*0($n_ptr), @acc[4] neg @acc[7] mov 8*1($n_ptr), @acc[5] mov 8*2($n_ptr), @acc[6] and @acc[7], @acc[4] and @acc[7], @acc[5] and @acc[7], @acc[6] and 8*3($n_ptr), @acc[7] add @acc[4], @acc[0] adc @acc[5], @acc[1] adc @acc[6], @acc[2] adc @acc[7], @acc[3] sbb @acc[4], @acc[4] shr \$1, @acc[0] mov @acc[1], @acc[7] shr \$1, @acc[1] mov @acc[2], @acc[6] shr \$1, @acc[2] mov @acc[3], @acc[5] shr \$1, @acc[3] shl \$63, @acc[7] shl \$63, @acc[6] or @acc[0], @acc[7] shl \$63, @acc[5] or @acc[6], @acc[1] shl \$63, @acc[4] or @acc[5], @acc[2] or @acc[4], @acc[3] dec %edx jnz .Loop_rshift_mod_256 mov @acc[7], 8*0($r_ptr) mov @acc[1], 8*1($r_ptr) mov @acc[2], 8*2($r_ptr) mov @acc[3], 8*3($r_ptr) mov 8(%rsp),%rbx .cfi_restore %rbx mov 16(%rsp),%rbp .cfi_restore %rbp lea 24(%rsp),%rsp .cfi_adjust_cfa_offset -24 .cfi_epilogue ret .cfi_endproc .size rshift_mod_256,.-rshift_mod_256 ######################################################################## .globl cneg_mod_256 .hidden cneg_mod_256 .type cneg_mod_256,\@function,4,"unwind" .align 32 cneg_mod_256: .cfi_startproc push %rbp .cfi_push %rbp push %rbx .cfi_push %rbx push %r12 .cfi_push %r12 .cfi_end_prologue #ifdef __SGX_LVI_HARDENING__ lfence #endif mov 8*0($a_ptr), @acc[8] # load a[0:3] mov 8*1($a_ptr), @acc[1] mov 8*2($a_ptr), @acc[2] mov @acc[8], @acc[0] mov 8*3($a_ptr), @acc[3] or @acc[1], @acc[8] or @acc[2], @acc[8] or @acc[3], @acc[8] mov \$-1, @acc[7] mov 8*0($n_ptr), @acc[4] # load n[0:3] cmovnz @acc[7], @acc[8] # mask = a[0:3] ? -1 : 0 mov 8*1($n_ptr), @acc[5] mov 8*2($n_ptr), @acc[6] and @acc[8], @acc[4] # n[0:3] &= mask mov 8*3($n_ptr), @acc[7] and @acc[8], @acc[5] and @acc[8], @acc[6] and @acc[8], @acc[7] sub @acc[0], @acc[4] # a[0:3] ? n[0:3]-a[0:3] : 0-0 sbb @acc[1], @acc[5] sbb @acc[2], @acc[6] sbb @acc[3], @acc[7] or $b_org, $b_org # check condition flag cmovz @acc[0], @acc[4] # flag ? 
n[0:3]-a[0:3] : a[0:3] cmovz @acc[1], @acc[5] mov @acc[4], 8*0($r_ptr) cmovz @acc[2], @acc[6] mov @acc[5], 8*1($r_ptr) cmovz @acc[3], @acc[7] mov @acc[6], 8*2($r_ptr) mov @acc[7], 8*3($r_ptr) mov 0(%rsp),%r12 .cfi_restore %r12 mov 8(%rsp),%rbx .cfi_restore %rbx mov 16(%rsp),%rbp .cfi_restore %rbp lea 24(%rsp),%rsp .cfi_adjust_cfa_offset -24 .cfi_epilogue ret .cfi_endproc .size cneg_mod_256,.-cneg_mod_256 ######################################################################## .globl sub_mod_256 .hidden sub_mod_256 .type sub_mod_256,\@function,4,"unwind" .align 32 sub_mod_256: .cfi_startproc push %rbp .cfi_push %rbp push %rbx .cfi_push %rbx sub \$8, %rsp .cfi_adjust_cfa_offset 8 .cfi_end_prologue #ifdef __SGX_LVI_HARDENING__ lfence #endif mov 8*0($a_ptr), @acc[0] mov 8*1($a_ptr), @acc[1] mov 8*2($a_ptr), @acc[2] mov 8*3($a_ptr), @acc[3] sub 8*0($b_org), @acc[0] mov 8*0($n_ptr), @acc[4] sbb 8*1($b_org), @acc[1] mov 8*1($n_ptr), @acc[5] sbb 8*2($b_org), @acc[2] mov 8*2($n_ptr), @acc[6] sbb 8*3($b_org), @acc[3] mov 8*3($n_ptr), @acc[7] sbb $b_org, $b_org and $b_org, @acc[4] and $b_org, @acc[5] and $b_org, @acc[6] and $b_org, @acc[7] add @acc[4], @acc[0] adc @acc[5], @acc[1] mov @acc[0], 8*0($r_ptr) adc @acc[6], @acc[2] mov @acc[1], 8*1($r_ptr) adc @acc[7], @acc[3] mov @acc[2], 8*2($r_ptr) mov @acc[3], 8*3($r_ptr) mov 8(%rsp),%rbx .cfi_restore %rbx mov 16(%rsp),%rbp .cfi_restore %rbp lea 24(%rsp),%rsp .cfi_adjust_cfa_offset -24 .cfi_epilogue ret .cfi_endproc .size sub_mod_256,.-sub_mod_256 ######################################################################## .globl check_mod_256 .hidden check_mod_256 .type check_mod_256,\@function,2,"unwind" .align 32 check_mod_256: .cfi_startproc #ifdef __SGX_LVI_HARDENING__ lfence #endif mov 8*0($r_ptr), %rax mov 8*1($r_ptr), @acc[1] mov 8*2($r_ptr), @acc[2] mov 8*3($r_ptr), @acc[3] mov %rax, @acc[0] # see if it's zero or @acc[1], %rax or @acc[2], %rax or @acc[3], %rax sub 8*0($a_ptr), @acc[0] # does subtracting modulus borrow? 
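# The sbb chain below completes the trial subtraction of the modulus; the
# final sbb turns the borrow into an all-ones mask, so the function ends
# up returning 1 only for values that are both non-zero and fully reduced.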
sbb 8*1($a_ptr), @acc[1] sbb 8*2($a_ptr), @acc[2] sbb 8*3($a_ptr), @acc[3] sbb $a_ptr, $a_ptr mov \$1, %rdx cmp \$0, %rax cmovne %rdx, %rax and $a_ptr, %rax .cfi_epilogue ret .cfi_endproc .size check_mod_256,.-check_mod_256 ######################################################################## .globl add_n_check_mod_256 .hidden add_n_check_mod_256 .type add_n_check_mod_256,\@function,4,"unwind" .align 32 add_n_check_mod_256: .cfi_startproc push %rbp .cfi_push %rbp push %rbx .cfi_push %rbx sub \$8, %rsp .cfi_adjust_cfa_offset 8 .cfi_end_prologue #ifdef __SGX_LVI_HARDENING__ lfence #endif mov 8*0($a_ptr), @acc[0] mov 8*1($a_ptr), @acc[1] mov 8*2($a_ptr), @acc[2] mov 8*3($a_ptr), @acc[3] add 8*0($b_org), @acc[0] adc 8*1($b_org), @acc[1] mov @acc[0], @acc[4] adc 8*2($b_org), @acc[2] mov @acc[1], @acc[5] adc 8*3($b_org), @acc[3] sbb $b_org, $b_org mov @acc[2], @acc[6] sub 8*0($n_ptr), @acc[0] sbb 8*1($n_ptr), @acc[1] sbb 8*2($n_ptr), @acc[2] mov @acc[3], @acc[7] sbb 8*3($n_ptr), @acc[3] sbb \$0, $b_org cmovc @acc[4], @acc[0] cmovc @acc[5], @acc[1] mov @acc[0], 8*0($r_ptr) cmovc @acc[6], @acc[2] mov @acc[1], 8*1($r_ptr) cmovc @acc[7], @acc[3] mov @acc[2], 8*2($r_ptr) mov @acc[3], 8*3($r_ptr) or @acc[1], @acc[0] or @acc[3], @acc[2] or @acc[2], @acc[0] mov \$1, %rax cmovz @acc[0], %rax mov 8(%rsp),%rbx .cfi_restore %rbx mov 16(%rsp),%rbp .cfi_restore %rbp lea 24(%rsp),%rsp .cfi_adjust_cfa_offset -24 .cfi_epilogue ret .cfi_endproc .size add_n_check_mod_256,.-add_n_check_mod_256 ######################################################################## .globl sub_n_check_mod_256 .hidden sub_n_check_mod_256 .type sub_n_check_mod_256,\@function,4,"unwind" .align 32 sub_n_check_mod_256: .cfi_startproc push %rbp .cfi_push %rbp push %rbx .cfi_push %rbx sub \$8, %rsp .cfi_adjust_cfa_offset 8 .cfi_end_prologue #ifdef __SGX_LVI_HARDENING__ lfence #endif mov 8*0($a_ptr), @acc[0] mov 8*1($a_ptr), @acc[1] mov 8*2($a_ptr), @acc[2] mov 8*3($a_ptr), @acc[3] sub 8*0($b_org), @acc[0] mov 8*0($n_ptr), @acc[4] sbb 8*1($b_org), @acc[1] mov 8*1($n_ptr), @acc[5] sbb 8*2($b_org), @acc[2] mov 8*2($n_ptr), @acc[6] sbb 8*3($b_org), @acc[3] mov 8*3($n_ptr), @acc[7] sbb $b_org, $b_org and $b_org, @acc[4] and $b_org, @acc[5] and $b_org, @acc[6] and $b_org, @acc[7] add @acc[4], @acc[0] adc @acc[5], @acc[1] mov @acc[0], 8*0($r_ptr) adc @acc[6], @acc[2] mov @acc[1], 8*1($r_ptr) adc @acc[7], @acc[3] mov @acc[2], 8*2($r_ptr) mov @acc[3], 8*3($r_ptr) or @acc[1], @acc[0] or @acc[3], @acc[2] or @acc[2], @acc[0] mov \$1, %rax cmovz @acc[0], %rax mov 8(%rsp),%rbx .cfi_restore %rbx mov 16(%rsp),%rbp .cfi_restore %rbp lea 24(%rsp),%rsp .cfi_adjust_cfa_offset -24 .cfi_epilogue ret .cfi_endproc .size sub_n_check_mod_256,.-sub_n_check_mod_256 ___ } print $code; close STDOUT; ================================================ FILE: src/asm/add_mod_384-armv8.pl ================================================ #!/usr/bin/env perl # # Copyright Supranational LLC # Licensed under the Apache License, Version 2.0, see LICENSE for details. 
# SPDX-License-Identifier: Apache-2.0 $flavour = shift; $output = shift; if ($flavour && $flavour ne "void") { $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or die "can't locate arm-xlate.pl"; open STDOUT,"| \"$^X\" $xlate $flavour $output"; } else { open STDOUT,">$output"; } ($r_ptr,$a_ptr,$b_ptr,$n_ptr) = map("x$_", 0..3); @mod=map("x$_",(4..9)); @a=map("x$_",(10..15)); @b=map("x$_",(16,17,19..22)); $carry=$n_ptr; $code.=<<___; .text .globl add_mod_384 .hidden add_mod_384 .type add_mod_384,%function .align 5 add_mod_384: paciasp stp c29,c30,[csp,#-6*__SIZEOF_POINTER__]! add c29,csp,#0 stp c19,c20,[csp,#2*__SIZEOF_POINTER__] stp c21,c22,[csp,#4*__SIZEOF_POINTER__] ldp @mod[0],@mod[1],[$n_ptr] ldp @mod[2],@mod[3],[$n_ptr,#16] ldp @mod[4],@mod[5],[$n_ptr,#32] bl __add_mod_384 ldr c30,[csp,#__SIZEOF_POINTER__] stp @a[0],@a[1],[$r_ptr] stp @a[2],@a[3],[$r_ptr,#16] stp @a[4],@a[5],[$r_ptr,#32] ldp c19,c20,[c29,#2*__SIZEOF_POINTER__] ldp c21,c22,[c29,#4*__SIZEOF_POINTER__] ldr c29,[csp],#6*__SIZEOF_POINTER__ autiasp ret .size add_mod_384,.-add_mod_384 .type __add_mod_384,%function .align 5 __add_mod_384: ldp @a[0],@a[1],[$a_ptr] ldp @b[0],@b[1],[$b_ptr] ldp @a[2],@a[3],[$a_ptr,#16] ldp @b[2],@b[3],[$b_ptr,#16] ldp @a[4],@a[5],[$a_ptr,#32] ldp @b[4],@b[5],[$b_ptr,#32] __add_mod_384_ab_are_loaded: adds @a[0],@a[0],@b[0] adcs @a[1],@a[1],@b[1] adcs @a[2],@a[2],@b[2] adcs @a[3],@a[3],@b[3] adcs @a[4],@a[4],@b[4] adcs @a[5],@a[5],@b[5] adc $carry,xzr,xzr subs @b[0],@a[0],@mod[0] sbcs @b[1],@a[1],@mod[1] sbcs @b[2],@a[2],@mod[2] sbcs @b[3],@a[3],@mod[3] sbcs @b[4],@a[4],@mod[4] sbcs @b[5],@a[5],@mod[5] sbcs xzr,$carry,xzr csel @a[0],@a[0],@b[0],lo csel @a[1],@a[1],@b[1],lo csel @a[2],@a[2],@b[2],lo csel @a[3],@a[3],@b[3],lo csel @a[4],@a[4],@b[4],lo csel @a[5],@a[5],@b[5],lo ret .size __add_mod_384,.-__add_mod_384 .globl add_mod_384x .hidden add_mod_384x .type add_mod_384x,%function .align 5 add_mod_384x: paciasp stp c29,c30,[csp,#-6*__SIZEOF_POINTER__]! add c29,csp,#0 stp c19,c20,[csp,#2*__SIZEOF_POINTER__] stp c21,c22,[csp,#4*__SIZEOF_POINTER__] ldp @mod[0],@mod[1],[$n_ptr] ldp @mod[2],@mod[3],[$n_ptr,#16] ldp @mod[4],@mod[5],[$n_ptr,#32] bl __add_mod_384 stp @a[0],@a[1],[$r_ptr] cadd $a_ptr,$a_ptr,#48 stp @a[2],@a[3],[$r_ptr,#16] cadd $b_ptr,$b_ptr,#48 stp @a[4],@a[5],[$r_ptr,#32] bl __add_mod_384 ldr c30,[csp,#__SIZEOF_POINTER__] stp @a[0],@a[1],[$r_ptr,#48] stp @a[2],@a[3],[$r_ptr,#64] stp @a[4],@a[5],[$r_ptr,#80] ldp c19,c20,[c29,#2*__SIZEOF_POINTER__] ldp c21,c22,[c29,#4*__SIZEOF_POINTER__] ldr c29,[csp],#6*__SIZEOF_POINTER__ autiasp ret .size add_mod_384x,.-add_mod_384x .globl rshift_mod_384 .hidden rshift_mod_384 .type rshift_mod_384,%function .align 5 rshift_mod_384: paciasp stp c29,c30,[csp,#-6*__SIZEOF_POINTER__]! 
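# Note on the prologue: paciasp/autiasp sign and authenticate the return
# address on cores with pointer authentication, and the c-prefixed
# registers with __SIZEOF_POINTER__ scaling let the same source serve
# capability targets as well (cf. the pre-generated files in build/cheri/).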
add c29,csp,#0 stp c19,c20,[csp,#2*__SIZEOF_POINTER__] stp c21,c22,[csp,#4*__SIZEOF_POINTER__] ldp @a[0],@a[1],[$a_ptr] ldp @a[2],@a[3],[$a_ptr,#16] ldp @a[4],@a[5],[$a_ptr,#32] ldp @mod[0],@mod[1],[$n_ptr] ldp @mod[2],@mod[3],[$n_ptr,#16] ldp @mod[4],@mod[5],[$n_ptr,#32] .Loop_rshift_mod_384: sub $b_ptr,$b_ptr,#1 bl __rshift_mod_384 cbnz $b_ptr,.Loop_rshift_mod_384 ldr c30,[csp,#__SIZEOF_POINTER__] stp @a[0],@a[1],[$r_ptr] stp @a[2],@a[3],[$r_ptr,#16] stp @a[4],@a[5],[$r_ptr,#32] ldp c19,c20,[c29,#2*__SIZEOF_POINTER__] ldp c21,c22,[c29,#4*__SIZEOF_POINTER__] ldr c29,[csp],#6*__SIZEOF_POINTER__ autiasp ret .size rshift_mod_384,.-rshift_mod_384 .type __rshift_mod_384,%function .align 5 __rshift_mod_384: sbfx @b[5],@a[0],#0,#1 and @b[0],@b[5],@mod[0] and @b[1],@b[5],@mod[1] adds @a[0],@a[0],@b[0] and @b[2],@b[5],@mod[2] adcs @a[1],@a[1],@b[1] and @b[3],@b[5],@mod[3] adcs @a[2],@a[2],@b[2] and @b[4],@b[5],@mod[4] adcs @a[3],@a[3],@b[3] and @b[5],@b[5],@mod[5] adcs @a[4],@a[4],@b[4] extr @a[0],@a[1],@a[0],#1 // a[0:5] >>= 1 adcs @a[5],@a[5],@b[5] extr @a[1],@a[2],@a[1],#1 adc @b[5],xzr,xzr extr @a[2],@a[3],@a[2],#1 extr @a[3],@a[4],@a[3],#1 extr @a[4],@a[5],@a[4],#1 extr @a[5],@b[5],@a[5],#1 ret .size __rshift_mod_384,.-__rshift_mod_384 .globl div_by_2_mod_384 .hidden div_by_2_mod_384 .type div_by_2_mod_384,%function .align 5 div_by_2_mod_384: paciasp stp c29,c30,[csp,#-6*__SIZEOF_POINTER__]! add c29,csp,#0 stp c19,c20,[csp,#2*__SIZEOF_POINTER__] stp c21,c22,[csp,#4*__SIZEOF_POINTER__] ldp @a[0],@a[1],[$a_ptr] ldp @a[2],@a[3],[$a_ptr,#16] ldp @a[4],@a[5],[$a_ptr,#32] ldp @mod[0],@mod[1],[$b_ptr] ldp @mod[2],@mod[3],[$b_ptr,#16] ldp @mod[4],@mod[5],[$b_ptr,#32] bl __rshift_mod_384 ldr c30,[csp,#__SIZEOF_POINTER__] stp @a[0],@a[1],[$r_ptr] stp @a[2],@a[3],[$r_ptr,#16] stp @a[4],@a[5],[$r_ptr,#32] ldp c19,c20,[c29,#2*__SIZEOF_POINTER__] ldp c21,c22,[c29,#4*__SIZEOF_POINTER__] ldr c29,[csp],#6*__SIZEOF_POINTER__ autiasp ret .size div_by_2_mod_384,.-div_by_2_mod_384 .globl lshift_mod_384 .hidden lshift_mod_384 .type lshift_mod_384,%function .align 5 lshift_mod_384: paciasp stp c29,c30,[csp,#-6*__SIZEOF_POINTER__]! add c29,csp,#0 stp c19,c20,[csp,#2*__SIZEOF_POINTER__] stp c21,c22,[csp,#4*__SIZEOF_POINTER__] ldp @a[0],@a[1],[$a_ptr] ldp @a[2],@a[3],[$a_ptr,#16] ldp @a[4],@a[5],[$a_ptr,#32] ldp @mod[0],@mod[1],[$n_ptr] ldp @mod[2],@mod[3],[$n_ptr,#16] ldp @mod[4],@mod[5],[$n_ptr,#32] .Loop_lshift_mod_384: sub $b_ptr,$b_ptr,#1 bl __lshift_mod_384 cbnz $b_ptr,.Loop_lshift_mod_384 ldr c30,[csp,#__SIZEOF_POINTER__] stp @a[0],@a[1],[$r_ptr] stp @a[2],@a[3],[$r_ptr,#16] stp @a[4],@a[5],[$r_ptr,#32] ldp c19,c20,[c29,#2*__SIZEOF_POINTER__] ldp c21,c22,[c29,#4*__SIZEOF_POINTER__] ldr c29,[csp],#6*__SIZEOF_POINTER__ autiasp ret .size lshift_mod_384,.-lshift_mod_384 .type __lshift_mod_384,%function .align 5 __lshift_mod_384: adds @a[0],@a[0],@a[0] adcs @a[1],@a[1],@a[1] adcs @a[2],@a[2],@a[2] adcs @a[3],@a[3],@a[3] adcs @a[4],@a[4],@a[4] adcs @a[5],@a[5],@a[5] adc $carry,xzr,xzr subs @b[0],@a[0],@mod[0] sbcs @b[1],@a[1],@mod[1] sbcs @b[2],@a[2],@mod[2] sbcs @b[3],@a[3],@mod[3] sbcs @b[4],@a[4],@mod[4] sbcs @b[5],@a[5],@mod[5] sbcs xzr,$carry,xzr csel @a[0],@a[0],@b[0],lo csel @a[1],@a[1],@b[1],lo csel @a[2],@a[2],@b[2],lo csel @a[3],@a[3],@b[3],lo csel @a[4],@a[4],@b[4],lo csel @a[5],@a[5],@b[5],lo ret .size __lshift_mod_384,.-__lshift_mod_384 .globl mul_by_3_mod_384 .hidden mul_by_3_mod_384 .type mul_by_3_mod_384,%function .align 5 mul_by_3_mod_384: paciasp stp c29,c30,[csp,#-6*__SIZEOF_POINTER__]! 
add c29,csp,#0 stp c19,c20,[csp,#2*__SIZEOF_POINTER__] stp c21,c22,[csp,#4*__SIZEOF_POINTER__] ldp @a[0],@a[1],[$a_ptr] ldp @a[2],@a[3],[$a_ptr,#16] ldp @a[4],@a[5],[$a_ptr,#32] ldp @mod[0],@mod[1],[$b_ptr] ldp @mod[2],@mod[3],[$b_ptr,#16] ldp @mod[4],@mod[5],[$b_ptr,#32] bl __lshift_mod_384 ldp @b[0],@b[1],[$a_ptr] ldp @b[2],@b[3],[$a_ptr,#16] ldp @b[4],@b[5],[$a_ptr,#32] bl __add_mod_384_ab_are_loaded ldr c30,[csp,#__SIZEOF_POINTER__] stp @a[0],@a[1],[$r_ptr] stp @a[2],@a[3],[$r_ptr,#16] stp @a[4],@a[5],[$r_ptr,#32] ldp c19,c20,[c29,#2*__SIZEOF_POINTER__] ldp c21,c22,[c29,#4*__SIZEOF_POINTER__] ldr c29,[csp],#6*__SIZEOF_POINTER__ autiasp ret .size mul_by_3_mod_384,.-mul_by_3_mod_384 .globl mul_by_8_mod_384 .hidden mul_by_8_mod_384 .type mul_by_8_mod_384,%function .align 5 mul_by_8_mod_384: paciasp stp c29,c30,[csp,#-6*__SIZEOF_POINTER__]! add c29,csp,#0 stp c19,c20,[csp,#2*__SIZEOF_POINTER__] stp c21,c22,[csp,#4*__SIZEOF_POINTER__] ldp @a[0],@a[1],[$a_ptr] ldp @a[2],@a[3],[$a_ptr,#16] ldp @a[4],@a[5],[$a_ptr,#32] ldp @mod[0],@mod[1],[$b_ptr] ldp @mod[2],@mod[3],[$b_ptr,#16] ldp @mod[4],@mod[5],[$b_ptr,#32] bl __lshift_mod_384 bl __lshift_mod_384 bl __lshift_mod_384 ldr c30,[csp,#__SIZEOF_POINTER__] stp @a[0],@a[1],[$r_ptr] stp @a[2],@a[3],[$r_ptr,#16] stp @a[4],@a[5],[$r_ptr,#32] ldp c19,c20,[c29,#2*__SIZEOF_POINTER__] ldp c21,c22,[c29,#4*__SIZEOF_POINTER__] ldr c29,[csp],#6*__SIZEOF_POINTER__ autiasp ret .size mul_by_8_mod_384,.-mul_by_8_mod_384 .globl mul_by_3_mod_384x .hidden mul_by_3_mod_384x .type mul_by_3_mod_384x,%function .align 5 mul_by_3_mod_384x: paciasp stp c29,c30,[csp,#-6*__SIZEOF_POINTER__]! add c29,csp,#0 stp c19,c20,[csp,#2*__SIZEOF_POINTER__] stp c21,c22,[csp,#4*__SIZEOF_POINTER__] ldp @a[0],@a[1],[$a_ptr] ldp @a[2],@a[3],[$a_ptr,#16] ldp @a[4],@a[5],[$a_ptr,#32] ldp @mod[0],@mod[1],[$b_ptr] ldp @mod[2],@mod[3],[$b_ptr,#16] ldp @mod[4],@mod[5],[$b_ptr,#32] bl __lshift_mod_384 ldp @b[0],@b[1],[$a_ptr] ldp @b[2],@b[3],[$a_ptr,#16] ldp @b[4],@b[5],[$a_ptr,#32] bl __add_mod_384_ab_are_loaded stp @a[0],@a[1],[$r_ptr] ldp @a[0],@a[1],[$a_ptr,#48] stp @a[2],@a[3],[$r_ptr,#16] ldp @a[2],@a[3],[$a_ptr,#64] stp @a[4],@a[5],[$r_ptr,#32] ldp @a[4],@a[5],[$a_ptr,#80] bl __lshift_mod_384 ldp @b[0],@b[1],[$a_ptr,#48] ldp @b[2],@b[3],[$a_ptr,#64] ldp @b[4],@b[5],[$a_ptr,#80] bl __add_mod_384_ab_are_loaded ldr c30,[csp,#__SIZEOF_POINTER__] stp @a[0],@a[1],[$r_ptr,#48] stp @a[2],@a[3],[$r_ptr,#64] stp @a[4],@a[5],[$r_ptr,#80] ldp c19,c20,[c29,#2*__SIZEOF_POINTER__] ldp c21,c22,[c29,#4*__SIZEOF_POINTER__] ldr c29,[csp],#6*__SIZEOF_POINTER__ autiasp ret .size mul_by_3_mod_384x,.-mul_by_3_mod_384x .globl mul_by_8_mod_384x .hidden mul_by_8_mod_384x .type mul_by_8_mod_384x,%function .align 5 mul_by_8_mod_384x: paciasp stp c29,c30,[csp,#-6*__SIZEOF_POINTER__]! 
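# Multiplication by 8 is simply three successive modular doublings, i.e.
# three bl __lshift_mod_384 calls per 384-bit half of the extension-field
# element.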
add c29,csp,#0 stp c19,c20,[csp,#2*__SIZEOF_POINTER__] stp c21,c22,[csp,#4*__SIZEOF_POINTER__] ldp @a[0],@a[1],[$a_ptr] ldp @a[2],@a[3],[$a_ptr,#16] ldp @a[4],@a[5],[$a_ptr,#32] ldp @mod[0],@mod[1],[$b_ptr] ldp @mod[2],@mod[3],[$b_ptr,#16] ldp @mod[4],@mod[5],[$b_ptr,#32] bl __lshift_mod_384 bl __lshift_mod_384 bl __lshift_mod_384 stp @a[0],@a[1],[$r_ptr] ldp @a[0],@a[1],[$a_ptr,#48] stp @a[2],@a[3],[$r_ptr,#16] ldp @a[2],@a[3],[$a_ptr,#64] stp @a[4],@a[5],[$r_ptr,#32] ldp @a[4],@a[5],[$a_ptr,#80] bl __lshift_mod_384 bl __lshift_mod_384 bl __lshift_mod_384 ldr c30,[csp,#__SIZEOF_POINTER__] stp @a[0],@a[1],[$r_ptr,#48] stp @a[2],@a[3],[$r_ptr,#64] stp @a[4],@a[5],[$r_ptr,#80] ldp c19,c20,[c29,#2*__SIZEOF_POINTER__] ldp c21,c22,[c29,#4*__SIZEOF_POINTER__] ldr c29,[csp],#6*__SIZEOF_POINTER__ autiasp ret .size mul_by_8_mod_384x,.-mul_by_8_mod_384x .globl cneg_mod_384 .hidden cneg_mod_384 .type cneg_mod_384,%function .align 5 cneg_mod_384: paciasp stp c29,c30,[csp,#-6*__SIZEOF_POINTER__]! add c29,csp,#0 stp c19,c20,[csp,#2*__SIZEOF_POINTER__] stp c21,c22,[csp,#4*__SIZEOF_POINTER__] ldp @a[0],@a[1],[$a_ptr] ldp @mod[0],@mod[1],[$n_ptr] ldp @a[2],@a[3],[$a_ptr,#16] ldp @mod[2],@mod[3],[$n_ptr,#16] subs @b[0],@mod[0],@a[0] ldp @a[4],@a[5],[$a_ptr,#32] ldp @mod[4],@mod[5],[$n_ptr,#32] orr $carry,@a[0],@a[1] sbcs @b[1],@mod[1],@a[1] orr $carry,$carry,@a[2] sbcs @b[2],@mod[2],@a[2] orr $carry,$carry,@a[3] sbcs @b[3],@mod[3],@a[3] orr $carry,$carry,@a[4] sbcs @b[4],@mod[4],@a[4] orr $carry,$carry,@a[5] sbc @b[5],@mod[5],@a[5] cmp $carry,#0 csetm $carry,ne ands $b_ptr,$b_ptr,$carry csel @a[0],@a[0],@b[0],eq csel @a[1],@a[1],@b[1],eq csel @a[2],@a[2],@b[2],eq csel @a[3],@a[3],@b[3],eq stp @a[0],@a[1],[$r_ptr] csel @a[4],@a[4],@b[4],eq stp @a[2],@a[3],[$r_ptr,#16] csel @a[5],@a[5],@b[5],eq stp @a[4],@a[5],[$r_ptr,#32] ldp c19,c20,[c29,#2*__SIZEOF_POINTER__] ldp c21,c22,[c29,#4*__SIZEOF_POINTER__] ldr c29,[csp],#6*__SIZEOF_POINTER__ autiasp ret .size cneg_mod_384,.-cneg_mod_384 .globl sub_mod_384 .hidden sub_mod_384 .type sub_mod_384,%function .align 5 sub_mod_384: paciasp stp c29,c30,[csp,#-6*__SIZEOF_POINTER__]! add c29,csp,#0 stp c19,c20,[csp,#2*__SIZEOF_POINTER__] stp c21,c22,[csp,#4*__SIZEOF_POINTER__] ldp @mod[0],@mod[1],[$n_ptr] ldp @mod[2],@mod[3],[$n_ptr,#16] ldp @mod[4],@mod[5],[$n_ptr,#32] bl __sub_mod_384 ldr c30,[csp,#__SIZEOF_POINTER__] stp @a[0],@a[1],[$r_ptr] stp @a[2],@a[3],[$r_ptr,#16] stp @a[4],@a[5],[$r_ptr,#32] ldp c19,c20,[c29,#2*__SIZEOF_POINTER__] ldp c21,c22,[c29,#4*__SIZEOF_POINTER__] ldr c29,[csp],#6*__SIZEOF_POINTER__ autiasp ret .size sub_mod_384,.-sub_mod_384 .type __sub_mod_384,%function .align 5 __sub_mod_384: ldp @a[0],@a[1],[$a_ptr] ldp @b[0],@b[1],[$b_ptr] ldp @a[2],@a[3],[$a_ptr,#16] ldp @b[2],@b[3],[$b_ptr,#16] ldp @a[4],@a[5],[$a_ptr,#32] ldp @b[4],@b[5],[$b_ptr,#32] subs @a[0],@a[0],@b[0] sbcs @a[1],@a[1],@b[1] sbcs @a[2],@a[2],@b[2] sbcs @a[3],@a[3],@b[3] sbcs @a[4],@a[4],@b[4] sbcs @a[5],@a[5],@b[5] sbc $carry,xzr,xzr and @b[0],@mod[0],$carry and @b[1],@mod[1],$carry adds @a[0],@a[0],@b[0] and @b[2],@mod[2],$carry adcs @a[1],@a[1],@b[1] and @b[3],@mod[3],$carry adcs @a[2],@a[2],@b[2] and @b[4],@mod[4],$carry adcs @a[3],@a[3],@b[3] and @b[5],@mod[5],$carry adcs @a[4],@a[4],@b[4] adc @a[5],@a[5],@b[5] ret .size __sub_mod_384,.-__sub_mod_384 .globl sub_mod_384x .hidden sub_mod_384x .type sub_mod_384x,%function .align 5 sub_mod_384x: paciasp stp c29,c30,[csp,#-6*__SIZEOF_POINTER__]! 
add c29,csp,#0 stp c19,c20,[csp,#2*__SIZEOF_POINTER__] stp c21,c22,[csp,#4*__SIZEOF_POINTER__] ldp @mod[0],@mod[1],[$n_ptr] ldp @mod[2],@mod[3],[$n_ptr,#16] ldp @mod[4],@mod[5],[$n_ptr,#32] bl __sub_mod_384 stp @a[0],@a[1],[$r_ptr] cadd $a_ptr,$a_ptr,#48 stp @a[2],@a[3],[$r_ptr,#16] cadd $b_ptr,$b_ptr,#48 stp @a[4],@a[5],[$r_ptr,#32] bl __sub_mod_384 ldr c30,[csp,#__SIZEOF_POINTER__] stp @a[0],@a[1],[$r_ptr,#48] stp @a[2],@a[3],[$r_ptr,#64] stp @a[4],@a[5],[$r_ptr,#80] ldp c19,c20,[c29,#2*__SIZEOF_POINTER__] ldp c21,c22,[c29,#4*__SIZEOF_POINTER__] ldr c29,[csp],#6*__SIZEOF_POINTER__ autiasp ret .size sub_mod_384x,.-sub_mod_384x .globl mul_by_1_plus_i_mod_384x .hidden mul_by_1_plus_i_mod_384x .type mul_by_1_plus_i_mod_384x,%function .align 5 mul_by_1_plus_i_mod_384x: paciasp stp c29,c30,[csp,#-6*__SIZEOF_POINTER__]! add c29,csp,#0 stp c19,c20,[csp,#2*__SIZEOF_POINTER__] stp c21,c22,[csp,#4*__SIZEOF_POINTER__] ldp @mod[0],@mod[1],[$b_ptr] ldp @mod[2],@mod[3],[$b_ptr,#16] ldp @mod[4],@mod[5],[$b_ptr,#32] cadd $b_ptr,$a_ptr,#48 bl __sub_mod_384 // a->re - a->im ldp @b[0],@b[1],[$a_ptr] ldp @b[2],@b[3],[$a_ptr,#16] ldp @b[4],@b[5],[$a_ptr,#32] stp @a[0],@a[1],[$r_ptr] ldp @a[0],@a[1],[$a_ptr,#48] stp @a[2],@a[3],[$r_ptr,#16] ldp @a[2],@a[3],[$a_ptr,#64] stp @a[4],@a[5],[$r_ptr,#32] ldp @a[4],@a[5],[$a_ptr,#80] bl __add_mod_384_ab_are_loaded // a->re + a->im ldr c30,[csp,#__SIZEOF_POINTER__] stp @a[0],@a[1],[$r_ptr,#48] stp @a[2],@a[3],[$r_ptr,#64] stp @a[4],@a[5],[$r_ptr,#80] ldp c19,c20,[c29,#2*__SIZEOF_POINTER__] ldp c21,c22,[c29,#4*__SIZEOF_POINTER__] ldr c29,[csp],#6*__SIZEOF_POINTER__ autiasp ret .size mul_by_1_plus_i_mod_384x,.-mul_by_1_plus_i_mod_384x .globl sgn0_pty_mod_384 .hidden sgn0_pty_mod_384 .type sgn0_pty_mod_384,%function .align 5 sgn0_pty_mod_384: hint #34 ldp @a[0],@a[1],[$r_ptr] ldp @a[2],@a[3],[$r_ptr,#16] ldp @a[4],@a[5],[$r_ptr,#32] ldp @mod[0],@mod[1],[$a_ptr] ldp @mod[2],@mod[3],[$a_ptr,#16] ldp @mod[4],@mod[5],[$a_ptr,#32] and $r_ptr,@a[0],#1 adds @a[0],@a[0],@a[0] adcs @a[1],@a[1],@a[1] adcs @a[2],@a[2],@a[2] adcs @a[3],@a[3],@a[3] adcs @a[4],@a[4],@a[4] adcs @a[5],@a[5],@a[5] adc $carry,xzr,xzr subs @a[0],@a[0],@mod[0] sbcs @a[1],@a[1],@mod[1] sbcs @a[2],@a[2],@mod[2] sbcs @a[3],@a[3],@mod[3] sbcs @a[4],@a[4],@mod[4] sbcs @a[5],@a[5],@mod[5] sbc $carry,$carry,xzr mvn $carry,$carry and $carry,$carry,#2 orr $r_ptr,$r_ptr,$carry ret .size sgn0_pty_mod_384,.-sgn0_pty_mod_384 .globl sgn0_pty_mod_384x .hidden sgn0_pty_mod_384x .type sgn0_pty_mod_384x,%function .align 5 sgn0_pty_mod_384x: hint #34 ldp @a[0],@a[1],[$r_ptr] ldp @a[2],@a[3],[$r_ptr,#16] ldp @a[4],@a[5],[$r_ptr,#32] ldp @mod[0],@mod[1],[$a_ptr] ldp @mod[2],@mod[3],[$a_ptr,#16] ldp @mod[4],@mod[5],[$a_ptr,#32] and $b_ptr,@a[0],#1 orr $n_ptr,@a[0],@a[1] adds @a[0],@a[0],@a[0] orr $n_ptr,$n_ptr,@a[2] adcs @a[1],@a[1],@a[1] orr $n_ptr,$n_ptr,@a[3] adcs @a[2],@a[2],@a[2] orr $n_ptr,$n_ptr,@a[4] adcs @a[3],@a[3],@a[3] orr $n_ptr,$n_ptr,@a[5] adcs @a[4],@a[4],@a[4] adcs @a[5],@a[5],@a[5] adc @b[0],xzr,xzr subs @a[0],@a[0],@mod[0] sbcs @a[1],@a[1],@mod[1] sbcs @a[2],@a[2],@mod[2] sbcs @a[3],@a[3],@mod[3] sbcs @a[4],@a[4],@mod[4] sbcs @a[5],@a[5],@mod[5] sbc @b[0],@b[0],xzr ldp @a[0],@a[1],[$r_ptr,#48] ldp @a[2],@a[3],[$r_ptr,#64] ldp @a[4],@a[5],[$r_ptr,#80] mvn @b[0],@b[0] and @b[0],@b[0],#2 orr $b_ptr,$b_ptr,@b[0] and $r_ptr,@a[0],#1 orr $a_ptr,@a[0],@a[1] adds @a[0],@a[0],@a[0] orr $a_ptr,$a_ptr,@a[2] adcs @a[1],@a[1],@a[1] orr $a_ptr,$a_ptr,@a[3] adcs @a[2],@a[2],@a[2] orr $a_ptr,$a_ptr,@a[4] adcs @a[3],@a[3],@a[3] 
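# The imaginary half gets the same treatment: bit 0 is the parity, and
# this double-then-subtract-p sequence sets bit 1 when 2*a > p, the
# "negative" convention this module uses for sgn0.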
orr $a_ptr,$a_ptr,@a[5] adcs @a[4],@a[4],@a[4] adcs @a[5],@a[5],@a[5] adc @b[0],xzr,xzr subs @a[0],@a[0],@mod[0] sbcs @a[1],@a[1],@mod[1] sbcs @a[2],@a[2],@mod[2] sbcs @a[3],@a[3],@mod[3] sbcs @a[4],@a[4],@mod[4] sbcs @a[5],@a[5],@mod[5] sbc @b[0],@b[0],xzr mvn @b[0],@b[0] and @b[0],@b[0],#2 orr $r_ptr,$r_ptr,@b[0] cmp $n_ptr,#0 csel $n_ptr,$r_ptr,$b_ptr,eq // a->re==0? prty(a->im) : prty(a->re) cmp $a_ptr,#0 csel $a_ptr,$r_ptr,$b_ptr,ne // a->im!=0? sgn0(a->im) : sgn0(a->re) and $n_ptr,$n_ptr,#1 and $a_ptr,$a_ptr,#2 orr $r_ptr,$a_ptr,$n_ptr // pack sign and parity ret .size sgn0_pty_mod_384x,.-sgn0_pty_mod_384x ___ if (1) { sub vec_select { my $sz = shift; my @v=map("v$_",(0..5,16..21)); $code.=<<___; .globl vec_select_$sz .hidden vec_select_$sz .type vec_select_$sz,%function .align 5 vec_select_$sz: hint #34 dup v6.2d, $n_ptr ld1 {@v[0].2d, @v[1].2d, @v[2].2d}, [$a_ptr],#48 cmeq v6.2d, v6.2d, #0 ld1 {@v[3].2d, @v[4].2d, @v[5].2d}, [$b_ptr],#48 ___ for($i=0; $i<$sz-48; $i+=48) { $code.=<<___; bit @v[0].16b, @v[3].16b, v6.16b ld1 {@v[6].2d, @v[7].2d, @v[8].2d}, [$a_ptr],#48 bit @v[1].16b, @v[4].16b, v6.16b ld1 {@v[9].2d, @v[10].2d, @v[11].2d}, [$b_ptr],#48 bit @v[2].16b, @v[5].16b, v6.16b st1 {@v[0].2d, @v[1].2d, @v[2].2d}, [$r_ptr],#48 ___ @v = @v[6..11,0..5]; } $code.=<<___; bit @v[0].16b, @v[3].16b, v6.16b bit @v[1].16b, @v[4].16b, v6.16b bit @v[2].16b, @v[5].16b, v6.16b st1 {@v[0].2d, @v[1].2d, @v[2].2d}, [$r_ptr] ret .size vec_select_$sz,.-vec_select_$sz ___ } $code.=<<___; .globl vec_select_32 .hidden vec_select_32 .type vec_select_32,%function .align 5 vec_select_32: hint #34 dup v6.2d, $n_ptr ld1 {v0.2d, v1.2d}, [$a_ptr] cmeq v6.2d, v6.2d, #0 ld1 {v3.2d, v4.2d}, [$b_ptr] bit v0.16b, v3.16b, v6.16b bit v1.16b, v4.16b, v6.16b st1 {v0.2d, v1.2d}, [$r_ptr] ret .size vec_select_32,.-vec_select_32 ___ vec_select(48); vec_select(96); vec_select(192); vec_select(144); vec_select(288); } { my ($inp, $end, $step) = map("x$_", (0..2)); $code.=<<___; .globl vec_prefetch .hidden vec_prefetch .type vec_prefetch,%function .align 5 vec_prefetch: hint #34 add $end, $end, $inp sub $end, $end, #1 mov $step, #64 prfm pldl1keep, [$inp] add $inp, $inp, $step cmp $inp, $end csel $inp, $end, $inp, hi csel $step, xzr, $step, hi prfm pldl1keep, [$inp] add $inp, $inp, $step cmp $inp, $end csel $inp, $end, $inp, hi csel $step, xzr, $step, hi prfm pldl1keep, [$inp] add $inp, $inp, $step cmp $inp, $end csel $inp, $end, $inp, hi csel $step, xzr, $step, hi prfm pldl1keep, [$inp] add $inp, $inp, $step cmp $inp, $end csel $inp, $end, $inp, hi csel $step, xzr, $step, hi prfm pldl1keep, [$inp] add $inp, $inp, $step cmp $inp, $end csel $inp, $end, $inp, hi csel $step, xzr, $step, hi prfm pldl1keep, [$inp] add $inp, $inp, $step cmp $inp, $end csel $inp, $end, $inp, hi prfm pldl1keep, [$inp] ret .size vec_prefetch,.-vec_prefetch ___ my $len = $end; $code.=<<___; .globl vec_is_zero_16x .hidden vec_is_zero_16x .type vec_is_zero_16x,%function .align 5 vec_is_zero_16x: hint #34 ld1 {v0.2d}, [$inp], #16 lsr $len, $len, #4 sub $len, $len, #1 cbz $len, .Loop_is_zero_done .Loop_is_zero: ld1 {v1.2d}, [$inp], #16 orr v0.16b, v0.16b, v1.16b sub $len, $len, #1 cbnz $len, .Loop_is_zero .Loop_is_zero_done: dup v1.2d, v0.2d[1] orr v0.16b, v0.16b, v1.16b umov x1, v0.2d[0] mov x0, #1 cmp x1, #0 csel x0, x0, xzr, eq ret .size vec_is_zero_16x,.-vec_is_zero_16x ___ } { my ($inp1, $inp2, $len) = map("x$_", (0..2)); $code.=<<___; .globl vec_is_equal_16x .hidden vec_is_equal_16x .type vec_is_equal_16x,%function .align 5 vec_is_equal_16x: hint 
#34 ld1 {v0.2d}, [$inp1], #16 ld1 {v1.2d}, [$inp2], #16 lsr $len, $len, #4 eor v0.16b, v0.16b, v1.16b .Loop_is_equal: sub $len, $len, #1 cbz $len, .Loop_is_equal_done ld1 {v1.2d}, [$inp1], #16 ld1 {v2.2d}, [$inp2], #16 eor v1.16b, v1.16b, v2.16b orr v0.16b, v0.16b, v1.16b b .Loop_is_equal nop .Loop_is_equal_done: dup v1.2d, v0.2d[1] orr v0.16b, v0.16b, v1.16b umov x1, v0.2d[0] mov x0, #1 cmp x1, #0 csel x0, x0, xzr, eq ret .size vec_is_equal_16x,.-vec_is_equal_16x ___ } print $code; close STDOUT; ================================================ FILE: src/asm/add_mod_384-x86_64.pl ================================================ #!/usr/bin/env perl # # Copyright Supranational LLC # Licensed under the Apache License, Version 2.0, see LICENSE for details. # SPDX-License-Identifier: Apache-2.0 $flavour = shift; $output = shift; if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or die "can't locate x86_64-xlate.pl"; open STDOUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"" or die "can't call $xlate: $!"; # common argument layout ($r_ptr,$a_ptr,$b_org,$n_ptr,$n0) = ("%rdi","%rsi","%rdx","%rcx","%r8"); $b_ptr = "%rbx"; { ############################################################## 384 bits add my @acc=map("%r$_",(8..15, "ax", "bx", "bp")); push(@acc, $a_ptr); $code.=<<___; .text .globl add_mod_384 .hidden add_mod_384 .type add_mod_384,\@function,4,"unwind" .align 32 add_mod_384: .cfi_startproc push %rbp .cfi_push %rbp push %rbx .cfi_push %rbx push %r12 .cfi_push %r12 push %r13 .cfi_push %r13 push %r14 .cfi_push %r14 push %r15 .cfi_push %r15 sub \$8, %rsp .cfi_adjust_cfa_offset 8 .cfi_end_prologue call __add_mod_384 mov 8(%rsp),%r15 .cfi_restore %r15 mov 16(%rsp),%r14 .cfi_restore %r14 mov 24(%rsp),%r13 .cfi_restore %r13 mov 32(%rsp),%r12 .cfi_restore %r12 mov 40(%rsp),%rbx .cfi_restore %rbx mov 48(%rsp),%rbp .cfi_restore %rbp lea 56(%rsp),%rsp .cfi_adjust_cfa_offset -56 .cfi_epilogue ret .cfi_endproc .size add_mod_384,.-add_mod_384 .type __add_mod_384,\@abi-omnipotent .align 32 __add_mod_384: #ifdef __SGX_LVI_HARDENING__ lfence #endif mov 8*0($a_ptr), @acc[0] mov 8*1($a_ptr), @acc[1] mov 8*2($a_ptr), @acc[2] mov 8*3($a_ptr), @acc[3] mov 8*4($a_ptr), @acc[4] mov 8*5($a_ptr), @acc[5] __add_mod_384_a_is_loaded: add 8*0($b_org), @acc[0] adc 8*1($b_org), @acc[1] adc 8*2($b_org), @acc[2] mov @acc[0], @acc[6] adc 8*3($b_org), @acc[3] mov @acc[1], @acc[7] adc 8*4($b_org), @acc[4] mov @acc[2], @acc[8] adc 8*5($b_org), @acc[5] mov @acc[3], @acc[9] sbb $b_org, $b_org sub 8*0($n_ptr), @acc[0] sbb 8*1($n_ptr), @acc[1] mov @acc[4], @acc[10] sbb 8*2($n_ptr), @acc[2] sbb 8*3($n_ptr), @acc[3] sbb 8*4($n_ptr), @acc[4] mov @acc[5], @acc[11] sbb 8*5($n_ptr), @acc[5] sbb \$0, $b_org cmovc @acc[6], @acc[0] cmovc @acc[7], @acc[1] cmovc @acc[8], @acc[2] mov @acc[0], 8*0($r_ptr) cmovc @acc[9], @acc[3] mov @acc[1], 8*1($r_ptr) cmovc @acc[10], @acc[4] mov @acc[2], 8*2($r_ptr) cmovc @acc[11], @acc[5] mov @acc[3], 8*3($r_ptr) mov @acc[4], 8*4($r_ptr) mov @acc[5], 8*5($r_ptr) ret .size __add_mod_384,.-__add_mod_384 .globl add_mod_384x .hidden add_mod_384x .type add_mod_384x,\@function,4,"unwind" .align 32 add_mod_384x: .cfi_startproc push %rbp .cfi_push %rbp push %rbx .cfi_push %rbx push %r12 .cfi_push %r12 push %r13 .cfi_push %r13 push %r14 .cfi_push %r14 push %r15 .cfi_push %r15 sub 
\$24, %rsp .cfi_adjust_cfa_offset 24 .cfi_end_prologue mov $a_ptr, 8*0(%rsp) mov $b_org, 8*1(%rsp) lea 48($a_ptr), $a_ptr # a->im lea 48($b_org), $b_org # b->im lea 48($r_ptr), $r_ptr # ret->im call __add_mod_384 # add_mod_384(ret->im, a->im, b->im, mod); mov 8*0(%rsp), $a_ptr # a->re mov 8*1(%rsp), $b_org # b->re lea -48($r_ptr), $r_ptr # ret->re call __add_mod_384 # add_mod_384(ret->re, a->re, b->re, mod); mov 24+8*0(%rsp),%r15 .cfi_restore %r15 mov 24+8*1(%rsp),%r14 .cfi_restore %r14 mov 24+8*2(%rsp),%r13 .cfi_restore %r13 mov 24+8*3(%rsp),%r12 .cfi_restore %r12 mov 24+8*4(%rsp),%rbx .cfi_restore %rbx mov 24+8*5(%rsp),%rbp .cfi_restore %rbp lea 24+8*6(%rsp),%rsp .cfi_adjust_cfa_offset -24-8*6 .cfi_epilogue ret .cfi_endproc .size add_mod_384x,.-add_mod_384x ######################################################################## .globl rshift_mod_384 .hidden rshift_mod_384 .type rshift_mod_384,\@function,4,"unwind" .align 32 rshift_mod_384: .cfi_startproc push %rbp .cfi_push %rbp push %rbx .cfi_push %rbx push %r12 .cfi_push %r12 push %r13 .cfi_push %r13 push %r14 .cfi_push %r14 push %r15 .cfi_push %r15 push $r_ptr .cfi_adjust_cfa_offset 8 .cfi_end_prologue #ifdef __SGX_LVI_HARDENING__ lfence #endif mov 8*0($a_ptr), @acc[0] mov 8*1($a_ptr), @acc[1] mov 8*2($a_ptr), @acc[2] mov 8*3($a_ptr), @acc[3] mov 8*4($a_ptr), @acc[4] mov 8*5($a_ptr), @acc[5] .Loop_rshift_mod_384: call __rshift_mod_384 dec %edx jnz .Loop_rshift_mod_384 mov @acc[0], 8*0($r_ptr) mov @acc[1], 8*1($r_ptr) mov @acc[2], 8*2($r_ptr) mov @acc[3], 8*3($r_ptr) mov @acc[4], 8*4($r_ptr) mov @acc[5], 8*5($r_ptr) mov 8(%rsp),%r15 .cfi_restore %r15 mov 16(%rsp),%r14 .cfi_restore %r14 mov 24(%rsp),%r13 .cfi_restore %r13 mov 32(%rsp),%r12 .cfi_restore %r12 mov 40(%rsp),%rbx .cfi_restore %rbx mov 48(%rsp),%rbp .cfi_restore %rbp lea 56(%rsp),%rsp .cfi_adjust_cfa_offset -56 .cfi_epilogue ret .cfi_endproc .size rshift_mod_384,.-rshift_mod_384 .type __rshift_mod_384,\@abi-omnipotent .align 32 __rshift_mod_384: mov \$1, @acc[11] mov 8*0($n_ptr), @acc[6] and @acc[0], @acc[11] mov 8*1($n_ptr), @acc[7] neg @acc[11] mov 8*2($n_ptr), @acc[8] and @acc[11], @acc[6] mov 8*3($n_ptr), @acc[9] and @acc[11], @acc[7] mov 8*4($n_ptr), @acc[10] and @acc[11], @acc[8] and @acc[11], @acc[9] and @acc[11], @acc[10] and 8*5($n_ptr), @acc[11] add @acc[0], @acc[6] adc @acc[1], @acc[7] adc @acc[2], @acc[8] adc @acc[3], @acc[9] adc @acc[4], @acc[10] adc @acc[5], @acc[11] sbb @acc[5], @acc[5] shr \$1, @acc[6] mov @acc[7], @acc[0] shr \$1, @acc[7] mov @acc[8], @acc[1] shr \$1, @acc[8] mov @acc[9], @acc[2] shr \$1, @acc[9] mov @acc[10], @acc[3] shr \$1, @acc[10] mov @acc[11], @acc[4] shr \$1, @acc[11] shl \$63, @acc[0] shl \$63, @acc[1] or @acc[6], @acc[0] shl \$63, @acc[2] or @acc[7], @acc[1] shl \$63, @acc[3] or @acc[8], @acc[2] shl \$63, @acc[4] or @acc[9], @acc[3] shl \$63, @acc[5] or @acc[10], @acc[4] or @acc[11], @acc[5] ret # __SGX_LVI_HARDENING_CLOBBER__=@acc[6] .size __rshift_mod_384,.-__rshift_mod_384 .globl div_by_2_mod_384 .hidden div_by_2_mod_384 .type div_by_2_mod_384,\@function,3,"unwind" .align 32 div_by_2_mod_384: .cfi_startproc push %rbp .cfi_push %rbp push %rbx .cfi_push %rbx push %r12 .cfi_push %r12 push %r13 .cfi_push %r13 push %r14 .cfi_push %r14 push %r15 .cfi_push %r15 push $r_ptr .cfi_adjust_cfa_offset 8 .cfi_end_prologue #ifdef __SGX_LVI_HARDENING__ lfence #endif mov 8*0($a_ptr), @acc[0] mov $b_org, $n_ptr mov 8*1($a_ptr), @acc[1] mov 8*2($a_ptr), @acc[2] mov 8*3($a_ptr), @acc[3] mov 8*4($a_ptr), @acc[4] mov 8*5($a_ptr), @acc[5] call 
__rshift_mod_384 mov @acc[0], 8*0($r_ptr) mov @acc[1], 8*1($r_ptr) mov @acc[2], 8*2($r_ptr) mov @acc[3], 8*3($r_ptr) mov @acc[4], 8*4($r_ptr) mov @acc[5], 8*5($r_ptr) mov 8(%rsp),%r15 .cfi_restore %r15 mov 16(%rsp),%r14 .cfi_restore %r14 mov 24(%rsp),%r13 .cfi_restore %r13 mov 32(%rsp),%r12 .cfi_restore %r12 mov 40(%rsp),%rbx .cfi_restore %rbx mov 48(%rsp),%rbp .cfi_restore %rbp lea 56(%rsp),%rsp .cfi_adjust_cfa_offset -56 .cfi_epilogue ret .cfi_endproc .size div_by_2_mod_384,.-div_by_2_mod_384 ######################################################################## .globl lshift_mod_384 .hidden lshift_mod_384 .type lshift_mod_384,\@function,4,"unwind" .align 32 lshift_mod_384: .cfi_startproc push %rbp .cfi_push %rbp push %rbx .cfi_push %rbx push %r12 .cfi_push %r12 push %r13 .cfi_push %r13 push %r14 .cfi_push %r14 push %r15 .cfi_push %r15 push $r_ptr .cfi_adjust_cfa_offset 8 .cfi_end_prologue #ifdef __SGX_LVI_HARDENING__ lfence #endif mov 8*0($a_ptr), @acc[0] mov 8*1($a_ptr), @acc[1] mov 8*2($a_ptr), @acc[2] mov 8*3($a_ptr), @acc[3] mov 8*4($a_ptr), @acc[4] mov 8*5($a_ptr), @acc[5] .Loop_lshift_mod_384: add @acc[0], @acc[0] adc @acc[1], @acc[1] adc @acc[2], @acc[2] mov @acc[0], @acc[6] adc @acc[3], @acc[3] mov @acc[1], @acc[7] adc @acc[4], @acc[4] mov @acc[2], @acc[8] adc @acc[5], @acc[5] mov @acc[3], @acc[9] sbb $r_ptr, $r_ptr sub 8*0($n_ptr), @acc[0] sbb 8*1($n_ptr), @acc[1] mov @acc[4], @acc[10] sbb 8*2($n_ptr), @acc[2] sbb 8*3($n_ptr), @acc[3] sbb 8*4($n_ptr), @acc[4] mov @acc[5], @acc[11] sbb 8*5($n_ptr), @acc[5] sbb \$0, $r_ptr mov (%rsp), $r_ptr cmovc @acc[6], @acc[0] cmovc @acc[7], @acc[1] cmovc @acc[8], @acc[2] cmovc @acc[9], @acc[3] cmovc @acc[10], @acc[4] cmovc @acc[11], @acc[5] dec %edx jnz .Loop_lshift_mod_384 mov @acc[0], 8*0($r_ptr) mov @acc[1], 8*1($r_ptr) mov @acc[2], 8*2($r_ptr) mov @acc[3], 8*3($r_ptr) mov @acc[4], 8*4($r_ptr) mov @acc[5], 8*5($r_ptr) mov 8(%rsp),%r15 .cfi_restore %r15 mov 16(%rsp),%r14 .cfi_restore %r14 mov 24(%rsp),%r13 .cfi_restore %r13 mov 32(%rsp),%r12 .cfi_restore %r12 mov 40(%rsp),%rbx .cfi_restore %rbx mov 48(%rsp),%rbp .cfi_restore %rbp lea 56(%rsp),%rsp .cfi_adjust_cfa_offset -56 .cfi_epilogue ret .cfi_endproc .size lshift_mod_384,.-lshift_mod_384 .type __lshift_mod_384,\@abi-omnipotent .align 32 __lshift_mod_384: add @acc[0], @acc[0] adc @acc[1], @acc[1] adc @acc[2], @acc[2] mov @acc[0], @acc[6] adc @acc[3], @acc[3] mov @acc[1], @acc[7] adc @acc[4], @acc[4] mov @acc[2], @acc[8] adc @acc[5], @acc[5] mov @acc[3], @acc[9] sbb $b_org, $b_org sub 8*0($n_ptr), @acc[0] sbb 8*1($n_ptr), @acc[1] mov @acc[4], @acc[10] sbb 8*2($n_ptr), @acc[2] sbb 8*3($n_ptr), @acc[3] sbb 8*4($n_ptr), @acc[4] mov @acc[5], @acc[11] sbb 8*5($n_ptr), @acc[5] sbb \$0, $b_org cmovc @acc[6], @acc[0] cmovc @acc[7], @acc[1] cmovc @acc[8], @acc[2] cmovc @acc[9], @acc[3] cmovc @acc[10], @acc[4] cmovc @acc[11], @acc[5] ret .size __lshift_mod_384,.-__lshift_mod_384 ######################################################################## .globl mul_by_3_mod_384 .hidden mul_by_3_mod_384 .type mul_by_3_mod_384,\@function,3,"unwind" .align 32 mul_by_3_mod_384: .cfi_startproc push %rbp .cfi_push %rbp push %rbx .cfi_push %rbx push %r12 .cfi_push %r12 push %r13 .cfi_push %r13 push %r14 .cfi_push %r14 push %r15 .cfi_push %r15 push $a_ptr .cfi_adjust_cfa_offset 8 .cfi_end_prologue #ifdef __SGX_LVI_HARDENING__ lfence #endif mov 8*0($a_ptr), @acc[0] mov 8*1($a_ptr), @acc[1] mov 8*2($a_ptr), @acc[2] mov 8*3($a_ptr), @acc[3] mov 8*4($a_ptr), @acc[4] mov 8*5($a_ptr), @acc[5] mov $b_org, $n_ptr 
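# mul_by_3 = 2*a + a: one modular doubling, then a modular addition with
# the original operand, whose pointer was pushed in the prologue and is
# reloaded from the stack below.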
call __lshift_mod_384 mov (%rsp), $b_org #ifdef __SGX_LVI_HARDENING__ lfence #endif call __add_mod_384_a_is_loaded mov 8(%rsp),%r15 .cfi_restore %r15 mov 16(%rsp),%r14 .cfi_restore %r14 mov 24(%rsp),%r13 .cfi_restore %r13 mov 32(%rsp),%r12 .cfi_restore %r12 mov 40(%rsp),%rbx .cfi_restore %rbx mov 48(%rsp),%rbp .cfi_restore %rbp lea 56(%rsp),%rsp .cfi_adjust_cfa_offset -56 .cfi_epilogue ret .cfi_endproc .size mul_by_3_mod_384,.-mul_by_3_mod_384 .globl mul_by_8_mod_384 .hidden mul_by_8_mod_384 .type mul_by_8_mod_384,\@function,3,"unwind" .align 32 mul_by_8_mod_384: .cfi_startproc push %rbp .cfi_push %rbp push %rbx .cfi_push %rbx push %r12 .cfi_push %r12 push %r13 .cfi_push %r13 push %r14 .cfi_push %r14 push %r15 .cfi_push %r15 sub \$8, %rsp .cfi_adjust_cfa_offset 8 .cfi_end_prologue #ifdef __SGX_LVI_HARDENING__ lfence #endif mov 8*0($a_ptr), @acc[0] mov 8*1($a_ptr), @acc[1] mov 8*2($a_ptr), @acc[2] mov 8*3($a_ptr), @acc[3] mov 8*4($a_ptr), @acc[4] mov 8*5($a_ptr), @acc[5] mov $b_org, $n_ptr call __lshift_mod_384 call __lshift_mod_384 call __lshift_mod_384 mov @acc[0], 8*0($r_ptr) mov @acc[1], 8*1($r_ptr) mov @acc[2], 8*2($r_ptr) mov @acc[3], 8*3($r_ptr) mov @acc[4], 8*4($r_ptr) mov @acc[5], 8*5($r_ptr) mov 8(%rsp),%r15 .cfi_restore %r15 mov 16(%rsp),%r14 .cfi_restore %r14 mov 24(%rsp),%r13 .cfi_restore %r13 mov 32(%rsp),%r12 .cfi_restore %r12 mov 40(%rsp),%rbx .cfi_restore %rbx mov 48(%rsp),%rbp .cfi_restore %rbp lea 56(%rsp),%rsp .cfi_adjust_cfa_offset -56 .cfi_epilogue ret .cfi_endproc .size mul_by_8_mod_384,.-mul_by_8_mod_384 ######################################################################## .globl mul_by_3_mod_384x .hidden mul_by_3_mod_384x .type mul_by_3_mod_384x,\@function,3,"unwind" .align 32 mul_by_3_mod_384x: .cfi_startproc push %rbp .cfi_push %rbp push %rbx .cfi_push %rbx push %r12 .cfi_push %r12 push %r13 .cfi_push %r13 push %r14 .cfi_push %r14 push %r15 .cfi_push %r15 push $a_ptr .cfi_adjust_cfa_offset 8 .cfi_end_prologue #ifdef __SGX_LVI_HARDENING__ lfence #endif mov 8*0($a_ptr), @acc[0] mov 8*1($a_ptr), @acc[1] mov 8*2($a_ptr), @acc[2] mov 8*3($a_ptr), @acc[3] mov 8*4($a_ptr), @acc[4] mov 8*5($a_ptr), @acc[5] mov $b_org, $n_ptr call __lshift_mod_384 mov (%rsp), $b_org #ifdef __SGX_LVI_HARDENING__ lfence #endif call __add_mod_384_a_is_loaded mov (%rsp), $a_ptr lea 8*6($r_ptr), $r_ptr #ifdef __SGX_LVI_HARDENING__ lfence #endif mov 8*6($a_ptr), @acc[0] mov 8*7($a_ptr), @acc[1] mov 8*8($a_ptr), @acc[2] mov 8*9($a_ptr), @acc[3] mov 8*10($a_ptr), @acc[4] mov 8*11($a_ptr), @acc[5] call __lshift_mod_384 mov \$8*6, $b_org add (%rsp), $b_org #ifdef __SGX_LVI_HARDENING__ lfence #endif call __add_mod_384_a_is_loaded mov 8(%rsp),%r15 .cfi_restore %r15 mov 16(%rsp),%r14 .cfi_restore %r14 mov 24(%rsp),%r13 .cfi_restore %r13 mov 32(%rsp),%r12 .cfi_restore %r12 mov 40(%rsp),%rbx .cfi_restore %rbx mov 48(%rsp),%rbp .cfi_restore %rbp lea 56(%rsp),%rsp .cfi_adjust_cfa_offset -56 .cfi_epilogue ret .cfi_endproc .size mul_by_3_mod_384x,.-mul_by_3_mod_384x .globl mul_by_8_mod_384x .hidden mul_by_8_mod_384x .type mul_by_8_mod_384x,\@function,3,"unwind" .align 32 mul_by_8_mod_384x: .cfi_startproc push %rbp .cfi_push %rbp push %rbx .cfi_push %rbx push %r12 .cfi_push %r12 push %r13 .cfi_push %r13 push %r14 .cfi_push %r14 push %r15 .cfi_push %r15 push $a_ptr .cfi_adjust_cfa_offset 8 .cfi_end_prologue #ifdef __SGX_LVI_HARDENING__ lfence #endif mov 8*0($a_ptr), @acc[0] mov 8*1($a_ptr), @acc[1] mov 8*2($a_ptr), @acc[2] mov 8*3($a_ptr), @acc[3] mov 8*4($a_ptr), @acc[4] mov 8*5($a_ptr), @acc[5] mov 
$b_org, $n_ptr call __lshift_mod_384 call __lshift_mod_384 call __lshift_mod_384 mov (%rsp), $a_ptr mov @acc[0], 8*0($r_ptr) mov @acc[1], 8*1($r_ptr) mov @acc[2], 8*2($r_ptr) mov @acc[3], 8*3($r_ptr) mov @acc[4], 8*4($r_ptr) mov @acc[5], 8*5($r_ptr) #ifdef __SGX_LVI_HARDENING__ lfence #endif mov 48+8*0($a_ptr), @acc[0] mov 48+8*1($a_ptr), @acc[1] mov 48+8*2($a_ptr), @acc[2] mov 48+8*3($a_ptr), @acc[3] mov 48+8*4($a_ptr), @acc[4] mov 48+8*5($a_ptr), @acc[5] call __lshift_mod_384 call __lshift_mod_384 call __lshift_mod_384 mov @acc[0], 48+8*0($r_ptr) mov @acc[1], 48+8*1($r_ptr) mov @acc[2], 48+8*2($r_ptr) mov @acc[3], 48+8*3($r_ptr) mov @acc[4], 48+8*4($r_ptr) mov @acc[5], 48+8*5($r_ptr) mov 8(%rsp),%r15 .cfi_restore %r15 mov 16(%rsp),%r14 .cfi_restore %r14 mov 24(%rsp),%r13 .cfi_restore %r13 mov 32(%rsp),%r12 .cfi_restore %r12 mov 40(%rsp),%rbx .cfi_restore %rbx mov 48(%rsp),%rbp .cfi_restore %rbp lea 56(%rsp),%rsp .cfi_adjust_cfa_offset -56 .cfi_epilogue ret .cfi_endproc .size mul_by_8_mod_384x,.-mul_by_8_mod_384x ######################################################################## .globl cneg_mod_384 .hidden cneg_mod_384 .type cneg_mod_384,\@function,4,"unwind" .align 32 cneg_mod_384: .cfi_startproc push %rbp .cfi_push %rbp push %rbx .cfi_push %rbx push %r12 .cfi_push %r12 push %r13 .cfi_push %r13 push %r14 .cfi_push %r14 push %r15 .cfi_push %r15 push $b_org # condition flag .cfi_adjust_cfa_offset 8 .cfi_end_prologue #ifdef __SGX_LVI_HARDENING__ lfence #endif mov 8*0($a_ptr), $b_org # load a[0:5] mov 8*1($a_ptr), @acc[1] mov 8*2($a_ptr), @acc[2] mov $b_org, @acc[0] mov 8*3($a_ptr), @acc[3] or @acc[1], $b_org mov 8*4($a_ptr), @acc[4] or @acc[2], $b_org mov 8*5($a_ptr), @acc[5] or @acc[3], $b_org mov \$-1, @acc[11] or @acc[4], $b_org or @acc[5], $b_org mov 8*0($n_ptr), @acc[6] # load n[0:5] cmovnz @acc[11], $b_org # mask = a[0:5] ? -1 : 0 mov 8*1($n_ptr), @acc[7] mov 8*2($n_ptr), @acc[8] and $b_org, @acc[6] # n[0:5] &= mask mov 8*3($n_ptr), @acc[9] and $b_org, @acc[7] mov 8*4($n_ptr), @acc[10] and $b_org, @acc[8] mov 8*5($n_ptr), @acc[11] and $b_org, @acc[9] mov 0(%rsp), $n_ptr # restore condition flag and $b_org, @acc[10] and $b_org, @acc[11] sub @acc[0], @acc[6] # a[0:5] ? n[0:5]-a[0:5] : 0-0 sbb @acc[1], @acc[7] sbb @acc[2], @acc[8] sbb @acc[3], @acc[9] sbb @acc[4], @acc[10] sbb @acc[5], @acc[11] or $n_ptr, $n_ptr # check condition flag cmovz @acc[0], @acc[6] # flag ? 
n[0:5]-a[0:5] : a[0:5] cmovz @acc[1], @acc[7] cmovz @acc[2], @acc[8] mov @acc[6], 8*0($r_ptr) cmovz @acc[3], @acc[9] mov @acc[7], 8*1($r_ptr) cmovz @acc[4], @acc[10] mov @acc[8], 8*2($r_ptr) cmovz @acc[5], @acc[11] mov @acc[9], 8*3($r_ptr) mov @acc[10], 8*4($r_ptr) mov @acc[11], 8*5($r_ptr) mov 8(%rsp),%r15 .cfi_restore %r15 mov 16(%rsp),%r14 .cfi_restore %r14 mov 24(%rsp),%r13 .cfi_restore %r13 mov 32(%rsp),%r12 .cfi_restore %r12 mov 40(%rsp),%rbx .cfi_restore %rbx mov 48(%rsp),%rbp .cfi_restore %rbp lea 56(%rsp),%rsp .cfi_adjust_cfa_offset -56 .cfi_epilogue ret .cfi_endproc .size cneg_mod_384,.-cneg_mod_384 ######################################################################## .globl sub_mod_384 .hidden sub_mod_384 .type sub_mod_384,\@function,4,"unwind" .align 32 sub_mod_384: .cfi_startproc push %rbp .cfi_push %rbp push %rbx .cfi_push %rbx push %r12 .cfi_push %r12 push %r13 .cfi_push %r13 push %r14 .cfi_push %r14 push %r15 .cfi_push %r15 sub \$8, %rsp .cfi_adjust_cfa_offset 8 .cfi_end_prologue call __sub_mod_384 mov 8(%rsp),%r15 .cfi_restore %r15 mov 16(%rsp),%r14 .cfi_restore %r14 mov 24(%rsp),%r13 .cfi_restore %r13 mov 32(%rsp),%r12 .cfi_restore %r12 mov 40(%rsp),%rbx .cfi_restore %rbx mov 48(%rsp),%rbp .cfi_restore %rbp lea 56(%rsp),%rsp .cfi_adjust_cfa_offset -56 .cfi_epilogue ret .cfi_endproc .size sub_mod_384,.-sub_mod_384 .type __sub_mod_384,\@abi-omnipotent .align 32 __sub_mod_384: #ifdef __SGX_LVI_HARDENING__ lfence #endif mov 8*0($a_ptr), @acc[0] mov 8*1($a_ptr), @acc[1] mov 8*2($a_ptr), @acc[2] mov 8*3($a_ptr), @acc[3] mov 8*4($a_ptr), @acc[4] mov 8*5($a_ptr), @acc[5] sub 8*0($b_org), @acc[0] mov 8*0($n_ptr), @acc[6] sbb 8*1($b_org), @acc[1] mov 8*1($n_ptr), @acc[7] sbb 8*2($b_org), @acc[2] mov 8*2($n_ptr), @acc[8] sbb 8*3($b_org), @acc[3] mov 8*3($n_ptr), @acc[9] sbb 8*4($b_org), @acc[4] mov 8*4($n_ptr), @acc[10] sbb 8*5($b_org), @acc[5] mov 8*5($n_ptr), @acc[11] sbb $b_org, $b_org and $b_org, @acc[6] and $b_org, @acc[7] and $b_org, @acc[8] and $b_org, @acc[9] and $b_org, @acc[10] and $b_org, @acc[11] add @acc[6], @acc[0] adc @acc[7], @acc[1] mov @acc[0], 8*0($r_ptr) adc @acc[8], @acc[2] mov @acc[1], 8*1($r_ptr) adc @acc[9], @acc[3] mov @acc[2], 8*2($r_ptr) adc @acc[10], @acc[4] mov @acc[3], 8*3($r_ptr) adc @acc[11], @acc[5] mov @acc[4], 8*4($r_ptr) mov @acc[5], 8*5($r_ptr) ret .size __sub_mod_384,.-__sub_mod_384 .globl sub_mod_384x .hidden sub_mod_384x .type sub_mod_384x,\@function,4,"unwind" .align 32 sub_mod_384x: .cfi_startproc push %rbp .cfi_push %rbp push %rbx .cfi_push %rbx push %r12 .cfi_push %r12 push %r13 .cfi_push %r13 push %r14 .cfi_push %r14 push %r15 .cfi_push %r15 sub \$24, %rsp .cfi_adjust_cfa_offset 24 .cfi_end_prologue mov $a_ptr, 8*0(%rsp) mov $b_org, 8*1(%rsp) lea 48($a_ptr), $a_ptr # a->im lea 48($b_org), $b_org # b->im lea 48($r_ptr), $r_ptr # ret->im call __sub_mod_384 # sub_mod_384(ret->im, a->im, b->im, mod); mov 8*0(%rsp), $a_ptr # a->re mov 8*1(%rsp), $b_org # b->re lea -48($r_ptr), $r_ptr # ret->re call __sub_mod_384 # sub_mod_384(ret->re, a->re, b->re, mod); mov 24+8*0(%rsp),%r15 .cfi_restore %r15 mov 24+8*1(%rsp),%r14 .cfi_restore %r14 mov 24+8*2(%rsp),%r13 .cfi_restore %r13 mov 24+8*3(%rsp),%r12 .cfi_restore %r12 mov 24+8*4(%rsp),%rbx .cfi_restore %rbx mov 24+8*5(%rsp),%rbp .cfi_restore %rbp lea 24+8*6(%rsp),%rsp .cfi_adjust_cfa_offset -24-8*6 .cfi_epilogue ret .cfi_endproc .size sub_mod_384x,.-sub_mod_384x ___ } { ###################################################### ret = a * (1 + i) my ($r_ptr,$a_ptr,$n_ptr) = ("%rdi","%rsi","%rdx"); my 
@acc=map("%r$_",(8..15, "ax", "bx", "cx", "bp")); $code.=<<___; .globl mul_by_1_plus_i_mod_384x .hidden mul_by_1_plus_i_mod_384x .type mul_by_1_plus_i_mod_384x,\@function,3,"unwind" .align 32 mul_by_1_plus_i_mod_384x: .cfi_startproc push %rbp .cfi_push %rbp push %rbx .cfi_push %rbx push %r12 .cfi_push %r12 push %r13 .cfi_push %r13 push %r14 .cfi_push %r14 push %r15 .cfi_push %r15 sub \$56, %rsp .cfi_adjust_cfa_offset 56 .cfi_end_prologue #ifdef __SGX_LVI_HARDENING__ lfence #endif mov 8*0($a_ptr), @acc[0] mov 8*1($a_ptr), @acc[1] mov 8*2($a_ptr), @acc[2] mov 8*3($a_ptr), @acc[3] mov 8*4($a_ptr), @acc[4] mov 8*5($a_ptr), @acc[5] mov @acc[0], @acc[6] add 8*6($a_ptr), @acc[0] # a->re + a->im mov @acc[1], @acc[7] adc 8*7($a_ptr), @acc[1] mov @acc[2], @acc[8] adc 8*8($a_ptr), @acc[2] mov @acc[3], @acc[9] adc 8*9($a_ptr), @acc[3] mov @acc[4], @acc[10] adc 8*10($a_ptr), @acc[4] mov @acc[5], @acc[11] adc 8*11($a_ptr), @acc[5] mov $r_ptr, 8*6(%rsp) # offload r_ptr sbb $r_ptr, $r_ptr sub 8*6($a_ptr), @acc[6] # a->re - a->im sbb 8*7($a_ptr), @acc[7] sbb 8*8($a_ptr), @acc[8] sbb 8*9($a_ptr), @acc[9] sbb 8*10($a_ptr), @acc[10] sbb 8*11($a_ptr), @acc[11] sbb $a_ptr, $a_ptr mov @acc[0], 8*0(%rsp) # offload a->re + a->im [without carry] mov 8*0($n_ptr), @acc[0] mov @acc[1], 8*1(%rsp) mov 8*1($n_ptr), @acc[1] mov @acc[2], 8*2(%rsp) mov 8*2($n_ptr), @acc[2] mov @acc[3], 8*3(%rsp) mov 8*3($n_ptr), @acc[3] mov @acc[4], 8*4(%rsp) and $a_ptr, @acc[0] mov 8*4($n_ptr), @acc[4] mov @acc[5], 8*5(%rsp) and $a_ptr, @acc[1] mov 8*5($n_ptr), @acc[5] and $a_ptr, @acc[2] and $a_ptr, @acc[3] and $a_ptr, @acc[4] and $a_ptr, @acc[5] mov 8*6(%rsp), $a_ptr # restore r_ptr add @acc[0], @acc[6] mov 8*0(%rsp), @acc[0] # restore a->re + a->im adc @acc[1], @acc[7] mov 8*1(%rsp), @acc[1] adc @acc[2], @acc[8] mov 8*2(%rsp), @acc[2] adc @acc[3], @acc[9] mov 8*3(%rsp), @acc[3] adc @acc[4], @acc[10] mov 8*4(%rsp), @acc[4] adc @acc[5], @acc[11] mov 8*5(%rsp), @acc[5] mov @acc[6], 8*0($a_ptr) # ret->re = a->re - a->im mov @acc[0], @acc[6] mov @acc[7], 8*1($a_ptr) mov @acc[8], 8*2($a_ptr) mov @acc[1], @acc[7] mov @acc[9], 8*3($a_ptr) mov @acc[10], 8*4($a_ptr) mov @acc[2], @acc[8] mov @acc[11], 8*5($a_ptr) sub 8*0($n_ptr), @acc[0] mov @acc[3], @acc[9] sbb 8*1($n_ptr), @acc[1] sbb 8*2($n_ptr), @acc[2] mov @acc[4], @acc[10] sbb 8*3($n_ptr), @acc[3] sbb 8*4($n_ptr), @acc[4] mov @acc[5], @acc[11] sbb 8*5($n_ptr), @acc[5] sbb \$0, $r_ptr cmovc @acc[6], @acc[0] cmovc @acc[7], @acc[1] cmovc @acc[8], @acc[2] mov @acc[0], 8*6($a_ptr) # ret->im = a->re + a->im cmovc @acc[9], @acc[3] mov @acc[1], 8*7($a_ptr) cmovc @acc[10], @acc[4] mov @acc[2], 8*8($a_ptr) cmovc @acc[11], @acc[5] mov @acc[3], 8*9($a_ptr) mov @acc[4], 8*10($a_ptr) mov @acc[5], 8*11($a_ptr) mov 56+8*0(%rsp),%r15 .cfi_restore %r15 mov 56+8*1(%rsp),%r14 .cfi_restore %r14 mov 56+8*2(%rsp),%r13 .cfi_restore %r13 mov 56+8*3(%rsp),%r12 .cfi_restore %r12 mov 56+8*4(%rsp),%rbx .cfi_restore %rbx mov 56+8*5(%rsp),%rbp .cfi_restore %rbp lea 56+8*6(%rsp),%rsp .cfi_adjust_cfa_offset -56-8*6 .cfi_epilogue ret .cfi_endproc .size mul_by_1_plus_i_mod_384x,.-mul_by_1_plus_i_mod_384x ___ } { ###################################################### my ($r_ptr,$n_ptr) = ("%rdi","%rsi"); my @acc=map("%r$_",(8..11, "cx", "dx", "bx", "bp")); $code.=<<___; .globl sgn0_pty_mod_384 .hidden sgn0_pty_mod_384 .type sgn0_pty_mod_384,\@function,2,"unwind" .align 32 sgn0_pty_mod_384: .cfi_startproc .cfi_end_prologue #ifdef __SGX_LVI_HARDENING__ lfence #endif mov 8*0($r_ptr), @acc[0] mov 8*1($r_ptr), @acc[1] mov 
8*2($r_ptr), @acc[2] mov 8*3($r_ptr), @acc[3] mov 8*4($r_ptr), @acc[4] mov 8*5($r_ptr), @acc[5] xor %rax, %rax mov @acc[0], $r_ptr add @acc[0], @acc[0] adc @acc[1], @acc[1] adc @acc[2], @acc[2] adc @acc[3], @acc[3] adc @acc[4], @acc[4] adc @acc[5], @acc[5] adc \$0, %rax sub 8*0($n_ptr), @acc[0] sbb 8*1($n_ptr), @acc[1] sbb 8*2($n_ptr), @acc[2] sbb 8*3($n_ptr), @acc[3] sbb 8*4($n_ptr), @acc[4] sbb 8*5($n_ptr), @acc[5] sbb \$0, %rax not %rax # 2*x > p, which means "negative" and \$1, $r_ptr and \$2, %rax or $r_ptr, %rax # pack sign and parity .cfi_epilogue ret .cfi_endproc .size sgn0_pty_mod_384,.-sgn0_pty_mod_384 .globl sgn0_pty_mod_384x .hidden sgn0_pty_mod_384x .type sgn0_pty_mod_384x,\@function,2,"unwind" .align 32 sgn0_pty_mod_384x: .cfi_startproc push %rbp .cfi_push %rbp push %rbx .cfi_push %rbx sub \$8, %rsp .cfi_adjust_cfa_offset 8 .cfi_end_prologue #ifdef __SGX_LVI_HARDENING__ lfence #endif mov 8*6($r_ptr), @acc[0] # sgn0(a->im) mov 8*7($r_ptr), @acc[1] mov 8*8($r_ptr), @acc[2] mov 8*9($r_ptr), @acc[3] mov 8*10($r_ptr), @acc[4] mov 8*11($r_ptr), @acc[5] mov @acc[0], @acc[6] or @acc[1], @acc[0] or @acc[2], @acc[0] or @acc[3], @acc[0] or @acc[4], @acc[0] or @acc[5], @acc[0] lea 0($r_ptr), %rax # sgn0(a->re) xor $r_ptr, $r_ptr mov @acc[6], @acc[7] add @acc[6], @acc[6] adc @acc[1], @acc[1] adc @acc[2], @acc[2] adc @acc[3], @acc[3] adc @acc[4], @acc[4] adc @acc[5], @acc[5] adc \$0, $r_ptr sub 8*0($n_ptr), @acc[6] sbb 8*1($n_ptr), @acc[1] sbb 8*2($n_ptr), @acc[2] sbb 8*3($n_ptr), @acc[3] sbb 8*4($n_ptr), @acc[4] sbb 8*5($n_ptr), @acc[5] sbb \$0, $r_ptr mov @acc[0], 0(%rsp) # a->im is zero or not not $r_ptr # 2*x > p, which means "negative" and \$1, @acc[7] and \$2, $r_ptr or @acc[7], $r_ptr # pack sign and parity mov 8*0(%rax), @acc[0] mov 8*1(%rax), @acc[1] mov 8*2(%rax), @acc[2] mov 8*3(%rax), @acc[3] mov 8*4(%rax), @acc[4] mov 8*5(%rax), @acc[5] mov @acc[0], @acc[6] or @acc[1], @acc[0] or @acc[2], @acc[0] or @acc[3], @acc[0] or @acc[4], @acc[0] or @acc[5], @acc[0] xor %rax, %rax mov @acc[6], @acc[7] add @acc[6], @acc[6] adc @acc[1], @acc[1] adc @acc[2], @acc[2] adc @acc[3], @acc[3] adc @acc[4], @acc[4] adc @acc[5], @acc[5] adc \$0, %rax sub 8*0($n_ptr), @acc[6] sbb 8*1($n_ptr), @acc[1] sbb 8*2($n_ptr), @acc[2] sbb 8*3($n_ptr), @acc[3] sbb 8*4($n_ptr), @acc[4] sbb 8*5($n_ptr), @acc[5] sbb \$0, %rax mov 0(%rsp), @acc[6] not %rax # 2*x > p, which means "negative" test @acc[0], @acc[0] cmovz $r_ptr, @acc[7] # a->re==0? prty(a->im) : prty(a->re) test @acc[6], @acc[6] cmovnz $r_ptr, %rax # a->im!=0? sgn0(a->im) : sgn0(a->re) and \$1, @acc[7] and \$2, %rax or @acc[7], %rax # pack sign and parity mov 8(%rsp), %rbx .cfi_restore %rbx mov 16(%rsp), %rbp .cfi_restore %rbp lea 24(%rsp), %rsp .cfi_adjust_cfa_offset -24 .cfi_epilogue ret .cfi_endproc .size sgn0_pty_mod_384x,.-sgn0_pty_mod_384x ___ } if (0) { my $inp = $win64 ? 
"%rcx" : "%rdi"; $code.=<<___; .globl nbits_384 .hidden nbits_384 .type nbits_384,\@abi-omnipotent .align 32 nbits_384: mov 8*5($inp), %r8 mov 8*4($inp), %r9 mov 8*3($inp), %r10 mov 8*2($inp), %r11 mov \$-1, %rdx mov \$127, %eax bsr %r8, %r8 cmovnz %rdx,%r9 cmovz %rax,%r8 bsr %r9, %r9 cmovnz %rdx,%r10 cmovz %rax,%r9 xor \$63,%r8 bsr %r10, %r10 cmovnz %rdx, %r11 cmovz %rax, %r10 xor \$63,%r9 add %r8, %r9 mov 8*1($inp), %r8 bsr %r11, %r11 cmovnz %rdx, %r8 cmovz %rax, %r11 xor \$63, %r10 add %r9, %r10 mov 8*0($inp), %r9 bsr %r8, %r8 cmovnz %rdx, %r9 cmovz %rax, %r8 xor \$63, %r11 add %r10, %r11 bsr %r9, %r9 cmovz %rax, %r9 xor \$63, %r8 add %r11, %r8 xor \$63, %r9 add %r8, %r9 mov \$384, %eax sub %r9, %rax ret .size nbits_384,.-nbits_384 ___ } if (1) { my ($out, $inp1, $inp2, $select) = $win64 ? ("%rcx", "%rdx", "%r8", "%r9d") : ("%rdi", "%rsi", "%rdx", "%ecx"); sub vec_select { my $sz = shift; my $half = $sz/2; my ($xmm0,$xmm1,$xmm2,$xmm3)=map("%xmm$_",(0..3)); $code.=<<___; .globl vec_select_$sz .hidden vec_select_$sz .type vec_select_$sz,\@abi-omnipotent .align 32 vec_select_$sz: movd $select, %xmm5 pxor %xmm4,%xmm4 pshufd \$0,%xmm5,%xmm5 # broadcast #ifdef __SGX_LVI_HARDENING__ lfence #endif movdqu ($inp1),$xmm0 lea $half($inp1),$inp1 pcmpeqd %xmm4,%xmm5 movdqu ($inp2),$xmm1 lea $half($inp2),$inp2 pcmpeqd %xmm5,%xmm4 lea $half($out),$out ___ for($i=0; $i<$sz-16; $i+=16) { $code.=<<___; pand %xmm4,$xmm0 movdqu $i+16-$half($inp1),$xmm2 pand %xmm5,$xmm1 movdqu $i+16-$half($inp2),$xmm3 por $xmm1,$xmm0 movdqu $xmm0,$i-$half($out) ___ ($xmm0,$xmm1,$xmm2,$xmm3)=($xmm2,$xmm3,$xmm0,$xmm1); } $code.=<<___; pand %xmm4,$xmm0 pand %xmm5,$xmm1 por $xmm1,$xmm0 movdqu $xmm0,$i-$half($out) ret .size vec_select_$sz,.-vec_select_$sz ___ } vec_select(32); vec_select(48); vec_select(96); vec_select(192); vec_select(144); vec_select(288); } { my ($inp, $end) = $win64 ? ("%rcx", "%rdx") : ("%rdi", "%rsi"); $code.=<<___; .globl vec_prefetch .hidden vec_prefetch .type vec_prefetch,\@abi-omnipotent .align 32 vec_prefetch: leaq -1($inp,$end), $end mov \$64, %rax xor %r8, %r8 #ifdef __SGX_LVI_HARDENING__ lfence #endif prefetchnta ($inp) lea ($inp,%rax), $inp cmp $end, $inp cmova $end, $inp cmova %r8, %rax prefetchnta ($inp) lea ($inp,%rax), $inp cmp $end, $inp cmova $end, $inp cmova %r8, %rax prefetchnta ($inp) lea ($inp,%rax), $inp cmp $end, $inp cmova $end, $inp cmova %r8, %rax prefetchnta ($inp) lea ($inp,%rax), $inp cmp $end, $inp cmova $end, $inp cmova %r8, %rax prefetchnta ($inp) lea ($inp,%rax), $inp cmp $end, $inp cmova $end, $inp cmova %r8, %rax prefetchnta ($inp) lea ($inp,%rax), $inp cmp $end, $inp cmova $end, $inp prefetchnta ($inp) ret .size vec_prefetch,.-vec_prefetch ___ my $len = $win64 ? "%edx" : "%esi"; $code.=<<___; .globl vec_is_zero_16x .hidden vec_is_zero_16x .type vec_is_zero_16x,\@abi-omnipotent .align 32 vec_is_zero_16x: shr \$4, $len #ifdef __SGX_LVI_HARDENING__ lfence #endif movdqu ($inp), %xmm0 lea 16($inp), $inp .Loop_is_zero: dec $len jz .Loop_is_zero_done movdqu ($inp), %xmm1 lea 16($inp), $inp por %xmm1, %xmm0 jmp .Loop_is_zero .Loop_is_zero_done: pshufd \$0x4e, %xmm0, %xmm1 por %xmm1, %xmm0 movq %xmm0, %rax inc $len # now it's 1 test %rax, %rax cmovnz $len, %eax xor \$1, %eax ret .size vec_is_zero_16x,.-vec_is_zero_16x ___ } { my ($inp1, $inp2, $len) = $win64 ? 
("%rcx", "%rdx", "%r8d") : ("%rdi", "%rsi", "%edx"); $code.=<<___; .globl vec_is_equal_16x .hidden vec_is_equal_16x .type vec_is_equal_16x,\@abi-omnipotent .align 32 vec_is_equal_16x: shr \$4, $len #ifdef __SGX_LVI_HARDENING__ lfence #endif movdqu ($inp1), %xmm0 movdqu ($inp2), %xmm1 sub $inp1, $inp2 lea 16($inp1), $inp1 pxor %xmm1, %xmm0 .Loop_is_equal: dec $len jz .Loop_is_equal_done movdqu ($inp1), %xmm1 movdqu ($inp1,$inp2), %xmm2 lea 16($inp1), $inp1 pxor %xmm2, %xmm1 por %xmm1, %xmm0 jmp .Loop_is_equal .Loop_is_equal_done: pshufd \$0x4e, %xmm0, %xmm1 por %xmm1, %xmm0 movq %xmm0, %rax inc $len # now it's 1 test %rax, %rax cmovnz $len, %eax xor \$1, %eax ret .size vec_is_equal_16x,.-vec_is_equal_16x ___ } print $code; close STDOUT; ================================================ FILE: src/asm/add_mod_384x384-x86_64.pl ================================================ #!/usr/bin/env perl # # Copyright Supranational LLC # Licensed under the Apache License, Version 2.0, see LICENSE for details. # SPDX-License-Identifier: Apache-2.0 $flavour = shift; $output = shift; if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or die "can't locate x86_64-xlate.pl"; open STDOUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"" or die "can't call $xlate: $!"; # common argument layout ($r_ptr,$a_ptr,$b_org,$n_ptr,$n0) = ("%rdi","%rsi","%rdx","%rcx","%r8"); $b_ptr = "%rbx"; # common accumulator layout @acc=map("%r$_",(8..15)); ############################################################ 384x384 add/sub # Double-width addition/subtraction modulo n<<384, as opposite to # naively expected modulo n*n. It works because n<<384 is the actual # input boundary condition for Montgomery reduction, not n*n. # Just in case, this is duplicated, but only one module is # supposed to be linked... 
{ my @acc=(@acc,"%rax","%rbx","%rbp",$a_ptr); # all registers are affected # except for $n_ptr and $r_ptr $code.=<<___; .text .globl add_mod_384x384 .hidden add_mod_384x384 .type add_mod_384x384,\@function,4,"unwind" .align 32 add_mod_384x384: .cfi_startproc push %rbp .cfi_push %rbp push %rbx .cfi_push %rbx push %r12 .cfi_push %r12 push %r13 .cfi_push %r13 push %r14 .cfi_push %r14 push %r15 .cfi_push %r15 sub \$8, %rsp .cfi_adjust_cfa_offset 8 .cfi_end_prologue #ifdef __SGX_LVI_HARDENING__ lfence #endif mov 8*0($a_ptr), @acc[0] mov 8*1($a_ptr), @acc[1] mov 8*2($a_ptr), @acc[2] mov 8*3($a_ptr), @acc[3] mov 8*4($a_ptr), @acc[4] mov 8*5($a_ptr), @acc[5] mov 8*6($a_ptr), @acc[6] add 8*0($b_org), @acc[0] mov 8*7($a_ptr), @acc[7] adc 8*1($b_org), @acc[1] mov 8*8($a_ptr), @acc[8] adc 8*2($b_org), @acc[2] mov 8*9($a_ptr), @acc[9] adc 8*3($b_org), @acc[3] mov 8*10($a_ptr), @acc[10] adc 8*4($b_org), @acc[4] mov 8*11($a_ptr), @acc[11] adc 8*5($b_org), @acc[5] mov @acc[0], 8*0($r_ptr) adc 8*6($b_org), @acc[6] mov @acc[1], 8*1($r_ptr) adc 8*7($b_org), @acc[7] mov @acc[2], 8*2($r_ptr) adc 8*8($b_org), @acc[8] mov @acc[4], 8*4($r_ptr) mov @acc[6], @acc[0] adc 8*9($b_org), @acc[9] mov @acc[3], 8*3($r_ptr) mov @acc[7], @acc[1] adc 8*10($b_org), @acc[10] mov @acc[5], 8*5($r_ptr) mov @acc[8], @acc[2] adc 8*11($b_org), @acc[11] mov @acc[9], @acc[3] sbb $b_org, $b_org sub 8*0($n_ptr), @acc[6] sbb 8*1($n_ptr), @acc[7] mov @acc[10], @acc[4] sbb 8*2($n_ptr), @acc[8] sbb 8*3($n_ptr), @acc[9] sbb 8*4($n_ptr), @acc[10] mov @acc[11], @acc[5] sbb 8*5($n_ptr), @acc[11] sbb \$0, $b_org cmovc @acc[0], @acc[6] cmovc @acc[1], @acc[7] cmovc @acc[2], @acc[8] mov @acc[6], 8*6($r_ptr) cmovc @acc[3], @acc[9] mov @acc[7], 8*7($r_ptr) cmovc @acc[4], @acc[10] mov @acc[8], 8*8($r_ptr) cmovc @acc[5], @acc[11] mov @acc[9], 8*9($r_ptr) mov @acc[10], 8*10($r_ptr) mov @acc[11], 8*11($r_ptr) mov 8(%rsp),%r15 .cfi_restore %r15 mov 16(%rsp),%r14 .cfi_restore %r14 mov 24(%rsp),%r13 .cfi_restore %r13 mov 32(%rsp),%r12 .cfi_restore %r12 mov 40(%rsp),%rbx .cfi_restore %rbx mov 48(%rsp),%rbp .cfi_restore %rbp lea 56(%rsp),%rsp .cfi_adjust_cfa_offset -56 .cfi_epilogue ret .cfi_endproc .size add_mod_384x384,.-add_mod_384x384 .globl sub_mod_384x384 .hidden sub_mod_384x384 .type sub_mod_384x384,\@function,4,"unwind" .align 32 sub_mod_384x384: .cfi_startproc push %rbp .cfi_push %rbp push %rbx .cfi_push %rbx push %r12 .cfi_push %r12 push %r13 .cfi_push %r13 push %r14 .cfi_push %r14 push %r15 .cfi_push %r15 sub \$8, %rsp .cfi_adjust_cfa_offset 8 .cfi_end_prologue #ifdef __SGX_LVI_HARDENING__ lfence #endif mov 8*0($a_ptr), @acc[0] mov 8*1($a_ptr), @acc[1] mov 8*2($a_ptr), @acc[2] mov 8*3($a_ptr), @acc[3] mov 8*4($a_ptr), @acc[4] mov 8*5($a_ptr), @acc[5] mov 8*6($a_ptr), @acc[6] sub 8*0($b_org), @acc[0] mov 8*7($a_ptr), @acc[7] sbb 8*1($b_org), @acc[1] mov 8*8($a_ptr), @acc[8] sbb 8*2($b_org), @acc[2] mov 8*9($a_ptr), @acc[9] sbb 8*3($b_org), @acc[3] mov 8*10($a_ptr), @acc[10] sbb 8*4($b_org), @acc[4] mov 8*11($a_ptr), @acc[11] sbb 8*5($b_org), @acc[5] mov @acc[0], 8*0($r_ptr) sbb 8*6($b_org), @acc[6] mov 8*0($n_ptr), @acc[0] mov @acc[1], 8*1($r_ptr) sbb 8*7($b_org), @acc[7] mov 8*1($n_ptr), @acc[1] mov @acc[2], 8*2($r_ptr) sbb 8*8($b_org), @acc[8] mov 8*2($n_ptr), @acc[2] mov @acc[3], 8*3($r_ptr) sbb 8*9($b_org), @acc[9] mov 8*3($n_ptr), @acc[3] mov @acc[4], 8*4($r_ptr) sbb 8*10($b_org), @acc[10] mov 8*4($n_ptr), @acc[4] mov @acc[5], 8*5($r_ptr) sbb 8*11($b_org), @acc[11] mov 8*5($n_ptr), @acc[5] sbb $b_org, $b_org and $b_org, @acc[0] and $b_org, 
@acc[1] and $b_org, @acc[2] and $b_org, @acc[3] and $b_org, @acc[4] and $b_org, @acc[5] add @acc[0], @acc[6] adc @acc[1], @acc[7] mov @acc[6], 8*6($r_ptr) adc @acc[2], @acc[8] mov @acc[7], 8*7($r_ptr) adc @acc[3], @acc[9] mov @acc[8], 8*8($r_ptr) adc @acc[4], @acc[10] mov @acc[9], 8*9($r_ptr) adc @acc[5], @acc[11] mov @acc[10], 8*10($r_ptr) mov @acc[11], 8*11($r_ptr) mov 8(%rsp),%r15 .cfi_restore %r15 mov 16(%rsp),%r14 .cfi_restore %r14 mov 24(%rsp),%r13 .cfi_restore %r13 mov 32(%rsp),%r12 .cfi_restore %r12 mov 40(%rsp),%rbx .cfi_restore %rbx mov 48(%rsp),%rbp .cfi_restore %rbp lea 56(%rsp),%rsp .cfi_adjust_cfa_offset -56 .cfi_epilogue ret .cfi_endproc .size sub_mod_384x384,.-sub_mod_384x384 ___ } print $code; close STDOUT; ================================================ FILE: src/asm/arm-xlate.pl ================================================ #!/usr/bin/env perl # # Copyright Supranational LLC # Licensed under the Apache License, Version 2.0, see LICENSE for details. # SPDX-License-Identifier: Apache-2.0 # # ARM assembler distiller/adapter by \@dot-asm. use strict; ################################################################ # Recognized "flavour"-s are: # # linux[32|64] GNU assembler, effectively pass-through # ios[32|64] global symbols' decorations, PIC tweaks, etc. # win[32|64] Visual Studio armasm-specific directives # coff[32|64] e.g. clang --target=arm-windows ... # cheri64 L64P128 platform # my $flavour = shift; $flavour = "linux" if (!$flavour or $flavour eq "void"); my $output = shift; open STDOUT,">$output" or die "can't open $output: $!"; my %GLOBALS; my $dotinlocallabels = ($flavour !~ /ios/) ? 1 : 0; my $in_proc; # used with 'windows' flavour ################################################################ # directives which need special treatment on different platforms ################################################################ my $arch = sub { } if ($flavour !~ /linux|coff64/);# omit .arch my $fpu = sub { } if ($flavour !~ /linux/); # omit .fpu my $rodata = sub { SWITCH: for ($flavour) { /linux|cheri/ && return ".section\t.rodata"; /ios/ && return ".section\t__TEXT,__const"; /coff/ && return ".section\t.rdata,\"dr\""; /win/ && return "\tAREA\t|.rdata|,DATA,READONLY,ALIGN=8"; last; } }; my $hidden = sub { if ($flavour =~ /ios/) { ".private_extern\t".join(',',@_); } } if ($flavour !~ /linux|cheri/); my $comm = sub { my @args = split(/,\s*/,shift); my $name = @args[0]; my $global = \$GLOBALS{$name}; my $ret; if ($flavour =~ /ios32/) { $ret = ".comm\t_$name,@args[1]\n"; $ret .= ".non_lazy_symbol_pointer\n"; $ret .= "$name:\n"; $ret .= ".indirect_symbol\t_$name\n"; $ret .= ".long\t0\n"; $ret .= ".previous"; $name = "_$name"; } elsif ($flavour =~ /ios64/) { $name = "_$name"; $ret = ".comm\t$name,@args[1]"; } elsif ($flavour =~ /win/) { $ret = "\tCOMMON\t|$name|,@args[1]"; } elsif ($flavour =~ /coff/) { $ret = ".comm\t$name,@args[1]"; } else { $ret = ".comm\t".join(',',@args); } $$global = $name; $ret; }; my $globl = sub { my $name = shift; my $global = \$GLOBALS{$name}; my $ret; SWITCH: for ($flavour) { /ios/ && do { $name = "_$name"; last; }; /win/ && do { $ret = ""; last; }; } $ret = ".globl $name" if (!defined($ret)); $$global = $name; $ret; }; my $global = $globl; my $extern = sub { &$globl(@_); if ($flavour =~ /win/) { return "\tEXTERN\t@_"; } return; # return nothing }; my $type = sub { my $arg = join(',',@_); my $ret; SWITCH: for ($flavour) { /ios32/ && do { if ($arg =~ /(\w+),\s*%function/) { $ret = "#ifdef __thumb2__\n" . ".thumb_func $1\n" .
"#endif"; } last; }; /win/ && do { if ($arg =~ /(\w+),\s*%(function|object)/) { my $type = "[DATA]"; if ($2 eq "function") { $in_proc = $1; $type = "[FUNC]"; } $ret = $GLOBALS{$1} ? "\tEXPORT\t|$1|$type" : ""; } last; }; /coff/ && do { if ($arg =~ /(\w+),\s*%function/) { $ret = ".def $1;\n". ".type 32;\n". ".endef"; } last; }; } return $ret; } if ($flavour !~ /linux|cheri/); my $size = sub { if ($in_proc && $flavour =~ /win/) { $in_proc = undef; return "\tENDP"; } } if ($flavour !~ /linux|cheri/); my $inst = sub { if ($flavour =~ /win/) { "\tDCDU\t".join(',',@_); } else { ".long\t".join(',',@_); } } if ($flavour !~ /linux|cheri/); my $asciz = sub { my $line = join(",",@_); if ($line =~ /^"(.*)"$/) { if ($flavour =~ /win/) { "\tDCB\t$line,0\n\tALIGN\t4"; } else { ".byte " . join(",",unpack("C*",$1),0) . "\n.align 2"; } } else { ""; } }; my $align = sub { "\tALIGN\t".2**@_[0]; } if ($flavour =~ /win/); $align = sub { ".p2align\t".@_[0]; } if ($flavour =~ /coff/); my $byte = sub { "\tDCB\t".join(',',@_); } if ($flavour =~ /win/); my $short = sub { "\tDCWU\t".join(',',@_); } if ($flavour =~ /win/); my $word = sub { "\tDCDU\t".join(',',@_); } if ($flavour =~ /win/); my $long = $word if ($flavour =~ /win/); my $quad = sub { "\tDCQU\t".join(',',@_); } if ($flavour =~ /win/); my $skip = sub { "\tSPACE\t".shift; } if ($flavour =~ /win/); my $code = sub { "\tCODE@_[0]"; } if ($flavour =~ /win/); my $thumb = sub { # .thumb should appear prior .text in source "# define ARM THUMB\n" . "\tTHUMB"; } if ($flavour =~ /win/); my $text = sub { "\tAREA\t|.text|,CODE,ALIGN=8,".($flavour =~ /64/ ? "ARM64" : "ARM"); } if ($flavour =~ /win/); my $syntax = sub {} if ($flavour =~ /win/); # omit .syntax my $rva = sub { # .rva directive comes in handy only on 32-bit Windows, i.e. it can # be used only in '#if defined(_WIN32) && !defined(_WIN64)' sections. # However! Corresponding compilers don't seem to bet on PIC, which # raises the question why would assembler programmer have to jump # through the hoops? But just in case, it would go as following: # # ldr r1,.LOPENSSL_armcap # ldr r2,.LOPENSSL_armcap+4 # adr r0,.LOPENSSL_armcap # bic r1,r1,#1 ; de-thumb-ify link.exe's ideas # sub r0,r0,r1 ; r0 is image base now # ldr r0,[r0,r2] # ... #.LOPENSSL_armcap: # .rva .LOPENSSL_armcap ; self-reference # .rva OPENSSL_armcap_P ; real target # # Non-position-independent [and ISA-neutral] alternative is so much # simpler: # # ldr r0,.LOPENSSL_armcap # ldr r0,[r0] # ... #.LOPENSSL_armcap: # .long OPENSSL_armcap_P # "\tDCDU\t@_[0]\n\tRELOC\t2" } if ($flavour =~ /win(?!64)/); ################################################################ # some broken instructions in Visual Studio armasm[64]... my $it = sub {} if ($flavour =~ /win32/); # omit 'it' my $ext = sub { "\text8\t".join(',',@_); } if ($flavour =~ /win64/); my $csel = sub { my ($args,$comment) = split(m|\s*//|,shift); my @regs = split(m|,\s*|,$args); my $cond = pop(@regs); "\tcsel$cond\t".join(',',@regs); } if ($flavour =~ /win64/); my $csetm = sub { my ($args,$comment) = split(m|\s*//|,shift); my @regs = split(m|,\s*|,$args); my $cond = pop(@regs); "\tcsetm$cond\t".join(',',@regs); } if ($flavour =~ /win64/); # ... then conditional branch instructions are also broken, but # maintaining all the variants is tedious, so I kludge-fix it # elsewhere... 
################################################################ # CHERI-specific synthetic instructions my $alignd = sub { my ($args,$comment) = split(m|\s*//|,shift); $args =~ s/\b(?:x([0-9]+)|(sp))\b/c$1$2/g; my @regs = split(m|,\s*|,$args); "\talignd\t".join(',',@regs); }; my $scvalue = sub { my ($args,$comment) = split(m|\s*//|,shift); $args =~ s/\b(?:x([0-9]+)|(sp))\b/c$1$2/g; my @regs = split(m|,\s*|,$args); @regs[2] =~ s/\bc([0-9])\b/x$1/; "\tscvalue\t".join(',',@regs); }; my $scbnds = sub { my ($args,$comment) = split(m|\s*//|,shift); $args =~ s/\b(?:x([0-9]+)|(sp))\b/c$1$2/g; my @regs = split(m|,\s*|,$args); @regs[2] =~ s/\bc([0-9])\b/x$1/; "\tscbnds\t".join(',',@regs); }; my $cadd = sub { my ($args,$comment) = split(m|\s*//|,shift); if ($flavour =~ /cheri/) { $args =~ s/\b(?:x([0-9]+)|(sp))\b/c$1$2/g; } else { $args =~ s/\bc([0-9]+)\b/x$1/g; } my @regs = split(m|,\s*|,$args); @regs[2] =~ s/c([0-9])/x$1/; "\tadd\t".join(',',@regs); }; my $csub = sub { my ($args,$comment) = split(m|\s*//|,shift); if ($flavour =~ /cheri/) { $args =~ s/\b(?:x([0-9]+)|(sp))\b/c$1$2/g; } else { $args =~ s/\bc([0-9]+)\b/x$1/g; } my @regs = split(m|,\s*|,$args); @regs[2] =~ s/c([0-9])/x$1/; "\tsub\t".join(',',@regs); }; my $cmov = sub { my $args = shift; if ($flavour =~ /cheri/) { $args =~ s/\b(?:x([0-9]+)|(sp))\b/c$1$2/g; } else { $args =~ s/\bc([0-9]+)\b/x$1/g; } "\tmov\t".$args; }; my $adr = sub { my $args = shift; $args =~ s/\bx([0-9]+)\b/c$1/g; "\tadr\t".$args; } if ($flavour =~ /cheri/); ################################################################ my $adrp = sub { my ($args,$comment) = split(m|\s*//|,shift); "\tadrp\t$args\@PAGE"; } if ($flavour =~ /ios64/); my $paciasp = sub { ($flavour =~ /linux|cheri/) ? "\thint\t#PACI_HINT" : "\thint\t#25"; }; my $autiasp = sub { ($flavour =~ /linux|cheri/) ? 
"\thint\t#AUTI_HINT" : "\thint\t#29"; }; sub range { my ($r,$sfx,$start,$end) = @_; join(",",map("$r$_$sfx",($start..$end))); } sub expand_line { my $line = shift; my @ret = (); pos($line)=0; while ($line =~ m/\G[^@\/\{\"]*/g) { if ($line =~ m/\G(@|\/\/|$)/gc) { last; } elsif ($line =~ m/\G\{/gc) { my $saved_pos = pos($line); $line =~ s/\G([rdqv])([0-9]+)([^\-]*)\-\1([0-9]+)\3/range($1,$3,$2,$4)/e; pos($line) = $saved_pos; $line =~ m/\G[^\}]*\}/g; } elsif ($line =~ m/\G\"/gc) { $line =~ m/\G[^\"]*\"/g; } } $line =~ s/\b(\w+)/$GLOBALS{$1} or $1/ge; if ($flavour =~ /cheri/) { $line =~ s/\[\s*(?:x([0-9]+)|(sp))\s*(,?.*)\]/[c$1$2$3]/; } else { $line =~ s/\bc([0-9]+)\b/x$1/g; $line =~ s/\bcsp\b/sp/g; } if ($flavour =~ /win/) { # adjust alignment hints, "[rN,:32]" -> "[rN@32]" $line =~ s/(\[\s*(?:r[0-9]+|sp))\s*,?\s*:([0-9]+\s*\])/$1\@$2/; # adjust local labels, ".Lwhatever" -> "|$Lwhatever|" $line =~ s/\.(L\w{2,})/|\$$1|/g; # omit "#:lo12:" on win64 $line =~ s/#:lo12://; } elsif ($flavour =~ /coff(?!64)/) { $line =~ s/\.L(\w{2,})/(\$ML$1)/g; } elsif ($flavour =~ /ios64/) { $line =~ s/#:lo12:(\w+)/$1\@PAGEOFF/; } if ($flavour =~ /64/) { # "vX.Md[N]" -> "vX.d[N] $line =~ s/\b(v[0-9]+)\.[1-9]+([bhsd]\[[0-9]+\])/$1.$2/; } return $line; } if ($flavour =~ /win(32|64)/) { print<<___; GBLA __SIZEOF_POINTER__ __SIZEOF_POINTER__ SETA $1/8 ___ } elsif ($flavour =~ /linux|cheri/) { print<<___; #if defined(__ARM_FEATURE_PAC_DEFAULT) && __ARM_FEATURE_PAC_DEFAULT==2 # define PACI_HINT 27 # define AUTI_HINT 31 #else # define PACI_HINT 25 # define AUTI_HINT 29 #endif ___ } while(my $line=<>) { if ($flavour =~ /win/) { if ($line =~ m/^#\s*(ifdef|ifndef|else|endif)\b(.*)/) { my ($op, $arg) = ($1, $2); $op = "if :def:" if ($op eq "ifdef"); $op = "if :lnot::def:" if ($op eq "ifndef"); print " ".$op.$arg."\n"; next; } $line =~ s|//.*||; } # fix up assembler-specific commentary delimiter $line =~ s/@(?=[\s@])/\;/g if ($flavour =~ /win|coff/); if ($line =~ m/^\s*(#|@|;|\/\/)/) { print $line; next; } $line =~ s|/\*.*\*/||; # get rid of C-style comments... $line =~ s|^\s+||; # ... and skip white spaces in beginning... $line =~ s|\s+$||; # ... and at the end { $line =~ s|[\b\.]L(\w{2,})|L$1|g; # common denominator for Locallabel $line =~ s|\bL(\w{2,})|\.L$1|g if ($dotinlocallabels); } { $line =~ s|(^[\.\w]+)\:\s*||; my $label = $1; if ($label) { $label = ($GLOBALS{$label} or $label); if ($flavour =~ /win/) { $label =~ s|^\.L(?=\w)|\$L|; printf "|%s|%s", $label, ($label eq $in_proc ? 
" PROC" : ""); } else { $label =~ s|^\.L(?=\w)|\$ML| if ($flavour =~ /coff(?!64)/); printf "%s:", $label; } } } if ($line !~ m/^[#@;]/) { $line =~ s|^\s*(\.?)(\S+)\s*||; my $c = $1; $c = "\t" if ($c eq ""); my $mnemonic = $2; my $opcode; if ($mnemonic =~ m/([^\.]+)\.([^\.]+)/) { $opcode = eval("\$$1_$2"); } else { $opcode = eval("\$$mnemonic"); } my $arg=expand_line($line); if (ref($opcode) eq 'CODE') { $line = &$opcode($arg); } elsif ($mnemonic) { if ($flavour =~ /win64/) { # "b.cond" -> "bcond", kludge-fix:-( $mnemonic =~ s/^b\.([a-z]{2}$)/b$1/; } $line = $c.$mnemonic; $line.= "\t$arg" if ($arg ne ""); } } print $line if ($line); print "\n"; } if ($flavour =~ /win/) { print "\tEND\n"; } elsif ($flavour =~ /linux|cheri/) { # -mbranch-protection=standanrd segment, snatched from compiler -S output print <<___; #if defined(__ARM_FEATURE_BTI_DEFAULT) || defined(__ARM_FEATURE_PAC_DEFAULT) .section .note.GNU-stack,"",\@progbits .section .note.gnu.property,"a",\@note .long 4,2f-1f,5 .byte 0x47,0x4E,0x55,0 1: .long 0xc0000000,4,3 .align 3 2: #endif ___ } close STDOUT; ================================================ FILE: src/asm/ct_inverse_mod_256-armv8.pl ================================================ #!/usr/bin/env perl # # Copyright Supranational LLC # Licensed under the Apache License, Version 2.0, see LICENSE for details. # SPDX-License-Identifier: Apache-2.0 # # Both constant-time and fast Euclidean inversion as suggested in # https://eprint.iacr.org/2020/972. ~4.600 cycles on Apple M1, ~8.900 - # on Cortex-A57. # # void ct_inverse_mod_256(vec512 ret, const vec256 inp, const vec256 mod, # const vec256 modx); # $python_ref.=<<'___'; def ct_inverse_mod_256(inp, mod): a, u = inp, 1 b, v = mod, 0 k = 31 mask = (1 << k) - 1 for i in range(0, 512 // k - 1): # __ab_approximation_31 n = max(a.bit_length(), b.bit_length()) if n < 64: a_, b_ = a, b else: a_ = (a & mask) | ((a >> (n-k-2)) << k) b_ = (b & mask) | ((b >> (n-k-2)) << k) # __inner_loop_31 f0, g0, f1, g1 = 1, 0, 0, 1 for j in range(0, k): if a_ & 1: if a_ < b_: a_, b_, f0, g0, f1, g1 = b_, a_, f1, g1, f0, g0 a_, f0, g0 = a_-b_, f0-f1, g0-g1 a_, f1, g1 = a_ >> 1, f1 << 1, g1 << 1 # __smul_256_n_shift_by_31 a, b = (a*f0 + b*g0) >> k, (a*f1 + b*g1) >> k if a < 0: a, f0, g0 = -a, -f0, -g0 if b < 0: b, f1, g1 = -b, -f1, -g1 # __smul_512x63 u, v = u*f0 + v*g0, u*f1 + v*g1 if 512 % k + k: f0, g0, f1, g1 = 1, 0, 0, 1 for j in range(0, 512 % k + k): if a & 1: if a < b: a, b, f0, g0, f1, g1 = b, a, f1, g1, f0, g0 a, f0, g0 = a-b, f0-f1, g0-g1 a, f1, g1 = a >> 1, f1 << 1, g1 << 1 v = u*f1 + v*g1 mod <<= 512 - mod.bit_length() # align to the left if v < 0: v += mod if v < 0: v += mod elif v == 1<<512: v -= mod return v & (2**512 - 1) # to be reduced % mod ___ $flavour = shift; $output = shift; if ($flavour && $flavour ne "void") { $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or die "can't locate arm-xlate.pl"; open STDOUT,"| \"$^X\" $xlate $flavour $output"; } else { open STDOUT,">$output"; } my ($out_ptr, $in_ptr, $n_ptr, $nx_ptr) = map("x$_", (0..3)); my @acc=map("x$_",(4..11)); my ($f0, $g0, $f1, $g1, $f_, $g_) = map("x$_",(12..17)); my $cnt = $n_ptr; my @t = map("x$_",(19..26)); my ($a_lo, $b_lo) = @acc[3,7]; $frame = 16+2*512; $code.=<<___; .text .globl ct_inverse_mod_256 .hidden ct_inverse_mod_256 .type ct_inverse_mod_256, %function .align 5 ct_inverse_mod_256: paciasp stp c29, c30, [csp,#-10*__SIZEOF_POINTER__]! 
add c29, csp, #0 stp c19, c20, [csp,#2*__SIZEOF_POINTER__] stp c21, c22, [csp,#4*__SIZEOF_POINTER__] stp c23, c24, [csp,#6*__SIZEOF_POINTER__] stp c25, c26, [csp,#8*__SIZEOF_POINTER__] sub csp, csp, #$frame ldp @acc[0], @acc[1], [$in_ptr,#8*0] ldp @acc[2], @acc[3], [$in_ptr,#8*2] #ifdef __CHERI_PURE_CAPABILITY__ cadd $in_ptr, csp, #16+511 alignd $in_ptr, $in_ptr, #9 scbnds $in_ptr, $in_ptr, #512 #else add $in_ptr, sp, #16+511 // find closest 512-byte-aligned spot and $in_ptr, $in_ptr, #-512 // in the frame... #endif str c0, [csp] // offload out_ptr ldp @acc[4], @acc[5], [$n_ptr,#8*0] ldp @acc[6], @acc[7], [$n_ptr,#8*2] stp @acc[0], @acc[1], [$in_ptr,#8*0] // copy input to |a| stp @acc[2], @acc[3], [$in_ptr,#8*2] stp @acc[4], @acc[5], [$in_ptr,#8*4] // copy modulus to |b| stp @acc[6], @acc[7], [$in_ptr,#8*6] ////////////////////////////////////////// first iteration bl .Lab_approximation_31_256_loaded eor $out_ptr, $in_ptr, #256 // pointer to dst |a|b|u|v| #ifdef __CHERI_PURE_CAPABILITY__ scvalue $out_ptr, $in_ptr, $out_ptr #endif bl __smul_256_n_shift_by_31 str $f0,[$out_ptr,#8*8] // initialize |u| with |f0| mov $f0, $f1 // |f1| mov $g0, $g1 // |g1| cadd $out_ptr, $out_ptr, #8*4 // pointer to dst |b| bl __smul_256_n_shift_by_31 str $f0, [$out_ptr,#8*10] // initialize |v| with |f1| ////////////////////////////////////////// second iteration eor $in_ptr, $in_ptr, #256 // flip-flop src |a|b|u|v| #ifdef __CHERI_PURE_CAPABILITY__ scvalue $in_ptr, $out_ptr, $in_ptr #endif bl __ab_approximation_31_256 eor $out_ptr, $in_ptr, #256 // pointer to dst |a|b|u|v| #ifdef __CHERI_PURE_CAPABILITY__ scvalue $out_ptr, $in_ptr, $out_ptr #endif bl __smul_256_n_shift_by_31 mov $f_, $f0 // corrected |f0| mov $g_, $g0 // corrected |g0| mov $f0, $f1 // |f1| mov $g0, $g1 // |g1| cadd $out_ptr, $out_ptr, #8*4 // pointer to destination |b| bl __smul_256_n_shift_by_31 ldr @acc[4], [$in_ptr,#8*8] // |u| ldr @acc[5], [$in_ptr,#8*14] // |v| madd @acc[0], $f_, @acc[4], xzr // |u|*|f0| madd @acc[0], $g_, @acc[5], @acc[0] // |v|*|g0| asr @acc[1], @acc[0], #63 // sign extension stp @acc[0], @acc[1], [$out_ptr,#8*4] stp @acc[1], @acc[1], [$out_ptr,#8*6] madd @acc[0], $f0, @acc[4], xzr // |u|*|f1| madd @acc[0], $g0, @acc[5], @acc[0] // |v|*|g1| asr @acc[1], @acc[0], #63 // sign extension stp @acc[0], @acc[1], [$out_ptr,#8*10] stp @acc[1], @acc[1], [$out_ptr,#8*12] ___ for($i=2; $i<15; $i++) { $code.=<<___; eor $in_ptr, $in_ptr, #256 // flip-flop src |a|b|u|v| #ifdef __CHERI_PURE_CAPABILITY__ scvalue $in_ptr, $out_ptr, $in_ptr #endif bl __ab_approximation_31_256 eor $out_ptr, $in_ptr, #256 // pointer to dst |a|b|u|v| #ifdef __CHERI_PURE_CAPABILITY__ scvalue $out_ptr, $in_ptr, $out_ptr #endif bl __smul_256_n_shift_by_31 mov $f_, $f0 // corrected |f0| mov $g_, $g0 // corrected |g0| mov $f0, $f1 // |f1| mov $g0, $g1 // |g1| cadd $out_ptr, $out_ptr, #8*4 // pointer to destination |b| bl __smul_256_n_shift_by_31 cadd $out_ptr, $out_ptr, #8*4 // pointer to destination |u| bl __smul_256x63 ___ $code.=<<___ if ($i==7); asr @t[5], @t[5], #63 str @t[5], [$out_ptr,#8*4] ___ $code.=<<___ if ($i>7); adc @t[3], @t[3], @t[4] str @t[3], [$out_ptr,#8*4] ___ $code.=<<___; mov $f_, $f0 // corrected |f1| mov $g_, $g0 // corrected |g1| cadd $out_ptr, $out_ptr, #8*6 // pointer to destination |v| bl __smul_256x63 ___ $code.=<<___ if ($i>7); bl __smul_512x63_tail ___ $code.=<<___ if ($i==7); asr @t[5], @t[5], #63 // sign extension stp @t[5], @t[5], [$out_ptr,#8*4] stp @t[5], @t[5], [$out_ptr,#8*6] ___ } $code.=<<___; 
////////////////////////////////////////// two[!] last iterations eor $in_ptr, $in_ptr, #256 // flip-flop src |a|b|u|v| #ifdef __CHERI_PURE_CAPABILITY__ scvalue $in_ptr, $out_ptr, $in_ptr #endif mov $cnt, #47 // 31 + 512 % 31 //bl __ab_approximation_62_256 // |a| and |b| are exact, ldr $a_lo, [$in_ptr,#8*0] // just load ldr $b_lo, [$in_ptr,#8*4] bl __inner_loop_62_256 mov $f_, $f1 mov $g_, $g1 ldr c0, [csp] // original out_ptr bl __smul_256x63 bl __smul_512x63_tail ldr c30, [c29,#__SIZEOF_POINTER__] smulh @t[1], @acc[3], $g_ // figure out top-most limb ldp @acc[4], @acc[5], [$nx_ptr,#8*0] adc @t[4], @t[4], @t[6] ldp @acc[6], @acc[7], [$nx_ptr,#8*2] add @t[1], @t[1], @t[4] // @t[1] is 1, 0 or -1 asr @t[0], @t[1], #63 // sign as mask and @t[4], @acc[4], @t[0] // add mod<<256 conditionally and @t[5], @acc[5], @t[0] adds @acc[0], @acc[0], @t[4] and @t[6], @acc[6], @t[0] adcs @acc[1], @acc[1], @t[5] and @t[7], @acc[7], @t[0] adcs @acc[2], @acc[2], @t[6] adcs @acc[3], @t[3], @t[7] adc @t[1], @t[1], xzr // @t[1] is 1, 0 or -1 neg @t[0], @t[1] orr @t[1], @t[1], @t[0] // excess bit or sign as mask asr @t[0], @t[0], #63 // excess bit as mask and @acc[4], @acc[4], @t[1] // mask |mod| and @acc[5], @acc[5], @t[1] and @acc[6], @acc[6], @t[1] and @acc[7], @acc[7], @t[1] eor @acc[4], @acc[4], @t[0] // conditionally negate |mod| eor @acc[5], @acc[5], @t[0] adds @acc[4], @acc[4], @t[0], lsr#63 eor @acc[6], @acc[6], @t[0] adcs @acc[5], @acc[5], xzr eor @acc[7], @acc[7], @t[0] adcs @acc[6], @acc[6], xzr adc @acc[7], @acc[7], xzr adds @acc[0], @acc[0], @acc[4] // final adjustment for |mod|<<256 adcs @acc[1], @acc[1], @acc[5] adcs @acc[2], @acc[2], @acc[6] stp @acc[0], @acc[1], [$out_ptr,#8*4] adc @acc[3], @acc[3], @acc[7] stp @acc[2], @acc[3], [$out_ptr,#8*6] add csp, csp, #$frame ldp c19, c20, [c29,#2*__SIZEOF_POINTER__] ldp c21, c22, [c29,#4*__SIZEOF_POINTER__] ldp c23, c24, [c29,#6*__SIZEOF_POINTER__] ldp c25, c26, [c29,#8*__SIZEOF_POINTER__] ldr c29, [csp],#10*__SIZEOF_POINTER__ autiasp ret .size ct_inverse_mod_256,.-ct_inverse_mod_256 //////////////////////////////////////////////////////////////////////// .type __smul_256x63, %function .align 5 __smul_256x63: ___ for($j=0; $j<2; $j++) { my $f_ = $f_; $f_ = $g_ if ($j); my @acc = @acc; @acc = @acc[4..7] if ($j); my $k = 8*8+8*6*$j; $code.=<<___; ldp @acc[0], @acc[1], [$in_ptr,#8*0+$k] // load |u| (or |v|) asr $f1, $f_, #63 // |f_|'s sign as mask (or |g_|'s) ldp @acc[2], @acc[3], [$in_ptr,#8*2+$k] eor $f_, $f_, $f1 // conditionally negate |f_| (or |g_|) ldr @t[3+$j], [$in_ptr,#8*4+$k] eor @acc[0], @acc[0], $f1 // conditionally negate |u| (or |v|) sub $f_, $f_, $f1 eor @acc[1], @acc[1], $f1 adds @acc[0], @acc[0], $f1, lsr#63 eor @acc[2], @acc[2], $f1 adcs @acc[1], @acc[1], xzr eor @acc[3], @acc[3], $f1 adcs @acc[2], @acc[2], xzr eor @t[3+$j], @t[3+$j], $f1 umulh @t[0], @acc[0], $f_ adcs @acc[3], @acc[3], xzr umulh @t[1], @acc[1], $f_ adcs @t[3+$j], @t[3+$j], xzr umulh @t[2], @acc[2], $f_ ___ $code.=<<___ if ($j!=0); adc $g1, xzr, xzr // used in __smul_512x63_tail ___ $code.=<<___; mul @acc[0], @acc[0], $f_ cmp $f_, #0 mul @acc[1], @acc[1], $f_ csel @t[3+$j], @t[3+$j], xzr, ne mul @acc[2], @acc[2], $f_ adds @acc[1], @acc[1], @t[0] mul @t[5+$j], @acc[3], $f_ adcs @acc[2], @acc[2], @t[1] adcs @t[5+$j], @t[5+$j], @t[2] ___ $code.=<<___ if ($j==0); adc @t[7], xzr, xzr ___ } $code.=<<___; adc @t[7], @t[7], xzr adds @acc[0], @acc[0], @acc[4] adcs @acc[1], @acc[1], @acc[5] adcs @acc[2], @acc[2], @acc[6] stp @acc[0], @acc[1], [$out_ptr,#8*0] adcs @t[5], @t[5], @t[6] 
stp @acc[2], @t[5], [$out_ptr,#8*2] ret .size __smul_256x63,.-__smul_256x63 .type __smul_512x63_tail, %function .align 5 __smul_512x63_tail: umulh @t[5], @acc[3], $f_ ldr @acc[1], [$in_ptr,#8*19] // load rest of |v| adc @t[7], @t[7], xzr ldp @acc[2], @acc[3], [$in_ptr,#8*20] and @t[3], @t[3], $f_ umulh @acc[7], @acc[7], $g_ // resume |v|*|g1| chain sub @t[5], @t[5], @t[3] // tie up |u|*|f1| chain asr @t[6], @t[5], #63 eor @acc[1], @acc[1], $f1 // conditionally negate rest of |v| eor @acc[2], @acc[2], $f1 adds @acc[1], @acc[1], $g1 eor @acc[3], @acc[3], $f1 adcs @acc[2], @acc[2], xzr umulh @t[0], @t[4], $g_ adc @acc[3], @acc[3], xzr umulh @t[1], @acc[1], $g_ add @acc[7], @acc[7], @t[7] umulh @t[2], @acc[2], $g_ mul @acc[0], @t[4], $g_ mul @acc[1], @acc[1], $g_ adds @acc[0], @acc[0], @acc[7] mul @acc[2], @acc[2], $g_ adcs @acc[1], @acc[1], @t[0] mul @t[3], @acc[3], $g_ adcs @acc[2], @acc[2], @t[1] adcs @t[3], @t[3], @t[2] adc @t[4], xzr, xzr // used in the final step adds @acc[0], @acc[0], @t[5] adcs @acc[1], @acc[1], @t[6] adcs @acc[2], @acc[2], @t[6] stp @acc[0], @acc[1], [$out_ptr,#8*4] adcs @t[3], @t[3], @t[6] // carry is used in the final step stp @acc[2], @t[3], [$out_ptr,#8*6] ret .size __smul_512x63_tail,.-__smul_512x63_tail .type __smul_256_n_shift_by_31, %function .align 5 __smul_256_n_shift_by_31: ___ for($j=0; $j<2; $j++) { my $f0 = $f0; $f0 = $g0 if ($j); my @acc = @acc; @acc = @acc[4..7] if ($j); my $k = 8*4*$j; $code.=<<___; ldp @acc[0], @acc[1], [$in_ptr,#8*0+$k] // load |a| (or |b|) asr @t[5], $f0, #63 // |f0|'s sign as mask (or |g0|'s) ldp @acc[2], @acc[3], [$in_ptr,#8*2+$k] eor @t[6], $f0, @t[5] // conditionally negate |f0| (or |g0|) eor @acc[0], @acc[0], @t[5] // conditionally negate |a| (or |b|) sub @t[6], @t[6], @t[5] eor @acc[1], @acc[1], @t[5] adds @acc[0], @acc[0], @t[5], lsr#63 eor @acc[2], @acc[2], @t[5] adcs @acc[1], @acc[1], xzr eor @acc[3], @acc[3], @t[5] umulh @t[0], @acc[0], @t[6] adcs @acc[2], @acc[2], xzr umulh @t[1], @acc[1], @t[6] adc @acc[3], @acc[3], xzr umulh @t[2], @acc[2], @t[6] and @t[5], @t[5], @t[6] umulh @t[3+$j], @acc[3], @t[6] neg @t[5], @t[5] mul @acc[0], @acc[0], @t[6] mul @acc[1], @acc[1], @t[6] mul @acc[2], @acc[2], @t[6] adds @acc[1], @acc[1], @t[0] mul @acc[3], @acc[3], @t[6] adcs @acc[2], @acc[2], @t[1] adcs @acc[3], @acc[3], @t[2] adc @t[3+$j], @t[3+$j], @t[5] ___ } $code.=<<___; adds @acc[0], @acc[0], @acc[4] adcs @acc[1], @acc[1], @acc[5] adcs @acc[2], @acc[2], @acc[6] adcs @acc[3], @acc[3], @acc[7] adc @acc[4], @t[3], @t[4] extr @acc[0], @acc[1], @acc[0], #31 extr @acc[1], @acc[2], @acc[1], #31 extr @acc[2], @acc[3], @acc[2], #31 asr @t[4], @acc[4], #63 // result's sign as mask extr @acc[3], @acc[4], @acc[3], #31 eor @acc[0], @acc[0], @t[4] // ensure the result is positive eor @acc[1], @acc[1], @t[4] adds @acc[0], @acc[0], @t[4], lsr#63 eor @acc[2], @acc[2], @t[4] adcs @acc[1], @acc[1], xzr eor @acc[3], @acc[3], @t[4] adcs @acc[2], @acc[2], xzr stp @acc[0], @acc[1], [$out_ptr,#8*0] adc @acc[3], @acc[3], xzr stp @acc[2], @acc[3], [$out_ptr,#8*2] eor $f0, $f0, @t[4] // adjust |f/g| accordingly eor $g0, $g0, @t[4] sub $f0, $f0, @t[4] sub $g0, $g0, @t[4] ret .size __smul_256_n_shift_by_31,.-__smul_256_n_shift_by_31 ___ { my @a = @acc[0..3]; my @b = @acc[4..7]; my ($fg0, $fg1, $bias) = ($g0, $g1, @t[4]); $code.=<<___; .type __ab_approximation_31_256, %function .align 4 __ab_approximation_31_256: ldp @a[2], @a[3], [$in_ptr,#8*2] ldp @b[2], @b[3], [$in_ptr,#8*6] ldp @a[0], @a[1], [$in_ptr,#8*0] ldp @b[0], @b[1], [$in_ptr,#8*4] 
.Lab_approximation_31_256_loaded: orr @t[0], @a[3], @b[3] // check top-most limbs, ... cmp @t[0], #0 csel @a[3], @a[3], @a[2], ne csel @b[3], @b[3], @b[2], ne csel @a[2], @a[2], @a[1], ne orr @t[0], @a[3], @b[3] // and ones before top-most, ... csel @b[2], @b[2], @b[1], ne cmp @t[0], #0 csel @a[3], @a[3], @a[2], ne csel @b[3], @b[3], @b[2], ne csel @a[2], @a[2], @a[0], ne orr @t[0], @a[3], @b[3] // and one more, ... csel @b[2], @b[2], @b[0], ne clz @t[0], @t[0] cmp @t[0], #64 csel @t[0], @t[0], xzr, ne csel @a[3], @a[3], @a[2], ne csel @b[3], @b[3], @b[2], ne neg @t[1], @t[0] lslv @a[3], @a[3], @t[0] // align high limbs to the left lslv @b[3], @b[3], @t[0] lsrv @a[2], @a[2], @t[1] lsrv @b[2], @b[2], @t[1] and @a[2], @a[2], @t[1], asr#6 and @b[2], @b[2], @t[1], asr#6 orr $a_lo, @a[3], @a[2] orr $b_lo, @b[3], @b[2] bfxil $a_lo, @a[0], #0, #31 bfxil $b_lo, @b[0], #0, #31 b __inner_loop_31_256 ret .size __ab_approximation_31_256,.-__ab_approximation_31_256 .type __inner_loop_31_256, %function .align 4 __inner_loop_31_256: mov $cnt, #31 mov $fg0, #0x7FFFFFFF80000000 // |f0|=1, |g0|=0 mov $fg1, #0x800000007FFFFFFF // |f1|=0, |g1|=1 mov $bias,#0x7FFFFFFF7FFFFFFF .Loop_31_256: sbfx @t[3], $a_lo, #0, #1 // if |a_| is odd, then we'll be subtracting sub $cnt, $cnt, #1 and @t[0], $b_lo, @t[3] sub @t[1], $b_lo, $a_lo // |b_|-|a_| subs @t[2], $a_lo, @t[0] // |a_|-|b_| (or |a_|-0 if |a_| was even) mov @t[0], $fg1 csel $b_lo, $b_lo, $a_lo, hs // |b_| = |a_| csel $a_lo, @t[2], @t[1], hs // borrow means |a_|<|b_|, replace with |b_|-|a_| csel $fg1, $fg1, $fg0, hs // exchange |fg0| and |fg1| csel $fg0, $fg0, @t[0], hs lsr $a_lo, $a_lo, #1 and @t[0], $fg1, @t[3] and @t[1], $bias, @t[3] sub $fg0, $fg0, @t[0] // |f0|-=|f1| (or |f0-=0| if |a_| was even) add $fg1, $fg1, $fg1 // |f1|<<=1 add $fg0, $fg0, @t[1] sub $fg1, $fg1, $bias cbnz $cnt, .Loop_31_256 mov $bias, #0x7FFFFFFF ubfx $f0, $fg0, #0, #32 ubfx $g0, $fg0, #32, #32 ubfx $f1, $fg1, #0, #32 ubfx $g1, $fg1, #32, #32 sub $f0, $f0, $bias // remove bias sub $g0, $g0, $bias sub $f1, $f1, $bias sub $g1, $g1, $bias ret .size __inner_loop_31_256,.-__inner_loop_31_256 .type __inner_loop_62_256, %function .align 4 __inner_loop_62_256: mov $f0, #1 // |f0|=1 mov $g0, #0 // |g0|=0 mov $f1, #0 // |f1|=0 mov $g1, #1 // |g1|=1 .Loop_62_256: sbfx @t[3], $a_lo, #0, #1 // if |a_| is odd, then we'll be subtracting sub $cnt, $cnt, #1 and @t[0], $b_lo, @t[3] sub @t[1], $b_lo, $a_lo // |b_|-|a_| subs @t[2], $a_lo, @t[0] // |a_|-|b_| (or |a_|-0 if |a_| was even) mov @t[0], $f0 csel $b_lo, $b_lo, $a_lo, hs // |b_| = |a_| csel $a_lo, @t[2], @t[1], hs // borrow means |a_|<|b_|, replace with |b_|-|a_| mov @t[1], $g0 csel $f0, $f0, $f1, hs // exchange |f0| and |f1| csel $f1, $f1, @t[0], hs csel $g0, $g0, $g1, hs // exchange |g0| and |g1| csel $g1, $g1, @t[1], hs lsr $a_lo, $a_lo, #1 and @t[0], $f1, @t[3] and @t[1], $g1, @t[3] add $f1, $f1, $f1 // |f1|<<=1 add $g1, $g1, $g1 // |g1|<<=1 sub $f0, $f0, @t[0] // |f0|-=|f1| (or |f0-=0| if |a_| was even) sub $g0, $g0, @t[1] // |g0|-=|g1| (or |g0-=0| ...) cbnz $cnt, .Loop_62_256 ret .size __inner_loop_62_256,.-__inner_loop_62_256 ___ } foreach(split("\n",$code)) { s/\b(smaddl\s+x[0-9]+,\s)x([0-9]+,\s+)x([0-9]+)/$1w$2w$3/; print $_,"\n"; } close STDOUT; ================================================ FILE: src/asm/ct_inverse_mod_256-x86_64.pl ================================================ #!/usr/bin/env perl # # Copyright Supranational LLC # Licensed under the Apache License, Version 2.0, see LICENSE for details. 
# SPDX-License-Identifier: Apache-2.0 # # Both constant-time and fast Euclidean inversion as suggested in # https://eprint.iacr.org/2020/972. ~5.300 cycles on Coffee Lake. # # void ct_inverse_mod_256(vec512 ret, const vec256 inp, const vec256 mod, # const vec256 modx); # $python_ref.=<<'___'; def ct_inverse_mod_256(inp, mod): a, u = inp, 1 b, v = mod, 0 k = 31 mask = (1 << k) - 1 for i in range(0, 512 // k - 1): # __ab_approximation_31 n = max(a.bit_length(), b.bit_length()) if n < 64: a_, b_ = a, b else: a_ = (a & mask) | ((a >> (n-k-2)) << k) b_ = (b & mask) | ((b >> (n-k-2)) << k) # __inner_loop_31 f0, g0, f1, g1 = 1, 0, 0, 1 for j in range(0, k): if a_ & 1: if a_ < b_: a_, b_, f0, g0, f1, g1 = b_, a_, f1, g1, f0, g0 a_, f0, g0 = a_-b_, f0-f1, g0-g1 a_, f1, g1 = a_ >> 1, f1 << 1, g1 << 1 # __smulq_256_n_shift_by_31 a, b = (a*f0 + b*g0) >> k, (a*f1 + b*g1) >> k if a < 0: a, f0, g0 = -a, -f0, -g0 if b < 0: b, f1, g1 = -b, -f1, -g1 # __smulq_512x63 u, v = u*f0 + v*g0, u*f1 + v*g1 if 512 % k + k: f0, g0, f1, g1 = 1, 0, 0, 1 for j in range(0, 512 % k + k): if a & 1: if a < b: a, b, f0, g0, f1, g1 = b, a, f1, g1, f0, g0 a, f0, g0 = a-b, f0-f1, g0-g1 a, f1, g1 = a >> 1, f1 << 1, g1 << 1 v = u*f1 + v*g1 mod <<= 512 - mod.bit_length() # align to the left if v < 0: v += mod if v < 0: v += mod elif v == 1<<512: v -= mod return v & (2**512 - 1) # to be reduced % mod ___ $flavour = shift; $output = shift; if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or die "can't locate x86_64-xlate.pl"; open STDOUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"" or die "can't call $xlate: $!"; my ($out_ptr, $in_ptr, $n_ptr, $nx_ptr) = ("%rdi", "%rsi", "%rdx", "%rcx"); my @acc = map("%r$_",(8..15)); my ($f0, $g0, $f1, $g1) = ("%rdx","%rcx","%r12","%r13"); my $cnt = "%edx"; $frame = 8*6+2*512; $code.=<<___; .text .globl ct_inverse_mod_256 .hidden ct_inverse_mod_256 .type ct_inverse_mod_256,\@function,4,"unwind" .align 32 ct_inverse_mod_256: .cfi_startproc push %rbp .cfi_push %rbp push %rbx .cfi_push %rbx push %r12 .cfi_push %r12 push %r13 .cfi_push %r13 push %r14 .cfi_push %r14 push %r15 .cfi_push %r15 sub \$$frame, %rsp .cfi_adjust_cfa_offset $frame .cfi_end_prologue lea 8*6+511(%rsp), %rax # find closest 512-byte-aligned spot and \$-512, %rax # in the frame... 
mov $out_ptr, 8*4(%rsp) mov $nx_ptr, 8*5(%rsp) #ifdef __SGX_LVI_HARDENING__ lfence #endif mov 8*0($in_ptr), @acc[0] # load input mov 8*1($in_ptr), @acc[1] mov 8*2($in_ptr), @acc[2] mov 8*3($in_ptr), @acc[3] mov 8*0($n_ptr), @acc[4] # load modulus mov 8*1($n_ptr), @acc[5] mov 8*2($n_ptr), @acc[6] mov 8*3($n_ptr), @acc[7] mov @acc[0], 8*0(%rax) # copy input to |a| mov @acc[1], 8*1(%rax) mov @acc[2], 8*2(%rax) mov @acc[3], 8*3(%rax) mov @acc[4], 8*4(%rax) # copy modulus to |b| mov @acc[5], 8*5(%rax) mov @acc[6], 8*6(%rax) mov @acc[7], 8*7(%rax) mov %rax, $in_ptr ################################# first iteration mov \$31, $cnt call __ab_approximation_31_256 #mov $f0, 8*0(%rsp) #mov $g0, 8*1(%rsp) mov $f1, 8*2(%rsp) mov $g1, 8*3(%rsp) mov \$256, $out_ptr xor $in_ptr, $out_ptr # pointer to destination |a|b|u|v| call __smulq_256_n_shift_by_31 #mov $f0, 8*0(%rsp) # corrected |f0| #mov $g0, 8*1(%rsp) # corrected |g0| mov $f0, 8*8($out_ptr) # initialize |u| with |f0| mov 8*2(%rsp), $f0 # |f1| mov 8*3(%rsp), $g0 # |g1| lea 8*4($out_ptr), $out_ptr # pointer to destination |b| call __smulq_256_n_shift_by_31 #mov $f0, 8*2(%rsp) # corrected |f1| #mov $g0, 8*3(%rsp) # corrected |g1| mov $f0, 8*9($out_ptr) # initialize |v| with |f1| ################################# second iteration xor \$256, $in_ptr # flip-flop pointer to source |a|b|u|v| mov \$31, $cnt call __ab_approximation_31_256 #mov $f0, 8*0(%rsp) #mov $g0, 8*1(%rsp) mov $f1, 8*2(%rsp) mov $g1, 8*3(%rsp) mov \$256, $out_ptr xor $in_ptr, $out_ptr # pointer to destination |a|b|u|v| call __smulq_256_n_shift_by_31 mov $f0, 8*0(%rsp) # corrected |f0| mov $g0, 8*1(%rsp) # corrected |g0| mov 8*2(%rsp), $f0 # |f1| mov 8*3(%rsp), $g0 # |g1| lea 8*4($out_ptr), $out_ptr # pointer to destination |b| call __smulq_256_n_shift_by_31 #mov $f0, 8*2(%rsp) # corrected |f1| #mov $g0, 8*3(%rsp) # corrected |g1| mov 8*8($in_ptr), @acc[0] # |u| mov 8*13($in_ptr), @acc[4] # |v| mov @acc[0], @acc[1] imulq 8*0(%rsp), @acc[0] # |u|*|f0| mov @acc[4], @acc[5] imulq 8*1(%rsp), @acc[4] # |v|*|g0| add @acc[4], @acc[0] mov @acc[0], 8*4($out_ptr) # destination |u| sar \$63, @acc[0] # sign extension mov @acc[0], 8*5($out_ptr) mov @acc[0], 8*6($out_ptr) mov @acc[0], 8*7($out_ptr) mov @acc[0], 8*8($out_ptr) lea 8*8($in_ptr), $in_ptr # make in_ptr "rewindable" with xor imulq $f0, @acc[1] # |u|*|f1| imulq $g0, @acc[5] # |v|*|g1| add @acc[5], @acc[1] mov @acc[1], 8*9($out_ptr) # destination |v| sar \$63, @acc[1] # sign extension mov @acc[1], 8*10($out_ptr) mov @acc[1], 8*11($out_ptr) mov @acc[1], 8*12($out_ptr) mov @acc[1], 8*13($out_ptr) ___ for($i=2; $i<15; $i++) { my $smul_512x63 = $i>8 ? 
"__smulq_512x63" : "__smulq_256x63"; $code.=<<___; xor \$256+8*8, $in_ptr # flip-flop pointer to source |a|b|u|v| mov \$31, $cnt call __ab_approximation_31_256 #mov $f0, 8*0(%rsp) #mov $g0, 8*1(%rsp) mov $f1, 8*2(%rsp) mov $g1, 8*3(%rsp) mov \$256, $out_ptr xor $in_ptr, $out_ptr # pointer to destination |a|b|u|v| call __smulq_256_n_shift_by_31 mov $f0, 8*0(%rsp) # corrected |f0| mov $g0, 8*1(%rsp) # corrected |g0| mov 8*2(%rsp), $f0 # |f1| mov 8*3(%rsp), $g0 # |g1| lea 8*4($out_ptr), $out_ptr # pointer to destination |b| call __smulq_256_n_shift_by_31 mov $f0, 8*2(%rsp) # corrected |f1| mov $g0, 8*3(%rsp) # corrected |g1| mov 8*0(%rsp), $f0 # |f0| mov 8*1(%rsp), $g0 # |g0| lea 8*8($in_ptr), $in_ptr # pointer to source |u|v| lea 8*4($out_ptr), $out_ptr # pointer to destination |u| call __smulq_256x63 mov 8*2(%rsp), $f0 # |f1| mov 8*3(%rsp), $g0 # |g1| lea 8*5($out_ptr),$out_ptr # pointer to destination |v| call $smul_512x63 ___ $code.=<<___ if ($i==8); sar \$63, %rbp # sign extension mov %rbp, 8*5($out_ptr) mov %rbp, 8*6($out_ptr) mov %rbp, 8*7($out_ptr) ___ } $code.=<<___; ################################# two[!] last iterations in one go xor \$256+8*8, $in_ptr # flip-flop pointer to source |a|b|u|v| mov \$47, $cnt # 31 + 512 % 31 #call __ab_approximation_31 # |a| and |b| are exact, just load mov 8*0($in_ptr), @acc[0] # |a_lo| #xor @acc[1], @acc[1] # |a_hi| mov 8*4($in_ptr), @acc[2] # |b_lo| #xor @acc[3], @acc[3] # |b_hi| call __inner_loop_62_256 #mov $f0, 8*0(%rsp) #mov $g0, 8*1(%rsp) #mov $f1, 8*2(%rsp) #mov $g1, 8*3(%rsp) #mov 8*0(%rsp), $f0 # |f0| #mov 8*1(%rsp), $g0 # |g0| lea 8*8($in_ptr), $in_ptr # pointer to source |u|v| #lea 8*6($out_ptr), $out_ptr # pointer to destination |u| #call __smulq_256x63 #mov 8*2(%rsp), $f0 # |f1| #mov 8*3(%rsp), $g0 # |g1| mov $f1, $f0 mov $g1, $g0 mov 8*4(%rsp), $out_ptr # original |out_ptr| call __smulq_512x63 adc %rbp, %rdx # the excess limb of the result mov 8*5(%rsp), $in_ptr # original |nx_ptr| mov %rdx, %rax sar \$63, %rdx # result's sign as mask mov %rdx, @acc[0] # mask |modulus| mov %rdx, @acc[1] #ifdef __SGX_LVI_HARDENING__ lfence #endif and 8*0($in_ptr), @acc[0] mov %rdx, @acc[2] and 8*1($in_ptr), @acc[1] and 8*2($in_ptr), @acc[2] and 8*3($in_ptr), %rdx add @acc[0], @acc[4] # conditionally add |modulus|<<256 adc @acc[1], @acc[5] adc @acc[2], @acc[6] adc %rdx, @acc[7] adc \$0, %rax mov %rax, %rdx neg %rax or %rax, %rdx # excess bit or sign as mask sar \$63, %rax # excess bit as mask mov %rdx, @acc[0] # mask |modulus| mov %rdx, @acc[1] and 8*0($in_ptr), @acc[0] mov %rdx, @acc[2] and 8*1($in_ptr), @acc[1] and 8*2($in_ptr), @acc[2] and 8*3($in_ptr), %rdx xor %rax, @acc[0] # conditionally negate |modulus| xor %rcx, %rcx xor %rax, @acc[1] sub %rax, %rcx xor %rax, @acc[2] xor %rax, %rdx add %rcx, @acc[0] adc \$0, @acc[1] adc \$0, @acc[2] adc \$0, %rdx add @acc[0], @acc[4] # final adjustment for |modulus|<<256 adc @acc[1], @acc[5] adc @acc[2], @acc[6] adc %rdx, @acc[7] mov @acc[4], 8*4($out_ptr) # store absolute value mov @acc[5], 8*5($out_ptr) mov @acc[6], 8*6($out_ptr) mov @acc[7], 8*7($out_ptr) lea $frame(%rsp), %r8 # size optimization mov 8*0(%r8),%r15 .cfi_restore %r15 mov 8*1(%r8),%r14 .cfi_restore %r14 mov 8*2(%r8),%r13 .cfi_restore %r13 mov 8*3(%r8),%r12 .cfi_restore %r12 mov 8*4(%r8),%rbx .cfi_restore %rbx mov 8*5(%r8),%rbp .cfi_restore %rbp lea 8*6(%r8),%rsp .cfi_adjust_cfa_offset -$frame-8*6 .cfi_epilogue ret .cfi_endproc .size ct_inverse_mod_256,.-ct_inverse_mod_256 ___ 
######################################################################## # Signed |u|*|f?|+|v|*|g?| subroutines. "NNN" in "NNNx63" suffix refers # to the maximum bit-length of the *result*, and "63" - to the maximum # bit-length of the |f?| and |g?| single-limb multiplicands. However! # The latter should not be taken literally, as they are always chosen so # that "bad things" don't happen. For example, there comes a point when # |v| grows beyond 256 bits, while |u| remains 256 bits wide. Yet, we # always call __smulq_256x63 to perform |u|*|f0|+|v|*|g0| step. This is # because past that point |f0| is always 1 and |g0| is always 0. And, # since |u| never grows beyond 256 bits, __smulq_512x63 doesn't have to # perform full-width |u|*|f1| multiplication, half-width one with sign # extension is sufficient... $code.=<<___; .type __smulq_512x63,\@abi-omnipotent .align 32 __smulq_512x63: mov 8*0($in_ptr), @acc[0] # load |u| mov 8*1($in_ptr), @acc[1] mov 8*2($in_ptr), @acc[2] mov 8*3($in_ptr), @acc[3] mov 8*4($in_ptr), %rbp # sign limb mov $f0, %rbx sar \$63, $f0 # |f0|'s sign as mask xor %rax, %rax sub $f0, %rax # |f0|'s sign as bit xor $f0, %rbx # conditionally negate |f0| add %rax, %rbx xor $f0, @acc[0] # conditionally negate |u| xor $f0, @acc[1] xor $f0, @acc[2] xor $f0, @acc[3] xor $f0, %rbp add @acc[0], %rax adc \$0, @acc[1] adc \$0, @acc[2] adc \$0, @acc[3] adc \$0, %rbp mulq %rbx # |u|*|f0| mov %rax, 8*0($out_ptr) # offload |u|*|f0| mov @acc[1], %rax mov %rdx, @acc[1] ___ for($i=1; $i<3; $i++) { $code.=<<___; mulq %rbx add %rax, @acc[$i] mov @acc[$i+1], %rax adc \$0, %rdx mov @acc[$i], 8*$i($out_ptr) mov %rdx, @acc[$i+1] ___ } $code.=<<___; and %rbx, %rbp neg %rbp mulq %rbx add %rax, @acc[3] adc %rdx, %rbp mov @acc[3], 8*3($out_ptr) mov 8*5($in_ptr), @acc[0] # load |v| mov 8*6($in_ptr), @acc[1] mov 8*7($in_ptr), @acc[2] mov 8*8($in_ptr), @acc[3] mov 8*9($in_ptr), @acc[4] mov 8*10($in_ptr), @acc[5] mov 8*11($in_ptr), @acc[6] mov 8*12($in_ptr), @acc[7] mov $g0, $f0 sar \$63, $f0 # |g0|'s sign as mask xor %rax, %rax sub $f0, %rax # |g0|'s sign as bit xor $f0, $g0 # conditionally negate |g0| add %rax, $g0 xor $f0, @acc[0] # conditionally negate |v| xor $f0, @acc[1] xor $f0, @acc[2] xor $f0, @acc[3] xor $f0, @acc[4] xor $f0, @acc[5] xor $f0, @acc[6] xor $f0, @acc[7] add @acc[0], %rax adc \$0, @acc[1] adc \$0, @acc[2] adc \$0, @acc[3] adc \$0, @acc[4] adc \$0, @acc[5] adc \$0, @acc[6] adc \$0, @acc[7] mulq $g0 mov %rax, @acc[0] mov @acc[1], %rax mov %rdx, @acc[1] ___ for($i=1; $i<7; $i++) { $code.=<<___; mulq $g0 add %rax, @acc[$i] mov @acc[$i+1], %rax adc \$0, %rdx mov %rdx, @acc[$i+1] ___ } $code.=<<___; imulq $g0 add %rax, @acc[7] adc \$0, %rdx # used in the final step mov %rbp, %rbx sar \$63, %rbp # sign extension add 8*0($out_ptr), @acc[0] # accumulate |u|*|f0| adc 8*1($out_ptr), @acc[1] adc 8*2($out_ptr), @acc[2] adc 8*3($out_ptr), @acc[3] adc %rbx, @acc[4] adc %rbp, @acc[5] adc %rbp, @acc[6] adc %rbp, @acc[7] mov @acc[0], 8*0($out_ptr) mov @acc[1], 8*1($out_ptr) mov @acc[2], 8*2($out_ptr) mov @acc[3], 8*3($out_ptr) mov @acc[4], 8*4($out_ptr) mov @acc[5], 8*5($out_ptr) mov @acc[6], 8*6($out_ptr) mov @acc[7], 8*7($out_ptr) ret # __SGX_LVI_HARDENING_CLOBBER__=@acc[0] .size __smulq_512x63,.-__smulq_512x63 .type __smulq_256x63,\@abi-omnipotent .align 32 __smulq_256x63: ___ for($j=0; $j<2; $j++) { my $k = 8*5*$j; my @acc=@acc; @acc=@acc[4..7] if($j); my $top="%rbp"; $top=$g0 if($j); $code.=<<___; mov $k+8*0($in_ptr), @acc[0] # load |u| (or |v|) mov $k+8*1($in_ptr), @acc[1] mov $k+8*2($in_ptr), 
@acc[2] mov $k+8*3($in_ptr), @acc[3] mov $k+8*4($in_ptr), $top # sign/excess limb mov $f0, %rbx sar \$63, $f0 # |f0|'s sign as mask (or |g0|'s) xor %rax, %rax sub $f0, %rax # |f0|'s sign as bit (or |g0|'s) xor $f0, %rbx # conditionally negate |f0| add %rax, %rbx xor $f0, @acc[0] # conditionally negate |u| (or |v|) xor $f0, @acc[1] xor $f0, @acc[2] xor $f0, @acc[3] xor $f0, $top add @acc[0], %rax adc \$0, @acc[1] adc \$0, @acc[2] adc \$0, @acc[3] adc \$0, $top mulq %rbx mov %rax, @acc[0] mov @acc[1], %rax mov %rdx, @acc[1] ___ for($i=1; $i<3; $i++) { $code.=<<___; mulq %rbx add %rax, @acc[$i] mov @acc[$i+1], %rax adc \$0, %rdx mov %rdx, @acc[$i+1] ___ } $code.=<<___; and %rbx, $top neg $top mulq %rbx add %rax, @acc[3] adc %rdx, $top ___ $code.=<<___ if ($j==0); mov $g0, $f0 ___ } $code.=<<___; add @acc[4], @acc[0] # accumulate |u|*|f0| adc @acc[5], @acc[1] adc @acc[6], @acc[2] adc @acc[7], @acc[3] adc %rcx, %rbp mov @acc[0], 8*0($out_ptr) mov @acc[1], 8*1($out_ptr) mov @acc[2], 8*2($out_ptr) mov @acc[3], 8*3($out_ptr) mov %rbp, 8*4($out_ptr) ret .size __smulq_256x63,.-__smulq_256x63 ___ ######################################################################## # Signed abs(|a|*|f?|+|b|*|g?|)>>k subroutines. "NNN" in the middle of # the names refers to maximum bit-lengths of |a| and |b|. As already # mentioned, |f?| and |g?| can be viewed as 63 bits wide, but are always # chosen so that "bad things" don't happen. For example, so that the # sum of the products doesn't overflow, and that the final result is # never wider than inputs... { $code.=<<___; .type __smulq_256_n_shift_by_31,\@abi-omnipotent .align 32 __smulq_256_n_shift_by_31: mov $f0, 8*0($out_ptr) # offload |f0| mov $g0, 8*1($out_ptr) # offload |g0| mov $f0, %rbp ___ for($j=0; $j<2; $j++) { my $k = 8*4*$j; my @acc=@acc; @acc=@acc[4..7] if ($j); my $f0="%rbp"; $f0=$g0 if ($j); $code.=<<___; mov $k+8*0($in_ptr), @acc[0] # load |a| (or |b|) mov $k+8*1($in_ptr), @acc[1] mov $k+8*2($in_ptr), @acc[2] mov $k+8*3($in_ptr), @acc[3] mov $f0, %rbx sar \$63, $f0 # |f0|'s sign as mask (or |g0|'s) xor %rax, %rax sub $f0, %rax # |f0|'s sign as bit (or |g0|'s) xor $f0, %rbx # conditionally negate |f0| (or |g0|) add %rax, %rbx xor $f0, @acc[0] # conditionally negate |a| (or |b|) xor $f0, @acc[1] xor $f0, @acc[2] xor $f0, @acc[3] add @acc[0], %rax adc \$0, @acc[1] adc \$0, @acc[2] adc \$0, @acc[3] mulq %rbx mov %rax, @acc[0] mov @acc[1], %rax and %rbx, $f0 neg $f0 mov %rdx, @acc[1] ___ for($i=1; $i<3; $i++) { $code.=<<___; mulq %rbx add %rax, @acc[$i] mov @acc[$i+1], %rax adc \$0, %rdx mov %rdx, @acc[$i+1] ___ } $code.=<<___; mulq %rbx add %rax, @acc[3] adc %rdx, $f0 ___ } $code.=<<___; add @acc[4], @acc[0] adc @acc[5], @acc[1] adc @acc[6], @acc[2] adc @acc[7], @acc[3] adc $g0, %rbp mov 8*0($out_ptr), $f0 # restore original |f0| mov 8*1($out_ptr), $g0 # restore original |g0| shrd \$31, @acc[1], @acc[0] shrd \$31, @acc[2], @acc[1] shrd \$31, @acc[3], @acc[2] shrd \$31, %rbp, @acc[3] sar \$63, %rbp # sign as mask xor %rax, %rax sub %rbp, %rax # sign as bit xor %rbp, @acc[0] # conditionally negate the result xor %rbp, @acc[1] xor %rbp, @acc[2] xor %rbp, @acc[3] add %rax, @acc[0] adc \$0, @acc[1] adc \$0, @acc[2] adc \$0, @acc[3] mov @acc[0], 8*0($out_ptr) mov @acc[1], 8*1($out_ptr) mov @acc[2], 8*2($out_ptr) mov @acc[3], 8*3($out_ptr) xor %rbp, $f0 # conditionally negate |f0| xor %rbp, $g0 # conditionally negate |g0| add %rax, $f0 add %rax, $g0 ret # __SGX_LVI_HARDENING_CLOBBER__=@acc[0] .size __smulq_256_n_shift_by_31,.-__smulq_256_n_shift_by_31 ___ } { 
my ($a_lo, $a_hi, $b_lo, $b_hi) = map("%r$_",(8..11)); my ($t0, $t1, $t2, $t3, $t4) = ("%rax","%rbx","%rbp","%r14","%r15"); my ($fg0, $fg1, $bias) = ($g0, $g1, $t4); my ($a_, $b_) = ($a_lo, $b_lo); { my @a = ($a_lo, $t1, $a_hi); my @b = ($b_lo, $t2, $b_hi); $code.=<<___; .type __ab_approximation_31_256,\@abi-omnipotent .align 32 __ab_approximation_31_256: mov 8*3($in_ptr), @a[2] # load |a| in reverse order mov 8*7($in_ptr), @b[2] # load |b| in reverse order mov 8*2($in_ptr), @a[1] mov 8*6($in_ptr), @b[1] mov 8*1($in_ptr), @a[0] mov 8*5($in_ptr), @b[0] mov @a[2], $t0 or @b[2], $t0 # check top-most limbs, ... cmovz @a[1], @a[2] cmovz @b[1], @b[2] cmovz @a[0], @a[1] mov 8*0($in_ptr), @a[0] cmovz @b[0], @b[1] mov 8*4($in_ptr), @b[0] mov @a[2], $t0 or @b[2], $t0 # ... and ones before that ... cmovz @a[1], @a[2] cmovz @b[1], @b[2] cmovz @a[0], @a[1] cmovz @b[0], @b[1] mov @a[2], $t0 or @b[2], $t0 bsr $t0, %rcx lea 1(%rcx), %rcx cmovz @a[0], @a[2] cmovz @b[0], @b[2] cmovz $t0, %rcx neg %rcx #and \$63, %rcx # debugging artefact shldq %cl, @a[1], @a[2] # align second limb to the left shldq %cl, @b[1], @b[2] mov \$0x7FFFFFFF, %eax and %rax, @a[0] and %rax, @b[0] not %rax and %rax, @a[2] and %rax, @b[2] or @a[2], @a[0] or @b[2], @b[0] jmp __inner_loop_31_256 ret .size __ab_approximation_31_256,.-__ab_approximation_31_256 ___ } $code.=<<___; .type __inner_loop_31_256,\@abi-omnipotent .align 32 # comment and punish Coffee Lake by up to 40% __inner_loop_31_256: ################# by Thomas Pornin mov \$0x7FFFFFFF80000000, $fg0 # |f0|=1, |g0|=0 mov \$0x800000007FFFFFFF, $fg1 # |f1|=0, |g1|=1 mov \$0x7FFFFFFF7FFFFFFF, $bias .Loop_31_256: cmp $b_, $a_ # if |a_|<|b_|, swap the variables mov $a_, $t0 mov $b_, $t1 mov $fg0, $t2 mov $fg1, $t3 cmovb $b_, $a_ cmovb $t0, $b_ cmovb $fg1, $fg0 cmovb $t2, $fg1 sub $b_, $a_ # |a_|-|b_| sub $fg1, $fg0 # |f0|-|f1|, |g0|-|g1| add $bias, $fg0 test \$1, $t0 # if |a_| was even, roll back cmovz $t0, $a_ cmovz $t1, $b_ cmovz $t2, $fg0 cmovz $t3, $fg1 shr \$1, $a_ # |a_|>>=1 add $fg1, $fg1 # |f1|<<=1, |g1|<<=1 sub $bias, $fg1 sub \$1, $cnt jnz .Loop_31_256 shr \$32, $bias mov %ecx, %edx # $fg0, $f0 mov ${fg1}d, ${f1}d shr \$32, $g0 shr \$32, $g1 sub $bias, $f0 # remove the bias sub $bias, $g0 sub $bias, $f1 sub $bias, $g1 ret # __SGX_LVI_HARDENING_CLOBBER__=$a_lo .size __inner_loop_31_256,.-__inner_loop_31_256 .type __inner_loop_62_256,\@abi-omnipotent .align 32 __inner_loop_62_256: mov $cnt, %r15d mov \$1, $f0 # |f0|=1 xor $g0, $g0 # |g0|=0 xor $f1, $f1 # |f1|=0 mov $f0, $g1 # |g1|=1 mov $f0, %r14 .Loop_62_256: xor $t0, $t0 test %r14, $a_lo # if |a_| is odd, then we'll be subtracting |b_| mov $b_lo, $t1 cmovnz $b_lo, $t0 sub $a_lo, $t1 # |b_|-|a_| mov $a_lo, $t2 sub $t0, $a_lo # |a_|-|b_| (or |a_|-0 if |a_| was even) cmovc $t1, $a_lo # borrow means |a_|<|b_|, replace with |b_|-|a_| cmovc $t2, $b_lo # |b_| = |a_| mov $f0, $t0 # exchange |f0| and |f1| cmovc $f1, $f0 cmovc $t0, $f1 mov $g0, $t1 # exchange |g0| and |g1| cmovc $g1, $g0 cmovc $t1, $g1 xor $t0, $t0 xor $t1, $t1 shr \$1, $a_lo test %r14, $t2 # if |a_| was odd, then we'll be subtracting... cmovnz $f1, $t0 cmovnz $g1, $t1 add $f1, $f1 # |f1|<<=1 add $g1, $g1 # |g1|<<=1 sub $t0, $f0 # |f0|-=|f1| (or |f0-=0| if |a_| was even) sub $t1, $g0 # |g0|-=|g1| (or |g0-=0| ...) 
sub \$1, %r15d jnz .Loop_62_256 ret # __SGX_LVI_HARDENING_CLOBBER__=$a_lo .size __inner_loop_62_256,.-__inner_loop_62_256 ___ } print $code; close STDOUT; ================================================ FILE: src/asm/ct_inverse_mod_384-armv8.pl ================================================ #!/usr/bin/env perl # # Copyright Supranational LLC # Licensed under the Apache License, Version 2.0, see LICENSE for details. # SPDX-License-Identifier: Apache-2.0 # # Both constant-time and fast Euclidean inversion as suggested in # https://eprint.iacr.org/2020/972. Performance is >12x better [on # Cortex cores] than modulus-specific FLT addition chain... # # void ct_inverse_mod_384(vec768 ret, const vec384 inp, const vec384 mod); # $python_ref.=<<'___'; def ct_inverse_mod_384(inp, mod): a, u = inp, 1 b, v = mod, 0 k = 62 w = 64 mask = (1 << w) - 1 for i in range(0, 768 // k): # __ab_approximation_62 n = max(a.bit_length(), b.bit_length()) if n < 128: a_, b_ = a, b else: a_ = (a & mask) | ((a >> (n-w)) << w) b_ = (b & mask) | ((b >> (n-w)) << w) # __inner_loop_62 f0, g0, f1, g1 = 1, 0, 0, 1 for j in range(0, k): if a_ & 1: if a_ < b_: a_, b_, f0, g0, f1, g1 = b_, a_, f1, g1, f0, g0 a_, f0, g0 = a_-b_, f0-f1, g0-g1 a_, f1, g1 = a_ >> 1, f1 << 1, g1 << 1 # __smul_384_n_shift_by_62 a, b = (a*f0 + b*g0) >> k, (a*f1 + b*g1) >> k if a < 0: a, f0, g0 = -a, -f0, -g0 if b < 0: b, f1, g1 = -b, -f1, -g1 # __smul_768x63 u, v = u*f0 + v*g0, u*f1 + v*g1 if 768 % k: f0, g0, f1, g1 = 1, 0, 0, 1 for j in range(0, 768 % k): if a & 1: if a < b: a, b, f0, g0, f1, g1 = b, a, f1, g1, f0, g0 a, f0, g0 = a-b, f0-f1, g0-g1 a, f1, g1 = a >> 1, f1 << 1, g1 << 1 v = u*f1 + v*g1 mod <<= 768 - mod.bit_length() # align to the left if v < 0: v += mod if v < 0: v += mod elif v == 1<<768: v -= mod return v & (2**768 - 1) # to be reduced % mod ___ $flavour = shift; $output = shift; if ($flavour && $flavour ne "void") { $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or die "can't locate arm-xlate.pl"; open STDOUT,"| \"$^X\" $xlate $flavour $output"; } else { open STDOUT,">$output"; } my ($out_ptr, $in_ptr, $n_ptr, $nx_ptr) = map("x$_", (0..3)); my @acc=map("x$_",(3..14)); my ($f0, $g0, $f1, $g1, $f_, $g_) = map("x$_",(15..17,19..21)); my $cnt = $n_ptr; my @t = map("x$_",(22..28,2)); my ($a_lo, $a_hi, $b_lo, $b_hi) = @acc[0,5,6,11]; $frame = 32+2*512; $code.=<<___; .text .globl ct_inverse_mod_384 .hidden ct_inverse_mod_384 .type ct_inverse_mod_384, %function .align 5 ct_inverse_mod_384: paciasp stp c29, c30, [csp,#-16*__SIZEOF_POINTER__]! add c29, csp, #0 stp c19, c20, [csp,#2*__SIZEOF_POINTER__] stp c21, c22, [csp,#4*__SIZEOF_POINTER__] stp c23, c24, [csp,#6*__SIZEOF_POINTER__] stp c25, c26, [csp,#8*__SIZEOF_POINTER__] stp c27, c28, [csp,#10*__SIZEOF_POINTER__] sub csp, csp, #$frame ldp @t[0], @acc[1], [$in_ptr,#8*0] ldp @acc[2], @acc[3], [$in_ptr,#8*2] ldp @acc[4], @acc[5], [$in_ptr,#8*4] #ifdef __CHERI_PURE_CAPABILITY__ cadd $in_ptr, csp, #32+511 alignd $in_ptr, $in_ptr, #9 scbnds $in_ptr, $in_ptr, #512 #else add $in_ptr, sp, #32+511 // find closest 512-byte-aligned spot and $in_ptr, $in_ptr, #-512 // in the frame... 
#endif stp c0, c3, [csp] // offload out_ptr, nx_ptr ldp @acc[6], @acc[7], [$n_ptr,#8*0] ldp @acc[8], @acc[9], [$n_ptr,#8*2] ldp @acc[10], @acc[11], [$n_ptr,#8*4] stp @t[0], @acc[1], [$in_ptr,#8*0] // copy input to |a| stp @acc[2], @acc[3], [$in_ptr,#8*2] stp @acc[4], @acc[5], [$in_ptr,#8*4] stp @acc[6], @acc[7], [$in_ptr,#8*6] // copy modulus to |b| stp @acc[8], @acc[9], [$in_ptr,#8*8] stp @acc[10], @acc[11], [$in_ptr,#8*10] ////////////////////////////////////////// first iteration mov $cnt, #62 bl .Lab_approximation_62_loaded eor $out_ptr, $in_ptr, #256 // pointer to dst |a|b|u|v| #ifdef __CHERI_PURE_CAPABILITY__ scvalue $out_ptr, $in_ptr, $out_ptr #endif bl __smul_384_n_shift_by_62 str $f0,[$out_ptr,#8*12] // initialize |u| with |f0| mov $f0, $f1 // |f1| mov $g0, $g1 // |g1| cadd $out_ptr, $out_ptr, #8*6 // pointer to dst |b| bl __smul_384_n_shift_by_62 str $f0, [$out_ptr,#8*14] // initialize |v| with |f1| ////////////////////////////////////////// second iteration eor $in_ptr, $in_ptr, #256 // flip-flop src |a|b|u|v| #ifdef __CHERI_PURE_CAPABILITY__ scvalue $in_ptr, $out_ptr, $in_ptr #endif mov $cnt, #62 bl __ab_approximation_62 eor $out_ptr, $in_ptr, #256 // pointer to dst |a|b|u|v| #ifdef __CHERI_PURE_CAPABILITY__ scvalue $out_ptr, $in_ptr, $out_ptr #endif bl __smul_384_n_shift_by_62 mov $f_, $f0 // corrected |f0| mov $g_, $g0 // corrected |g0| mov $f0, $f1 // |f1| mov $g0, $g1 // |g1| cadd $out_ptr, $out_ptr, #8*6 // pointer to destination |b| bl __smul_384_n_shift_by_62 ldr @acc[4], [$in_ptr,#8*12] // |u| ldr @acc[5], [$in_ptr,#8*20] // |v| mul @acc[0], $f_, @acc[4] // |u|*|f0| smulh @acc[1], $f_, @acc[4] mul @acc[2], $g_, @acc[5] // |v|*|g0| smulh @acc[3], $g_, @acc[5] adds @acc[0], @acc[0], @acc[2] adc @acc[1], @acc[1], @acc[3] stp @acc[0], @acc[1], [$out_ptr,#8*6] asr @acc[2], @acc[1], #63 // sign extension stp @acc[2], @acc[2], [$out_ptr,#8*8] stp @acc[2], @acc[2], [$out_ptr,#8*10] mul @acc[0], $f0, @acc[4] // |u|*|f1| smulh @acc[1], $f0, @acc[4] mul @acc[2], $g0, @acc[5] // |v|*|g1| smulh @acc[3], $g0, @acc[5] adds @acc[0], @acc[0], @acc[2] adc @acc[1], @acc[1], @acc[3] stp @acc[0], @acc[1], [$out_ptr,#8*14] asr @acc[2], @acc[1], #63 // sign extension stp @acc[2], @acc[2], [$out_ptr,#8*16] stp @acc[2], @acc[2], [$out_ptr,#8*18] ___ for($i=2; $i<11; $i++) { $code.=<<___; eor $in_ptr, $in_ptr, #256 // flip-flop src |a|b|u|v| #ifdef __CHERI_PURE_CAPABILITY__ scvalue $in_ptr, $out_ptr, $in_ptr #endif mov $cnt, #62 bl __ab_approximation_62 eor $out_ptr, $in_ptr, #256 // pointer to dst |a|b|u|v| #ifdef __CHERI_PURE_CAPABILITY__ scvalue $out_ptr, $in_ptr, $out_ptr #endif bl __smul_384_n_shift_by_62 mov $f_, $f0 // corrected |f0| mov $g_, $g0 // corrected |g0| mov $f0, $f1 // |f1| mov $g0, $g1 // |g1| cadd $out_ptr, $out_ptr, #8*6 // pointer to destination |b| bl __smul_384_n_shift_by_62 cadd $out_ptr, $out_ptr, #8*6 // pointer to destination |u| bl __smul_384x63 ___ $code.=<<___ if ($i==5); asr @t[5], @t[5], #63 str @t[5], [$out_ptr,#8*6] ___ $code.=<<___ if ($i>5); adc @t[3], @t[3], @t[4] str @t[3], [$out_ptr,#8*6] ___ $code.=<<___; mov $f_, $f0 // corrected |f1| mov $g_, $g0 // corrected |g1| cadd $out_ptr, $out_ptr, #8*8 // pointer to destination |v| bl __smul_384x63 ___ $code.=<<___ if ($i>5); bl __smul_768x63_tail ___ $code.=<<___ if ($i==5); asr @t[5], @t[5], #63 // sign extension stp @t[5], @t[5], [$out_ptr,#8*6] stp @t[5], @t[5], [$out_ptr,#8*8] stp @t[5], @t[5], [$out_ptr,#8*10] ___ } $code.=<<___; ////////////////////////////////////////// iteration before last eor $in_ptr, 
$in_ptr, #256 // flip-flop src |a|b|u|v| #ifdef __CHERI_PURE_CAPABILITY__ scvalue $in_ptr, $out_ptr, $in_ptr #endif mov $cnt, #62 //bl __ab_approximation_62 // |a| and |b| are exact, ldp $a_lo, $a_hi, [$in_ptr,#8*0] // just load ldp $b_lo, $b_hi, [$in_ptr,#8*6] bl __inner_loop_62 eor $out_ptr, $in_ptr, #256 // pointer to dst |a|b|u|v| #ifdef __CHERI_PURE_CAPABILITY__ scvalue $out_ptr, $in_ptr, $out_ptr #endif str $a_lo, [$out_ptr,#8*0] str $b_lo, [$out_ptr,#8*6] mov $f_, $f0 // exact |f0| mov $g_, $g0 // exact |g0| mov $f0, $f1 mov $g0, $g1 cadd $out_ptr, $out_ptr, #8*12 // pointer to dst |u| bl __smul_384x63 adc @t[3], @t[3], @t[4] str @t[3], [$out_ptr,#8*6] mov $f_, $f0 // exact |f1| mov $g_, $g0 // exact |g1| cadd $out_ptr, $out_ptr, #8*8 // pointer to dst |v| bl __smul_384x63 bl __smul_768x63_tail ////////////////////////////////////////// last iteration eor $in_ptr, $in_ptr, #256 // flip-flop src |a|b|u|v| #ifdef __CHERI_PURE_CAPABILITY__ scvalue $in_ptr, $out_ptr, $in_ptr #endif mov $cnt, #24 // 768 % 62 //bl __ab_approximation_62 // |a| and |b| are exact, ldr $a_lo, [$in_ptr,#8*0] // just load eor $a_hi, $a_hi, $a_hi ldr $b_lo, [$in_ptr,#8*6] eor $b_hi, $b_hi, $b_hi bl __inner_loop_62 mov $f_, $f1 mov $g_, $g1 ldp c0, c15, [csp] // original out_ptr and n_ptr bl __smul_384x63 bl __smul_768x63_tail ldr c30, [c29,#__SIZEOF_POINTER__] smulh @t[1], @acc[5], $g_ // figure out top-most limb adc @t[4], @t[4], @t[6] ldp @acc[6], @acc[7], [$f0,#8*0] // load |mod| add @t[1], @t[1], @t[4] // @t[1] is 1, 0 or -1 ldp @acc[8], @acc[9], [$f0,#8*2] asr @t[0], @t[1], #63 // sign as mask ldp @acc[10], @acc[11], [$f0,#8*4] and @t[4], @acc[6], @t[0] // add mod<<384 conditionally and @t[5], @acc[7], @t[0] adds @acc[0], @acc[0], @t[4] and @t[6], @acc[8], @t[0] adcs @acc[1], @acc[1], @t[5] and @t[7], @acc[9], @t[0] adcs @acc[2], @acc[2], @t[6] and @t[4], @acc[10], @t[0] adcs @acc[3], @acc[3], @t[7] and @t[5], @acc[11], @t[0] adcs @acc[4], @acc[4], @t[4] adcs @acc[5], @t[3], @t[5] adc @t[1], @t[1], xzr // @t[1] is 1, 0 or -1 neg @t[0], @t[1] orr @t[1], @t[1], @t[0] // excess bit or sign as mask asr @t[0], @t[0], #63 // excess bit as mask and @acc[6], @acc[6], @t[1] // mask |mod| and @acc[7], @acc[7], @t[1] and @acc[8], @acc[8], @t[1] and @acc[9], @acc[9], @t[1] and @acc[10], @acc[10], @t[1] and @acc[11], @acc[11], @t[1] eor @acc[6], @acc[6], @t[0] // conditionally negate |mod| eor @acc[7], @acc[7], @t[0] adds @acc[6], @acc[6], @t[0], lsr#63 eor @acc[8], @acc[8], @t[0] adcs @acc[7], @acc[7], xzr eor @acc[9], @acc[9], @t[0] adcs @acc[8], @acc[8], xzr eor @acc[10], @acc[10], @t[0] adcs @acc[9], @acc[9], xzr eor @acc[11], @acc[11], @t[0] adcs @acc[10], @acc[10], xzr adc @acc[11], @acc[11], xzr adds @acc[0], @acc[0], @acc[6] // final adjustment for |mod|<<384 adcs @acc[1], @acc[1], @acc[7] adcs @acc[2], @acc[2], @acc[8] adcs @acc[3], @acc[3], @acc[9] stp @acc[0], @acc[1], [$out_ptr,#8*6] adcs @acc[4], @acc[4], @acc[10] stp @acc[2], @acc[3], [$out_ptr,#8*8] adc @acc[5], @acc[5], @acc[11] stp @acc[4], @acc[5], [$out_ptr,#8*10] add csp, csp, #$frame ldp c19, c20, [c29,#2*__SIZEOF_POINTER__] ldp c21, c22, [c29,#4*__SIZEOF_POINTER__] ldp c23, c24, [c29,#6*__SIZEOF_POINTER__] ldp c25, c26, [c29,#8*__SIZEOF_POINTER__] ldp c27, c28, [c29,#10*__SIZEOF_POINTER__] ldr c29, [csp],#16*__SIZEOF_POINTER__ autiasp ret .size ct_inverse_mod_384,.-ct_inverse_mod_384 //////////////////////////////////////////////////////////////////////// // see corresponding commentary in ctx_inverse_mod_384-x86_64... 
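//
// __smul_384x63 computes |u|*|f_| + |v|*|g_| over 384-bit |u|/|v|
// (plus a sign limb) and 63-bit signed multiplicands. Signs are
// stripped up front so that plain mul/umulh can be used; a minimal
// sketch of the conditional-negation idiom used throughout these
// files (illustrative Python, not part of the build):
//
//	s = f >> 63              # arithmetic shift: -1 if f < 0, else 0
//	f = (f ^ s) - s          # |f|, i.e. two's-complement negate when s == -1
//	u = (u ^ s) - s          # negate u by f's sign: (-u)*(-f) == u*f
//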
.type __smul_384x63, %function .align 5 __smul_384x63: ___ for($j=0; $j<2; $j++) { my $f_ = $f_; $f_ = $g_ if ($j); my @acc = @acc; @acc = @acc[6..11] if ($j); my $k = 8*12+8*8*$j; $code.=<<___; ldp @acc[0], @acc[1], [$in_ptr,#8*0+$k] // load |u| (or |v|) asr $f1, $f_, #63 // |f_|'s sign as mask (or |g_|'s) ldp @acc[2], @acc[3], [$in_ptr,#8*2+$k] eor $f_, $f_, $f1 // conditionally negate |f_| (or |g_|) ldp @acc[4], @acc[5], [$in_ptr,#8*4+$k] eor @acc[0], @acc[0], $f1 // conditionally negate |u| (or |v|) ldr @t[3+$j], [$in_ptr,#8*6+$k] sub $f_, $f_, $f1 eor @acc[1], @acc[1], $f1 adds @acc[0], @acc[0], $f1, lsr#63 eor @acc[2], @acc[2], $f1 adcs @acc[1], @acc[1], xzr eor @acc[3], @acc[3], $f1 adcs @acc[2], @acc[2], xzr eor @acc[4], @acc[4], $f1 adcs @acc[3], @acc[3], xzr umulh @t[0], @acc[0], $f_ eor @acc[5], @acc[5], $f1 umulh @t[1], @acc[1], $f_ adcs @acc[4], @acc[4], xzr umulh @t[2], @acc[2], $f_ eor @t[3+$j], @t[3+$j], $f1 mul @acc[0], @acc[0], $f_ adcs @acc[5], @acc[5], xzr mul @acc[1], @acc[1], $f_ adcs @t[3+$j], @t[3+$j], xzr ___ $code.=<<___ if ($j); adc $g1, xzr, xzr // used in __smul_768x63_tail ___ $code.=<<___; cmp $f_, #0 mul @acc[2], @acc[2], $f_ csel @t[3+$j], @t[3+$j], xzr, ne adds @acc[1], @acc[1], @t[0] umulh @t[0], @acc[3], $f_ adcs @acc[2], @acc[2], @t[1] umulh @t[1], @acc[4], $f_ mul @acc[3], @acc[3], $f_ mul @acc[4], @acc[4], $f_ adcs @acc[3], @acc[3], @t[2] mul @t[5+$j],@acc[5], $f_ adcs @acc[4], @acc[4], @t[0] adcs @t[5+$j],@t[5+$j],@t[1] ___ $code.=<<___ if ($j==0); adc @t[7], xzr, xzr ___ } $code.=<<___; adc @t[7], @t[7], xzr adds @acc[0], @acc[0], @acc[6] adcs @acc[1], @acc[1], @acc[7] adcs @acc[2], @acc[2], @acc[8] adcs @acc[3], @acc[3], @acc[9] stp @acc[0], @acc[1], [$out_ptr,#8*0] adcs @acc[4], @acc[4], @acc[10] stp @acc[2], @acc[3], [$out_ptr,#8*2] adcs @t[5], @t[5], @t[6] stp @acc[4], @t[5], [$out_ptr,#8*4] ret .size __smul_384x63,.-__smul_384x63 .type __smul_768x63_tail, %function .align 5 __smul_768x63_tail: umulh @t[5], @acc[5], $f_ ldr @acc[1], [$in_ptr,#8*27]// load rest of |v| adc @t[7], @t[7], xzr ldp @acc[2], @acc[3], [$in_ptr,#8*28] and @t[3], @t[3], $f_ ldp @acc[4], @acc[5], [$in_ptr,#8*30] sub @t[5], @t[5], @t[3] // tie up |u|*|f1| chain umulh @acc[11], @acc[11], $g_ // resume |v|*|g1| chain eor @acc[1], @acc[1], $f1 // conditionally negate rest of |v| eor @acc[2], @acc[2], $f1 eor @acc[3], @acc[3], $f1 adds @acc[1], @acc[1], $g1 eor @acc[4], @acc[4], $f1 adcs @acc[2], @acc[2], xzr eor @acc[5], @acc[5], $f1 adcs @acc[3], @acc[3], xzr umulh @t[0], @t[4], $g_ adcs @acc[4], @acc[4], xzr umulh @t[1], @acc[1], $g_ adc @acc[5], @acc[5], xzr umulh @t[2], @acc[2], $g_ add @acc[11], @acc[11], @t[7] umulh @t[3], @acc[3], $g_ asr @t[6], @t[5], #63 umulh @t[7], @acc[4], $g_ mul @acc[0], @t[4], $g_ mul @acc[1], @acc[1], $g_ mul @acc[2], @acc[2], $g_ adds @acc[0], @acc[0], @acc[11] mul @acc[3], @acc[3], $g_ adcs @acc[1], @acc[1], @t[0] mul @acc[4], @acc[4], $g_ adcs @acc[2], @acc[2], @t[1] mul @t[0], @acc[5], $g_ adcs @acc[3], @acc[3], @t[2] adcs @acc[4], @acc[4], @t[3] adcs @t[3], @t[0], @t[7] adc @t[4], xzr, xzr // used in the final step adds @acc[0], @acc[0], @t[5] adcs @acc[1], @acc[1], @t[6] adcs @acc[2], @acc[2], @t[6] adcs @acc[3], @acc[3], @t[6] stp @acc[0], @acc[1], [$out_ptr,#8*6] adcs @acc[4], @acc[4], @t[6] stp @acc[2], @acc[3], [$out_ptr,#8*8] adcs @t[3], @t[3], @t[6] // carry is used in the final step stp @acc[4], @t[3], [$out_ptr,#8*10] ret .size __smul_768x63_tail,.-__smul_768x63_tail .type __smul_384_n_shift_by_62, %function .align 5 
__smul_384_n_shift_by_62: ___ for($j=0; $j<2; $j++) { my $f0 = $f0; $f0 = $g0 if ($j); my @acc = @acc; @acc = @acc[6..11] if ($j); my $k = 8*6*$j; $code.=<<___; ldp @acc[0], @acc[1], [$in_ptr,#8*0+$k] // load |a| (or |b|) asr @t[6], $f0, #63 // |f0|'s sign as mask (or |g0|'s) ldp @acc[2], @acc[3], [$in_ptr,#8*2+$k] eor @t[7], $f0, @t[6] // conditionally negate |f0| (or |g0|) ldp @acc[4], @acc[5], [$in_ptr,#8*4+$k] eor @acc[0], @acc[0], @t[6] // conditionally negate |a| (or |b|) sub @t[7], @t[7], @t[6] eor @acc[1], @acc[1], @t[6] adds @acc[0], @acc[0], @t[6], lsr#63 eor @acc[2], @acc[2], @t[6] adcs @acc[1], @acc[1], xzr eor @acc[3], @acc[3], @t[6] adcs @acc[2], @acc[2], xzr eor @acc[4], @acc[4], @t[6] umulh @t[0], @acc[0], @t[7] adcs @acc[3], @acc[3], xzr umulh @t[1], @acc[1], @t[7] eor @acc[5], @acc[5], @t[6] mul @acc[0], @acc[0], @t[7] adcs @acc[4], @acc[4], xzr mul @acc[1], @acc[1], @t[7] adc @acc[5], @acc[5], xzr umulh @t[2], @acc[2], @t[7] and @t[6], @t[6], @t[7] umulh @t[3], @acc[3], @t[7] adds @acc[1], @acc[1], @t[0] mul @acc[2], @acc[2], @t[7] umulh @t[0], @acc[4], @t[7] neg @t[6], @t[6] mul @acc[3], @acc[3], @t[7] adcs @acc[2], @acc[2], @t[1] umulh @t[1], @acc[5], @t[7] mul @acc[4], @acc[4], @t[7] adcs @acc[3], @acc[3], @t[2] mul @acc[5], @acc[5], @t[7] adcs @acc[4], @acc[4], @t[3] adcs @acc[5], @acc[5], @t[0] adc @t[5+$j], @t[1], @t[6] ___ } $code.=<<___; adds @acc[0], @acc[0], @acc[6] adcs @acc[1], @acc[1], @acc[7] adcs @acc[2], @acc[2], @acc[8] adcs @acc[3], @acc[3], @acc[9] adcs @acc[4], @acc[4], @acc[10] adcs @acc[5], @acc[5], @acc[11] adc @acc[6], @t[5], @t[6] extr @acc[0], @acc[1], @acc[0], #62 extr @acc[1], @acc[2], @acc[1], #62 extr @acc[2], @acc[3], @acc[2], #62 asr @t[6], @acc[6], #63 extr @acc[3], @acc[4], @acc[3], #62 extr @acc[4], @acc[5], @acc[4], #62 extr @acc[5], @acc[6], @acc[5], #62 eor @acc[0], @acc[0], @t[6] eor @acc[1], @acc[1], @t[6] adds @acc[0], @acc[0], @t[6], lsr#63 eor @acc[2], @acc[2], @t[6] adcs @acc[1], @acc[1], xzr eor @acc[3], @acc[3], @t[6] adcs @acc[2], @acc[2], xzr eor @acc[4], @acc[4], @t[6] adcs @acc[3], @acc[3], xzr eor @acc[5], @acc[5], @t[6] stp @acc[0], @acc[1], [$out_ptr,#8*0] adcs @acc[4], @acc[4], xzr stp @acc[2], @acc[3], [$out_ptr,#8*2] adc @acc[5], @acc[5], xzr stp @acc[4], @acc[5], [$out_ptr,#8*4] eor $f0, $f0, @t[6] eor $g0, $g0, @t[6] sub $f0, $f0, @t[6] sub $g0, $g0, @t[6] ret .size __smul_384_n_shift_by_62,.-__smul_384_n_shift_by_62 ___ { my @a = @acc[0..5]; my @b = @acc[6..11]; $code.=<<___; .type __ab_approximation_62, %function .align 4 __ab_approximation_62: ldp @a[4], @a[5], [$in_ptr,#8*4] ldp @b[4], @b[5], [$in_ptr,#8*10] ldp @a[2], @a[3], [$in_ptr,#8*2] ldp @b[2], @b[3], [$in_ptr,#8*8] .Lab_approximation_62_loaded: orr @t[0], @a[5], @b[5] // check top-most limbs, ... cmp @t[0], #0 csel @a[5], @a[5], @a[4], ne csel @b[5], @b[5], @b[4], ne csel @a[4], @a[4], @a[3], ne orr @t[0], @a[5], @b[5] // ... ones before top-most, ... csel @b[4], @b[4], @b[3], ne ldp @a[0], @a[1], [$in_ptr,#8*0] ldp @b[0], @b[1], [$in_ptr,#8*6] cmp @t[0], #0 csel @a[5], @a[5], @a[4], ne csel @b[5], @b[5], @b[4], ne csel @a[4], @a[4], @a[2], ne orr @t[0], @a[5], @b[5] // ... and ones before that ... 
csel @b[4], @b[4], @b[2], ne cmp @t[0], #0 csel @a[5], @a[5], @a[4], ne csel @b[5], @b[5], @b[4], ne csel @a[4], @a[4], @a[1], ne orr @t[0], @a[5], @b[5] csel @b[4], @b[4], @b[1], ne clz @t[0], @t[0] cmp @t[0], #64 csel @t[0], @t[0], xzr, ne csel @a[5], @a[5], @a[4], ne csel @b[5], @b[5], @b[4], ne neg @t[1], @t[0] lslv @a[5], @a[5], @t[0] // align high limbs to the left lslv @b[5], @b[5], @t[0] lsrv @a[4], @a[4], @t[1] lsrv @b[4], @b[4], @t[1] and @a[4], @a[4], @t[1], asr#6 and @b[4], @b[4], @t[1], asr#6 orr @a[5], @a[5], @a[4] orr @b[5], @b[5], @b[4] b __inner_loop_62 ret .size __ab_approximation_62,.-__ab_approximation_62 ___ } $code.=<<___; .type __inner_loop_62, %function .align 4 __inner_loop_62: mov $f0, #1 // |f0|=1 mov $g0, #0 // |g0|=0 mov $f1, #0 // |f1|=0 mov $g1, #1 // |g1|=1 .Loop_62: sbfx @t[6], $a_lo, #0, #1 // if |a_| is odd, then we'll be subtracting sub $cnt, $cnt, #1 subs @t[2], $b_lo, $a_lo // |b_|-|a_| and @t[0], $b_lo, @t[6] sbc @t[3], $b_hi, $a_hi and @t[1], $b_hi, @t[6] subs @t[4], $a_lo, @t[0] // |a_|-|b_| (or |a_|-0 if |a_| was even) mov @t[0], $f0 sbcs @t[5], $a_hi, @t[1] mov @t[1], $g0 csel $b_lo, $b_lo, $a_lo, hs // |b_| = |a_| csel $b_hi, $b_hi, $a_hi, hs csel $a_lo, @t[4], @t[2], hs // borrow means |a_|<|b_|, replace with |b_|-|a_| csel $a_hi, @t[5], @t[3], hs csel $f0, $f0, $f1, hs // exchange |f0| and |f1| csel $f1, $f1, @t[0], hs csel $g0, $g0, $g1, hs // exchange |g0| and |g1| csel $g1, $g1, @t[1], hs extr $a_lo, $a_hi, $a_lo, #1 lsr $a_hi, $a_hi, #1 and @t[0], $f1, @t[6] and @t[1], $g1, @t[6] add $f1, $f1, $f1 // |f1|<<=1 add $g1, $g1, $g1 // |g1|<<=1 sub $f0, $f0, @t[0] // |f0|-=|f1| (or |f0-=0| if |a_| was even) sub $g0, $g0, @t[1] // |g0|-=|g1| (or |g0-=0| ...) cbnz $cnt, .Loop_62 ret .size __inner_loop_62,.-__inner_loop_62 ___ print $code; close STDOUT; ================================================ FILE: src/asm/ct_is_square_mod_384-armv8.pl ================================================ #!/usr/bin/env perl # # Copyright Supranational LLC # Licensed under the Apache License, Version 2.0, see LICENSE for details. # SPDX-License-Identifier: Apache-2.0 # # Both constant-time and fast quadratic residue test as suggested in # https://eprint.iacr.org/2020/972. Performance is >12x better [on # Cortex cores] than modulus-specific Legendre symbol addition chain... 
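#
# A hedged sanity check for the Python reference below (illustrative
# only, not part of the build): for an odd prime p and 0 < x < p, the
# result should agree with Euler's criterion.
#
#	p = 1009                            # any odd prime
#	for x in range(1, p):
#	    assert ct_is_square_mod_384(x, p) == (pow(x, (p-1)//2, p) == 1)
#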
# # bool ct_is_square_mod_384(const vec384 inp, const vec384 mod); # $python_ref.=<<'___'; def ct_is_square_mod_384(inp, mod): a = inp b = mod L = 0 # only least significant bit, adding 1 makes up for sign change k = 30 w = 32 mask = (1 << w) - 1 for i in range(0, 768 // k - 1): # __ab_approximation_30 n = max(a.bit_length(), b.bit_length()) if n < 64: a_, b_ = a, b else: a_ = (a & mask) | ((a >> (n-w)) << w) b_ = (b & mask) | ((b >> (n-w)) << w) # __inner_loop_30 f0, g0, f1, g1 = 1, 0, 0, 1 for j in range(0, k): if a_ & 1: if a_ < b_: a_, b_, f0, g0, f1, g1 = b_, a_, f1, g1, f0, g0 L += (a_ & b_) >> 1 # |a| and |b| are both odd, second bits # tell the whole story a_, f0, g0 = a_-b_, f0-f1, g0-g1 a_, f1, g1 = a_ >> 1, f1 << 1, g1 << 1 L += (b_ + 2) >> 2 # if |b|%8 is 3 or 5 [out of 1,3,5,7] # __smulq_384_n_shift_by_30 a, b = (a*f0 + b*g0) >> k, (a*f1 + b*g1) >> k if b < 0: b = -b if a < 0: a = -a L += (b % 4) >> 1 # |b| is always odd, the second bit # tells the whole story if True: for j in range(0, 768 % k + k): if a & 1: if a < b: a, b = b, a L += (a & b) >> 1 # |a| and |b| are both odd, second bits # tell the whole story a = a-b a = a >> 1 L += (b + 2) >> 2 # if |b|%8 is 3 or 5 [out of 1,3,5,7] return (L & 1) ^ 1 ___ $flavour = shift; $output = shift; if ($flavour && $flavour ne "void") { $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or die "can't locate arm-xlate.pl"; open STDOUT,"| \"$^X\" $xlate $flavour $output"; } else { open STDOUT,">$output"; } my ($in_ptr, $out_ptr, $L) = map("x$_", (0..2)); my @acc=map("x$_",(3..14)); my ($cnt, $f0, $g0, $f1, $g1) = map("x$_",(15..17,19..20)); my @t = map("x$_",(21..28)); my ($a_, $b_) = @acc[5,11]; $frame = 2*256; $code.=<<___; .text .globl ct_is_square_mod_384 .hidden ct_is_square_mod_384 .type ct_is_square_mod_384, %function .align 5 ct_is_square_mod_384: paciasp stp c29, c30, [csp,#-16*__SIZEOF_POINTER__]! add c29, csp, #0 stp c19, c20, [csp,#2*__SIZEOF_POINTER__] stp c21, c22, [csp,#4*__SIZEOF_POINTER__] stp c23, c24, [csp,#6*__SIZEOF_POINTER__] stp c25, c26, [csp,#8*__SIZEOF_POINTER__] stp c27, c28, [csp,#10*__SIZEOF_POINTER__] sub csp, csp, #$frame ldp @acc[0], @acc[1], [x0,#8*0] // load input ldp @acc[2], @acc[3], [x0,#8*2] ldp @acc[4], @acc[5], [x0,#8*4] add $in_ptr, sp, #255 // find closest 256-byte-aligned spot and $in_ptr, $in_ptr, #-256 // in the frame... 
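// The 256-byte alignment is what enables the cheap buffer flip-flop:
// the |a|b| pair lives in each 128-byte half of the scratch area, so
// "eor ..., #128" toggles between source and destination without any
// pointer arithmetic.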
#ifdef __CHERI_PURE_CAPABILITY__ scvalue $in_ptr, csp, $in_ptr #endif ldp @acc[6], @acc[7], [x1,#8*0] // load modulus ldp @acc[8], @acc[9], [x1,#8*2] ldp @acc[10], @acc[11], [x1,#8*4] stp @acc[0], @acc[1], [$in_ptr,#8*6] // copy input to |a| stp @acc[2], @acc[3], [$in_ptr,#8*8] stp @acc[4], @acc[5], [$in_ptr,#8*10] stp @acc[6], @acc[7], [$in_ptr,#8*0] // copy modulus to |b| stp @acc[8], @acc[9], [$in_ptr,#8*2] stp @acc[10], @acc[11], [$in_ptr,#8*4] eor $L, $L, $L // init the Legendre symbol mov $cnt, #24 // 24 is 768/30-1 b .Loop_is_square .align 4 .Loop_is_square: bl __ab_approximation_30 sub $cnt, $cnt, #1 eor $out_ptr, $in_ptr, #128 // pointer to dst |b| #ifdef __CHERI_PURE_CAPABILITY__ scvalue $out_ptr, csp, $out_ptr #endif bl __smul_384_n_shift_by_30 mov $f1, $f0 // |f0| mov $g1, $g0 // |g0| cadd $out_ptr, $out_ptr, #8*6 // pointer to dst |a| bl __smul_384_n_shift_by_30 ldp @acc[6], @acc[7], [$out_ptr,#-8*6] eor $in_ptr, $in_ptr, #128 // flip-flop src |a|b| #ifdef __CHERI_PURE_CAPABILITY__ scvalue $in_ptr, csp, $in_ptr #endif and @t[6], @t[6], @acc[6] // if |a| was negative, add $L, $L, @t[6], lsr#1 // adjust |L| cbnz $cnt, .Loop_is_square ////////////////////////////////////////// last iteration //bl __ab_approximation_30 // |a| and |b| are exact, //ldr $a_, [$in_ptr,#8*6] // and loaded //ldr $b_, [$in_ptr,#8*0] mov $cnt, #48 // 48 is 768%30 + 30 bl __inner_loop_48 ldr c30, [c29,#__SIZEOF_POINTER__] and x0, $L, #1 eor x0, x0, #1 add csp, csp, #$frame ldp c19, c20, [c29,#2*__SIZEOF_POINTER__] ldp c21, c22, [c29,#4*__SIZEOF_POINTER__] ldp c23, c24, [c29,#6*__SIZEOF_POINTER__] ldp c25, c26, [c29,#8*__SIZEOF_POINTER__] ldp c27, c28, [c29,#10*__SIZEOF_POINTER__] ldr c29, [csp],#16*__SIZEOF_POINTER__ autiasp ret .size ct_is_square_mod_384,.-ct_is_square_mod_384 .type __smul_384_n_shift_by_30, %function .align 5 __smul_384_n_shift_by_30: ___ for($j=0; $j<2; $j++) { my $fx = $g1; $fx = $f1 if ($j); my @acc = @acc; @acc = @acc[6..11] if ($j); my $k = 8*6*$j; $code.=<<___; ldp @acc[0], @acc[1], [$in_ptr,#8*0+$k] // load |b| (or |a|) asr @t[6], $fx, #63 // |g1|'s sign as mask (or |f1|'s) ldp @acc[2], @acc[3], [$in_ptr,#8*2+$k] eor $fx, $fx, @t[6] // conditionally negate |g1| (or |f1|) ldp @acc[4], @acc[5], [$in_ptr,#8*4+$k] eor @acc[0], @acc[0], @t[6] // conditionally negate |b| (or |a|) sub $fx, $fx, @t[6] eor @acc[1], @acc[1], @t[6] adds @acc[0], @acc[0], @t[6], lsr#63 eor @acc[2], @acc[2], @t[6] adcs @acc[1], @acc[1], xzr eor @acc[3], @acc[3], @t[6] adcs @acc[2], @acc[2], xzr eor @acc[4], @acc[4], @t[6] umulh @t[0], @acc[0], $fx adcs @acc[3], @acc[3], xzr umulh @t[1], @acc[1], $fx eor @acc[5], @acc[5], @t[6] umulh @t[2], @acc[2], $fx adcs @acc[4], @acc[4], xzr umulh @t[3], @acc[3], $fx adc @acc[5], @acc[5], xzr umulh @t[4], @acc[4], $fx and @t[7], $fx, @t[6] umulh @t[5+$j], @acc[5], $fx neg @t[7], @t[7] mul @acc[0], @acc[0], $fx mul @acc[1], @acc[1], $fx mul @acc[2], @acc[2], $fx adds @acc[1], @acc[1], @t[0] mul @acc[3], @acc[3], $fx adcs @acc[2], @acc[2], @t[1] mul @acc[4], @acc[4], $fx adcs @acc[3], @acc[3], @t[2] mul @acc[5], @acc[5], $fx adcs @acc[4], @acc[4], @t[3] adcs @acc[5], @acc[5] ,@t[4] adc @t[5+$j], @t[5+$j], @t[7] ___ } $code.=<<___; adds @acc[0], @acc[0], @acc[6] adcs @acc[1], @acc[1], @acc[7] adcs @acc[2], @acc[2], @acc[8] adcs @acc[3], @acc[3], @acc[9] adcs @acc[4], @acc[4], @acc[10] adcs @acc[5], @acc[5], @acc[11] adc @acc[6], @t[5], @t[6] extr @acc[0], @acc[1], @acc[0], #30 extr @acc[1], @acc[2], @acc[1], #30 extr @acc[2], @acc[3], @acc[2], #30 asr @t[6], @acc[6], #63 extr 
@acc[3], @acc[4], @acc[3], #30 extr @acc[4], @acc[5], @acc[4], #30 extr @acc[5], @acc[6], @acc[5], #30 eor @acc[0], @acc[0], @t[6] eor @acc[1], @acc[1], @t[6] adds @acc[0], @acc[0], @t[6], lsr#63 eor @acc[2], @acc[2], @t[6] adcs @acc[1], @acc[1], xzr eor @acc[3], @acc[3], @t[6] adcs @acc[2], @acc[2], xzr eor @acc[4], @acc[4], @t[6] adcs @acc[3], @acc[3], xzr eor @acc[5], @acc[5], @t[6] stp @acc[0], @acc[1], [$out_ptr,#8*0] adcs @acc[4], @acc[4], xzr stp @acc[2], @acc[3], [$out_ptr,#8*2] adc @acc[5], @acc[5], xzr stp @acc[4], @acc[5], [$out_ptr,#8*4] ret .size __smul_384_n_shift_by_30,.-__smul_384_n_shift_by_30 ___ { my @a = @acc[0..5]; my @b = @acc[6..11]; my ($fg0, $fg1, $bias, $cnt) = ($g0, $g1, @t[6], @t[7]); $code.=<<___; .type __ab_approximation_30, %function .align 4 __ab_approximation_30: ldp @b[4], @b[5], [$in_ptr,#8*4] // |a| is still in registers ldp @b[2], @b[3], [$in_ptr,#8*2] orr @t[0], @a[5], @b[5] // check top-most limbs, ... cmp @t[0], #0 csel @a[5], @a[5], @a[4], ne csel @b[5], @b[5], @b[4], ne csel @a[4], @a[4], @a[3], ne orr @t[0], @a[5], @b[5] // ... ones before top-most, ... csel @b[4], @b[4], @b[3], ne cmp @t[0], #0 csel @a[5], @a[5], @a[4], ne csel @b[5], @b[5], @b[4], ne csel @a[4], @a[4], @a[2], ne orr @t[0], @a[5], @b[5] // ... and ones before that ... csel @b[4], @b[4], @b[2], ne cmp @t[0], #0 csel @a[5], @a[5], @a[4], ne csel @b[5], @b[5], @b[4], ne csel @a[4], @a[4], @a[1], ne orr @t[0], @a[5], @b[5] // and one more, ... csel @b[4], @b[4], @b[1], ne cmp @t[0], #0 csel @a[5], @a[5], @a[4], ne csel @b[5], @b[5], @b[4], ne csel @a[4], @a[4], @a[0], ne orr @t[0], @a[5], @b[5] csel @b[4], @b[4], @b[0], ne clz @t[0], @t[0] cmp @t[0], #64 csel @t[0], @t[0], xzr, ne csel @a[5], @a[5], @a[4], ne csel @b[5], @b[5], @b[4], ne neg @t[1], @t[0] lslv @a[5], @a[5], @t[0] // align high limbs to the left lslv @b[5], @b[5], @t[0] lsrv @a[4], @a[4], @t[1] lsrv @b[4], @b[4], @t[1] and @a[4], @a[4], @t[1], asr#6 and @b[4], @b[4], @t[1], asr#6 orr $a_, @a[5], @a[4] orr $b_, @b[5], @b[4] bfxil $a_, @a[0], #0, #32 bfxil $b_, @b[0], #0, #32 b __inner_loop_30 ret .size __ab_approximation_30,.-__ab_approximation_30 .type __inner_loop_30, %function .align 4 __inner_loop_30: mov $cnt, #30 mov $fg0, #0x7FFFFFFF80000000 // |f0|=1, |g0|=0 mov $fg1, #0x800000007FFFFFFF // |f1|=0, |g1|=1 mov $bias,#0x7FFFFFFF7FFFFFFF .Loop_30: sbfx @t[3], $a_, #0, #1 // if |a_| is odd, then we'll be subtracting and @t[4], $a_, $b_ sub $cnt, $cnt, #1 and @t[0], $b_, @t[3] sub @t[1], $b_, $a_ // |b_|-|a_| subs @t[2], $a_, @t[0] // |a_|-|b_| (or |a_|-0 if |a_| was even) add @t[4], $L, @t[4], lsr#1 // L + (a_ & b_) >> 1 mov @t[0], $fg1 csel $b_, $b_, $a_, hs // |b_| = |a_| csel $a_, @t[2], @t[1], hs // borrow means |a_|<|b_|, replace with |b_|-|a_| csel $fg1, $fg1, $fg0, hs // exchange |fg0| and |fg1| csel $fg0, $fg0, @t[0], hs csel $L, $L, @t[4], hs lsr $a_, $a_, #1 and @t[0], $fg1, @t[3] and @t[1], $bias, @t[3] add $t[2], $b_, #2 sub $fg0, $fg0, @t[0] // |f0|-=|f1| (or |f0-=0| if |a_| was even) add $fg1, $fg1, $fg1 // |f1|<<=1 add $L, $L, $t[2], lsr#2 // "negate" |L| if |b|%8 is 3 or 5 add $fg0, $fg0, @t[1] sub $fg1, $fg1, $bias cbnz $cnt, .Loop_30 mov $bias, #0x7FFFFFFF ubfx $f0, $fg0, #0, #32 ubfx $g0, $fg0, #32, #32 ubfx $f1, $fg1, #0, #32 ubfx $g1, $fg1, #32, #32 sub $f0, $f0, $bias // remove the bias sub $g0, $g0, $bias sub $f1, $f1, $bias sub $g1, $g1, $bias ret .size __inner_loop_30,.-__inner_loop_30 ___ } { my ($a_, $b_) = (@acc[0], @acc[6]); $code.=<<___; .type __inner_loop_48, %function .align 4 
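// Final 768%30 + 30 = 48 iterations: |a| and |b| are exact single-limb
// values by now, so no __ab_approximation_30 pass and no |f|/|g|
// bookkeeping is needed; only the Legendre bit in |L| is accumulated.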
__inner_loop_48: .Loop_48: sbfx @t[3], $a_, #0, #1 // if |a_| is odd, then we'll be subtracting and @t[4], $a_, $b_ sub $cnt, $cnt, #1 and @t[0], $b_, @t[3] sub @t[1], $b_, $a_ // |b_|-|a_| subs @t[2], $a_, @t[0] // |a_|-|b_| (or |a_|-0 if |a_| was even) add @t[4], $L, @t[4], lsr#1 csel $b_, $b_, $a_, hs // |b_| = |a_| csel $a_, @t[2], @t[1], hs // borrow means |a_|<|b_|, replace with |b_|-|a_| csel $L, $L, @t[4], hs add $t[2], $b_, #2 lsr $a_, $a_, #1 add $L, $L, $t[2], lsr#2 // "negate" |L| if |b|%8 is 3 or 5 cbnz $cnt, .Loop_48 ret .size __inner_loop_48,.-__inner_loop_48 ___ } print $code; close STDOUT; ================================================ FILE: src/asm/ct_is_square_mod_384-x86_64.pl ================================================ #!/usr/bin/env perl # # Copyright Supranational LLC # Licensed under the Apache License, Version 2.0, see LICENSE for details. # SPDX-License-Identifier: Apache-2.0 # # Both constant-time and fast quadratic residue test as suggested in # https://eprint.iacr.org/2020/972. Performance is >5x better than # modulus-specific Legendre symbol addition chain... # # bool ct_is_square_mod_384(const vec384 inp, const vec384 mod); # $python_ref.=<<'___'; def ct_is_square_mod_384(inp, mod): a = inp b = mod L = 0 # only least significant bit, adding 1 makes up for sign change k = 30 w = 32 mask = (1 << w) - 1 for i in range(0, 768 // k - 1): # __ab_approximation_30 n = max(a.bit_length(), b.bit_length()) if n < 64: a_, b_ = a, b else: a_ = (a & mask) | ((a >> (n-w)) << w) b_ = (b & mask) | ((b >> (n-w)) << w) # __inner_loop_30 f0, g0, f1, g1 = 1, 0, 0, 1 for j in range(0, k): if a_ & 1: if a_ < b_: a_, b_, f0, g0, f1, g1 = b_, a_, f1, g1, f0, g0 L += (a_ & b_) >> 1 # |a| and |b| are both odd, second bits # tell the whole story a_, f0, g0 = a_-b_, f0-f1, g0-g1 a_, f1, g1 = a_ >> 1, f1 << 1, g1 << 1 L += (b_ + 2) >> 2 # if |b|%8 is 3 or 5 [out of 1,3,5,7] # __smulq_384_n_shift_by_30 a, b = (a*f0 + b*g0) >> k, (a*f1 + b*g1) >> k if b < 0: b = -b if a < 0: a = -a L += (b % 4) >> 1 # |b| is always odd, the second bit # tells the whole story if True: for j in range(0, 768 % k + k): if a & 1: if a < b: a, b = b, a L += (a & b) >> 1 # |a| and |b| are both odd, second bits # tell the whole story a = a-b a = a >> 1 L += (b + 2) >> 2 # if |b|%8 is 3 or 5 [out of 1,3,5,7] return (L & 1) ^ 1 ___ $flavour = shift; $output = shift; if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or die "can't locate x86_64-xlate.pl"; open STDOUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"" or die "can't call $xlate: $!"; my ($out_ptr, $in_ptr) = ("%rdi", "%rsi"); my ($f0, $g0, $f1, $g1) = ("%rax", "%rbx", "%rdx","%rcx"); my @acc=map("%r$_",(8..15)); my $L = "%rbp"; $frame = 8*3+2*256; $code.=<<___; .text .globl ct_is_square_mod_384 .hidden ct_is_square_mod_384 .type ct_is_square_mod_384,\@function,2,"unwind" .align 32 ct_is_square_mod_384: .cfi_startproc push %rbp .cfi_push %rbp push %rbx .cfi_push %rbx push %r12 .cfi_push %r12 push %r13 .cfi_push %r13 push %r14 .cfi_push %r14 push %r15 .cfi_push %r15 sub \$$frame, %rsp .cfi_adjust_cfa_offset $frame .cfi_end_prologue lea 8*3+255(%rsp), %rax # find closest 256-byte-aligned spot and \$-256, %rax # in the frame... 
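# As in the armv8 version, the 256-byte alignment turns the source and
# destination |a|b| areas into xor-complements of each other; the main
# loop below toggles them with "xor \$128, $in_ptr" instead of
# recomputing pointers.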
#ifdef __SGX_LVI_HARDENING__ lfence #endif mov 8*0(%rdi), @acc[0] # load input mov 8*1(%rdi), @acc[1] mov 8*2(%rdi), @acc[2] mov 8*3(%rdi), @acc[3] mov 8*4(%rdi), @acc[4] mov 8*5(%rdi), @acc[5] mov 8*0(%rsi), @acc[6] # load modulus mov 8*1(%rsi), @acc[7] mov 8*2(%rsi), %rbx mov 8*3(%rsi), %rcx mov 8*4(%rsi), %rdx mov 8*5(%rsi), %rdi mov %rax, $in_ptr # pointer to source |a|b| mov @acc[0], 8*0(%rax) # copy input to |a| mov @acc[1], 8*1(%rax) mov @acc[2], 8*2(%rax) mov @acc[3], 8*3(%rax) mov @acc[4], 8*4(%rax) mov @acc[5], 8*5(%rax) mov @acc[6], 8*6(%rax) # copy modulus to |b| mov @acc[7], 8*7(%rax) mov %rbx, 8*8(%rax) mov %rcx, 8*9(%rax) mov %rdx, 8*10(%rax) mov %rdi, 8*11(%rax) xor $L, $L # initialize the Legendre symbol mov \$24, %ecx # 24 is 768/30-1 jmp .Loop_is_square .align 32 .Loop_is_square: mov %ecx, 8*2(%rsp) # offload loop counter call __ab_approximation_30 mov $f0, 8*0(%rsp) # offload |f0| and |g0| mov $g0, 8*1(%rsp) mov \$128+8*6, $out_ptr xor $in_ptr, $out_ptr # pointer to destination |b| call __smulq_384_n_shift_by_30 mov 8*0(%rsp), $f1 # pop |f0| and |g0| mov 8*1(%rsp), $g1 lea -8*6($out_ptr),$out_ptr # pointer to destination |a| call __smulq_384_n_shift_by_30 mov 8*2(%rsp), %ecx # re-load loop counter xor \$128, $in_ptr # flip-flop pointer to source |a|b| and 8*6($out_ptr), @acc[6] # if |a| was negative, adjust |L| shr \$1, @acc[6] add @acc[6], $L sub \$1, %ecx jnz .Loop_is_square ################################# last iteration #call __ab_approximation_30 # |a| and |b| are exact, just load #mov 8*0($in_ptr), @acc[0] # |a_| mov 8*6($in_ptr), @acc[1] # |b_| call __inner_loop_48 # 48 is 768%30+30 mov \$1, %rax and $L, %rax xor \$1, %rax # return value lea $frame(%rsp), %r8 # size optimization mov 8*0(%r8),%r15 .cfi_restore %r15 mov 8*1(%r8),%r14 .cfi_restore %r14 mov 8*2(%r8),%r13 .cfi_restore %r13 mov 8*3(%r8),%r12 .cfi_restore %r12 mov 8*4(%r8),%rbx .cfi_restore %rbx mov 8*5(%r8),%rbp .cfi_restore %rbp lea 8*6(%r8),%rsp .cfi_adjust_cfa_offset -$frame-8*6 .cfi_epilogue ret .cfi_endproc .size ct_is_square_mod_384,.-ct_is_square_mod_384 .type __smulq_384_n_shift_by_30,\@abi-omnipotent .align 32 __smulq_384_n_shift_by_30: ___ for($j=0; $j<2; $j++) { $code.=<<___; mov 8*0($in_ptr), @acc[0] # load |a| (or |b|) mov 8*1($in_ptr), @acc[1] mov 8*2($in_ptr), @acc[2] mov 8*3($in_ptr), @acc[3] mov 8*4($in_ptr), @acc[4] mov 8*5($in_ptr), @acc[5] mov %rdx, %rbx # |f1| (or |g1|) sar \$63, %rdx # |f1|'s sign as mask (or |g1|'s) xor %rax, %rax sub %rdx, %rax # |f1|'s sign as bit (or |g1|'s) xor %rdx, %rbx # conditionally negate |f1| (or |g1|) add %rax, %rbx xor %rdx, @acc[0] # conditionally negate |a| (or |b|) xor %rdx, @acc[1] xor %rdx, @acc[2] xor %rdx, @acc[3] xor %rdx, @acc[4] xor %rdx, @acc[5] add @acc[0], %rax adc \$0, @acc[1] adc \$0, @acc[2] adc \$0, @acc[3] adc \$0, @acc[4] adc \$0, @acc[5] mov %rdx, @acc[6+$j] and %rbx, @acc[6+$j] mulq %rbx # |a|*|f1| (or |b|*|g1|) mov %rax, @acc[0] mov @acc[1], %rax mov %rdx, @acc[1] ___ for($i=1; $i<5; $i++) { $code.=<<___; mulq %rbx add %rax, @acc[$i] mov @acc[$i+1], %rax adc \$0, %rdx mov %rdx, @acc[$i+1] ___ } $code.=<<___; neg @acc[6+$j] mulq %rbx add %rax, @acc[5] adc %rdx, @acc[6+$j] ___ $code.=<<___ if ($j==0); lea 8*6($in_ptr), $in_ptr # pointer to |b| mov $g1, %rdx mov @acc[0], 8*0($out_ptr) mov @acc[1], 8*1($out_ptr) mov @acc[2], 8*2($out_ptr) mov @acc[3], 8*3($out_ptr) mov @acc[4], 8*4($out_ptr) mov @acc[5], 8*5($out_ptr) ___ } $code.=<<___; lea -8*6($in_ptr), $in_ptr # restore original in_ptr add 8*0($out_ptr), @acc[0] adc 
8*1($out_ptr), @acc[1] adc 8*2($out_ptr), @acc[2] adc 8*3($out_ptr), @acc[3] adc 8*4($out_ptr), @acc[4] adc 8*5($out_ptr), @acc[5] adc @acc[7], @acc[6] shrd \$30, @acc[1], @acc[0] shrd \$30, @acc[2], @acc[1] shrd \$30, @acc[3], @acc[2] shrd \$30, @acc[4], @acc[3] shrd \$30, @acc[5], @acc[4] shrd \$30, @acc[6], @acc[5] sar \$63, @acc[6] # sign as mask xor %rbx, %rbx sub @acc[6], %rbx # sign as bit xor @acc[6], @acc[0] # conditionally negate the result xor @acc[6], @acc[1] xor @acc[6], @acc[2] xor @acc[6], @acc[3] xor @acc[6], @acc[4] xor @acc[6], @acc[5] add %rbx, @acc[0] adc \$0, @acc[1] adc \$0, @acc[2] adc \$0, @acc[3] adc \$0, @acc[4] adc \$0, @acc[5] mov @acc[0], 8*0($out_ptr) mov @acc[1], 8*1($out_ptr) mov @acc[2], 8*2($out_ptr) mov @acc[3], 8*3($out_ptr) mov @acc[4], 8*4($out_ptr) mov @acc[5], 8*5($out_ptr) ret .size __smulq_384_n_shift_by_30,.-__smulq_384_n_shift_by_30 ___ { my ($a_, $b_) = @acc[0..1]; my ($t0, $t1, $t2, $t3, $t4, $t5) = map("%r$_",(10..15)); my ($fg0, $fg1, $bias) = ($g0, $g1, $t5); my $cnt = "%edi"; { my @a = @acc[0..5]; my @b = (@a[1..3], $t4, $t5, $g0); $code.=<<___; .type __ab_approximation_30,\@abi-omnipotent .align 32 __ab_approximation_30: mov 8*11($in_ptr), @b[5] # load |b| in reverse order mov 8*10($in_ptr), @b[4] mov 8*9($in_ptr), @b[3] mov @a[5], %rax or @b[5], %rax # check top-most limbs, ... cmovz @a[4], @a[5] cmovz @b[4], @b[5] cmovz @a[3], @a[4] mov 8*8($in_ptr), @b[2] cmovz @b[3], @b[4] mov @a[5], %rax or @b[5], %rax # ... ones before top-most, ... cmovz @a[4], @a[5] cmovz @b[4], @b[5] cmovz @a[2], @a[4] mov 8*7($in_ptr), @b[1] cmovz @b[2], @b[4] mov @a[5], %rax or @b[5], %rax # ... and ones before that ... cmovz @a[4], @a[5] cmovz @b[4], @b[5] cmovz @a[1], @a[4] mov 8*6($in_ptr), @b[0] cmovz @b[1], @b[4] mov @a[5], %rax or @b[5], %rax # ... and ones before that ... 
cmovz @a[4], @a[5] cmovz @b[4], @b[5] cmovz @a[0], @a[4] cmovz @b[0], @b[4] mov @a[5], %rax or @b[5], %rax bsr %rax, %rcx lea 1(%rcx), %rcx cmovz @a[0], @a[5] cmovz @b[0], @b[5] cmovz %rax, %rcx neg %rcx #and \$63, %rcx # debugging artefact shldq %cl, @a[4], @a[5] # align second limb to the left shldq %cl, @b[4], @b[5] mov \$0xFFFFFFFF00000000, %rax mov @a[0]d, ${a_}d mov @b[0]d, ${b_}d and %rax, @a[5] and %rax, @b[5] or @a[5], ${a_} or @b[5], ${b_} jmp __inner_loop_30 ret .size __ab_approximation_30,.-__ab_approximation_30 ___ } $code.=<<___; .type __inner_loop_30,\@abi-omnipotent .align 32 __inner_loop_30: ################# by Thomas Pornin mov \$0x7FFFFFFF80000000, $fg0 # |f0|=1, |g0|=0 mov \$0x800000007FFFFFFF, $fg1 # |f1|=0, |g1|=1 lea -1($fg0), $bias # 0x7FFFFFFF7FFFFFFF mov \$30, $cnt .Loop_30: mov $a_, %rax and $b_, %rax shr \$1, %rax # (a_ & b_) >> 1 cmp $b_, $a_ # if |a_|<|b_|, swap the variables mov $a_, $t0 mov $b_, $t1 lea (%rax,$L), %rax # pre-"negate" |L| mov $fg0, $t2 mov $fg1, $t3 mov $L, $t4 cmovb $b_, $a_ cmovb $t0, $b_ cmovb $fg1, $fg0 cmovb $t2, $fg1 cmovb %rax, $L sub $b_, $a_ # |a_|-|b_| sub $fg1, $fg0 # |f0|-|f1|, |g0|-|g1| add $bias, $fg0 test \$1, $t0 # if |a_| was even, roll back cmovz $t0, $a_ cmovz $t1, $b_ cmovz $t2, $fg0 cmovz $t3, $fg1 cmovz $t4, $L lea 2($b_), %rax shr \$1, $a_ # |a_|>>=1 shr \$2, %rax add $fg1, $fg1 # |f1|<<=1, |g1|<<=1 lea (%rax,$L), $L # "negate" |L| if |b|%8 is 3 or 5 sub $bias, $fg1 sub \$1, $cnt jnz .Loop_30 shr \$32, $bias mov %ebx, %eax # $fg0 -> $f0 shr \$32, $g0 mov %ecx, %edx # $fg1 -> $f1 shr \$32, $g1 sub $bias, $f0 # remove the bias sub $bias, $g0 sub $bias, $f1 sub $bias, $g1 ret # __SGX_LVI_HARDENING_CLOBBER__=$a_ .size __inner_loop_30,.-__inner_loop_30 .type __inner_loop_48,\@abi-omnipotent .align 32 __inner_loop_48: mov \$48, $cnt # 48 is 768%30+30 .Loop_48: mov $a_, %rax and $b_, %rax shr \$1, %rax # (a_ & b_) >> 1 cmp $b_, $a_ # if |a_|<|b_|, swap the variables mov $a_, $t0 mov $b_, $t1 lea (%rax,$L), %rax mov $L, $t2 cmovb $b_, $a_ cmovb $t0, $b_ cmovb %rax, $L sub $b_, $a_ # |a_|-|b_| test \$1, $t0 # if |a_| was even, roll back cmovz $t0, $a_ cmovz $t1, $b_ cmovz $t2, $L lea 2($b_), %rax shr \$1, $a_ # |a_|>>=1 shr \$2, %rax add %rax, $L # "negate" |L| if |b|%8 is 3 or 5 sub \$1, $cnt jnz .Loop_48 ret .size __inner_loop_48,.-__inner_loop_48 ___ } print $code; close STDOUT; ================================================ FILE: src/asm/ctq_inverse_mod_384-x86_64.pl ================================================ #!/usr/bin/env perl # # Copyright Supranational LLC # Licensed under the Apache License, Version 2.0, see LICENSE for details. # SPDX-License-Identifier: Apache-2.0 # # Both constant-time and fast Euclidean inversion as suggested in # https://eprint.iacr.org/2020/972. Performance is >5x better than # modulus-specific FLT addition chain... 
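#
# A hedged note on the contract (illustrative, not part of the build):
# with 12*62 + 24 = 768 divstep iterations in total, the Python
# reference below returns the inverse scaled by 2^768, i.e. for odd
# |mod| and gcd(inp, mod) == 1 one expects
#
#	inp * ct_inverse_mod_384(inp, mod) % mod == pow(2, 768, mod)
#
# which is why the result is said "to be reduced % mod".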
# # void ct_inverse_mod_384(vec768 ret, const vec384 inp, const vec384 mod); # $python_ref.=<<'___'; def ct_inverse_mod_384(inp, mod): a, u = inp, 1 b, v = mod, 0 k = 62 w = 64 mask = (1 << w) - 1 for i in range(0, 768 // k): # __ab_approximation_62 n = max(a.bit_length(), b.bit_length()) if n < 128: a_, b_ = a, b else: a_ = (a & mask) | ((a >> (n-w)) << w) b_ = (b & mask) | ((b >> (n-w)) << w) # __inner_loop_62 f0, g0, f1, g1 = 1, 0, 0, 1 for j in range(0, k): if a_ & 1: if a_ < b_: a_, b_, f0, g0, f1, g1 = b_, a_, f1, g1, f0, g0 a_, f0, g0 = a_-b_, f0-f1, g0-g1 a_, f1, g1 = a_ >> 1, f1 << 1, g1 << 1 # __smulq_384_n_shift_by_62 a, b = (a*f0 + b*g0) >> k, (a*f1 + b*g1) >> k if a < 0: a, f0, g0 = -a, -f0, -g0 if b < 0: b, f1, g1 = -b, -f1, -g1 # __smulq_768x63 u, v = u*f0 + v*g0, u*f1 + v*g1 if 768 % k: f0, g0, f1, g1 = 1, 0, 0, 1 for j in range(0, 768 % k): if a & 1: if a < b: a, b, f0, g0, f1, g1 = b, a, f1, g1, f0, g0 a, f0, g0 = a-b, f0-f1, g0-g1 a, f1, g1 = a >> 1, f1 << 1, g1 << 1 v = u*f1 + v*g1 mod <<= 768 - mod.bit_length() # align to the left if v < 0: v += mod if v < 0: v += mod elif v == 1<<768: v -= mod return v & (2**768 - 1) # to be reduced % mod ___ $flavour = shift; $output = shift; if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or die "can't locate x86_64-xlate.pl"; open STDOUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"" or die "can't call $xlate: $!"; $code.=<<___ if ($flavour =~ /masm/); .extern ct_inverse_mod_384\$1 ___ my ($out_ptr, $in_ptr, $n_ptr, $nx_ptr) = ("%rdi", "%rsi", "%rdx", "%rcx"); my @acc=(map("%r$_",(8..15)), "%rbx", "%rbp", $in_ptr, $out_ptr); my ($f0, $g0, $f1, $g1) = ("%rdx","%rcx","%r12","%r13"); my $cnt = "%edi"; $frame = 8*11+2*512; $code.=<<___; .comm __blst_platform_cap,4 .text .globl ct_inverse_mod_384 .hidden ct_inverse_mod_384 .type ct_inverse_mod_384,\@function,4,"unwind" .align 32 ct_inverse_mod_384: .cfi_startproc #ifdef __BLST_PORTABLE__ testl \$1, __blst_platform_cap(%rip) jnz ct_inverse_mod_384\$1 #endif push %rbp .cfi_push %rbp push %rbx .cfi_push %rbx push %r12 .cfi_push %r12 push %r13 .cfi_push %r13 push %r14 .cfi_push %r14 push %r15 .cfi_push %r15 sub \$$frame, %rsp .cfi_adjust_cfa_offset $frame .cfi_end_prologue lea 8*11+511(%rsp), %rax # find closest 512-byte-aligned spot and \$-512, %rax # in the frame... 
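# The 512-byte alignment mirrors the 256-byte trick in the is_square
# code, scaled up: each 256-byte half of the scratch area holds an
# |a|b|u|v| quartet, and the main loop toggles halves with
# "xor \$256, $in_ptr".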
mov $out_ptr, 8*4(%rsp) mov $nx_ptr, 8*5(%rsp) mov 8*0($in_ptr), @acc[0] # load input mov 8*1($in_ptr), @acc[1] mov 8*2($in_ptr), @acc[2] mov 8*3($in_ptr), @acc[3] mov 8*4($in_ptr), @acc[4] mov 8*5($in_ptr), @acc[5] mov 8*0($n_ptr), @acc[6] # load modulus mov 8*1($n_ptr), @acc[7] mov 8*2($n_ptr), @acc[8] mov 8*3($n_ptr), @acc[9] mov 8*4($n_ptr), @acc[10] mov 8*5($n_ptr), @acc[11] mov @acc[0], 8*0(%rax) # copy input to |a| mov @acc[1], 8*1(%rax) mov @acc[2], 8*2(%rax) mov @acc[3], 8*3(%rax) mov @acc[4], 8*4(%rax) mov @acc[5], 8*5(%rax) mov @acc[6], 8*6(%rax) # copy modulus to |b| mov @acc[7], 8*7(%rax) mov @acc[8], 8*8(%rax) mov @acc[9], 8*9(%rax) mov @acc[10], 8*10(%rax) mov %rax, $in_ptr # pointer to source |a|b|1|0| mov @acc[11], 8*11(%rax) ################################# first iteration mov \$62, $cnt call __ab_approximation_62 #mov $f0, 8*7(%rsp) #mov $g0, 8*8(%rsp) mov $f1, 8*9(%rsp) mov $g1, 8*10(%rsp) mov \$256, $out_ptr xor $in_ptr, $out_ptr # pointer to destination |a|b|u|v| call __smulq_384_n_shift_by_62 #mov $f0, 8*7(%rsp) # corrected |f0| #mov $g0, 8*8(%rsp) # corrected |g0| mov $f0, 8*12($out_ptr) # initialize |u| with |f0| mov 8*9(%rsp), $f0 # |f1| mov 8*10(%rsp), $g0 # |g1| lea 8*6($out_ptr), $out_ptr # pointer to destination |b| call __smulq_384_n_shift_by_62 #mov $f0, 8*9(%rsp) # corrected |f1| #mov $g0, 8*10(%rsp) # corrected |g1| mov $f0, 8*13($out_ptr) # initialize |v| with |f1| ################################# second iteration xor \$256, $in_ptr # flip-flop pointer to source |a|b|u|v| mov \$62, $cnt call __ab_approximation_62 #mov $f0, 8*7(%rsp) #mov $g0, 8*8(%rsp) mov $f1, 8*9(%rsp) mov $g1, 8*10(%rsp) mov \$256, $out_ptr xor $in_ptr, $out_ptr # pointer to destination |a|b|u|v| call __smulq_384_n_shift_by_62 mov $f0, 8*7(%rsp) # corrected |f0| mov $g0, 8*8(%rsp) # corrected |g0| mov 8*9(%rsp), $f0 # |f1| mov 8*10(%rsp), $g0 # |g1| lea 8*6($out_ptr), $out_ptr # pointer to destination |b| call __smulq_384_n_shift_by_62 #mov $f0, 8*9(%rsp) # corrected |f1| #mov $g0, 8*10(%rsp) # corrected |g1| mov 8*12($in_ptr), %rax # |u| mov 8*19($in_ptr), @acc[3] # |v| mov $f0, %rbx mov %rax, @acc[2] imulq 8*7(%rsp) # |u|*|f0| mov %rax, @acc[0] mov @acc[3], %rax mov %rdx, @acc[1] imulq 8*8(%rsp) # |v|*|g0| add %rax, @acc[0] adc %rdx, @acc[1] mov @acc[0], 8*6($out_ptr) # destination |u| mov @acc[1], 8*7($out_ptr) sar \$63, @acc[1] # sign extension mov @acc[1], 8*8($out_ptr) mov @acc[1], 8*9($out_ptr) mov @acc[1], 8*10($out_ptr) mov @acc[1], 8*11($out_ptr) mov @acc[1], 8*12($out_ptr) lea 8*12($in_ptr),$in_ptr # make in_ptr "rewindable" with xor mov @acc[2], %rax imulq %rbx # |u|*|f1| mov %rax, @acc[0] mov @acc[3], %rax mov %rdx, @acc[1] imulq %rcx # |v|*|g1| add %rax, @acc[0] adc %rdx, @acc[1] mov @acc[0], 8*13($out_ptr) # destination |v| mov @acc[1], 8*14($out_ptr) sar \$63, @acc[1] # sign extension mov @acc[1], 8*15($out_ptr) mov @acc[1], 8*16($out_ptr) mov @acc[1], 8*17($out_ptr) mov @acc[1], 8*18($out_ptr) mov @acc[1], 8*19($out_ptr) ___ for($i=2; $i<11; $i++) { my $smul_768x63 = $i>5 ? 
"__smulq_768x63" : "__smulq_384x63"; $code.=<<___; xor \$256+8*12, $in_ptr # flip-flop pointer to source |a|b|u|v| mov \$62, $cnt call __ab_approximation_62 #mov $f0, 8*7(%rsp) #mov $g0, 8*8(%rsp) mov $f1, 8*9(%rsp) mov $g1, 8*10(%rsp) mov \$256, $out_ptr xor $in_ptr, $out_ptr # pointer to destination |a|b|u|v| call __smulq_384_n_shift_by_62 mov $f0, 8*7(%rsp) # corrected |f0| mov $g0, 8*8(%rsp) # corrected |g0| mov 8*9(%rsp), $f0 # |f1| mov 8*10(%rsp), $g0 # |g1| lea 8*6($out_ptr), $out_ptr # pointer to destination |b| call __smulq_384_n_shift_by_62 mov $f0, 8*9(%rsp) # corrected |f1| mov $g0, 8*10(%rsp) # corrected |g1| mov 8*7(%rsp), $f0 # |f0| mov 8*8(%rsp), $g0 # |g0| lea 8*12($in_ptr), $in_ptr # pointer to source |u|v| lea 8*6($out_ptr), $out_ptr # pointer to destination |u| call __smulq_384x63 mov 8*9(%rsp), $f0 # |f1| mov 8*10(%rsp), $g0 # |g1| lea 8*7($out_ptr),$out_ptr # pointer to destination |v| call $smul_768x63 ___ $code.=<<___ if ($i==5); mov @acc[6], 8*7($out_ptr) # sign extension mov @acc[6], 8*8($out_ptr) mov @acc[6], 8*9($out_ptr) mov @acc[6], 8*10($out_ptr) mov @acc[6], 8*11($out_ptr) ___ } $code.=<<___; ################################# iteration before last xor \$256+8*12, $in_ptr # flip-flop pointer to source |a|b|u|v| mov \$62, $cnt #call __ab_approximation_62 # |a| and |b| are exact, just load mov 8*0($in_ptr), @acc[0] # |a_lo| mov 8*1($in_ptr), @acc[1] # |a_hi| mov 8*6($in_ptr), @acc[2] # |b_lo| mov 8*7($in_ptr), @acc[3] # |b_hi| call __inner_loop_62 #mov $f0, 8*7(%rsp) #mov $g0, 8*8(%rsp) mov $f1, 8*9(%rsp) mov $g1, 8*10(%rsp) mov \$256, $out_ptr xor $in_ptr, $out_ptr # pointer to destination |a|b|u|v| mov @acc[0], 8*0($out_ptr) mov @acc[2], 8*6($out_ptr) #mov 8*7(%rsp), $f0 # |f0| #mov 8*8(%rsp), $g0 # |g0| lea 8*12($in_ptr), $in_ptr # pointer to source |u|v| lea 8*12($out_ptr),$out_ptr # pointer to destination |u| call __smulq_384x63 mov 8*9(%rsp), $f0 # |f1| mov 8*10(%rsp), $g0 # |g1| lea 8*7($out_ptr),$out_ptr # pointer to destination |v| call __smulq_768x63 ################################# last iteration xor \$256+8*12, $in_ptr # flip-flop pointer to source |a|b|u|v| mov \$24, $cnt # 768 % 62 #call __ab_approximation_62 # |a| and |b| are exact, just load mov 8*0($in_ptr), @acc[0] # |a_lo| xor @acc[1], @acc[1] # |a_hi| mov 8*6($in_ptr), @acc[2] # |b_lo| xor @acc[3], @acc[3] # |b_hi| call __inner_loop_62 #mov $f0, 8*7(%rsp) #mov $g0, 8*8(%rsp) #mov $f1, 8*9(%rsp) #mov $g1, 8*10(%rsp) #mov 8*7(%rsp), $f0 # |f0| #mov 8*8(%rsp), $g0 # |g0| lea 8*12($in_ptr), $in_ptr # pointer to source |u|v| #lea 8*6($out_ptr), $out_ptr # pointer to destination |u| #call __smulq_384x63 #mov 8*9(%rsp), $f0 # |f1| #mov 8*10(%rsp), $g0 # |g1| mov $f1, $f0 mov $g1, $g0 mov 8*4(%rsp), $out_ptr # original out_ptr call __smulq_768x63 mov 8*5(%rsp), $in_ptr # original n_ptr mov %rdx, @acc[5] # the excess limb, -1, 0, or 1 sar \$63, @acc[5] # result's sign as mask mov @acc[5], @acc[0] # mask |modulus| mov @acc[5], @acc[1] mov @acc[5], @acc[2] and 8*0($in_ptr), @acc[0] and 8*1($in_ptr), @acc[1] mov @acc[5], @acc[3] and 8*2($in_ptr), @acc[2] and 8*3($in_ptr), @acc[3] mov @acc[5], @acc[4] and 8*4($in_ptr), @acc[4] and 8*5($in_ptr), @acc[5] add @acc[0], @acc[6] # conditionally add |modulus|<<384 adc @acc[1], @acc[7] adc @acc[2], @acc[8] adc @acc[3], @acc[9] adc @acc[4], %rcx adc @acc[5], %rax adc \$0, %rdx mov %rdx, @acc[5] neg %rdx or %rdx, @acc[5] # excess bit or sign as mask sar \$63, %rdx # excess bit as mask mov @acc[5], @acc[0] # mask |modulus| mov @acc[5], @acc[1] mov @acc[5], @acc[2] 
and 8*0($in_ptr), @acc[0] and 8*1($in_ptr), @acc[1] mov @acc[5], @acc[3] and 8*2($in_ptr), @acc[2] and 8*3($in_ptr), @acc[3] mov @acc[5], @acc[4] and 8*4($in_ptr), @acc[4] and 8*5($in_ptr), @acc[5] xor %rdx, @acc[0] # conditionally negate |modulus| xor $in_ptr, $in_ptr xor %rdx, @acc[1] sub %rdx, $in_ptr xor %rdx, @acc[2] xor %rdx, @acc[3] xor %rdx, @acc[4] xor %rdx, @acc[5] add $in_ptr, @acc[0] adc \$0, @acc[1] adc \$0, @acc[2] adc \$0, @acc[3] adc \$0, @acc[4] adc \$0, @acc[5] add @acc[0], @acc[6] # final adjustment for |modulus|<<384 adc @acc[1], @acc[7] adc @acc[2], @acc[8] adc @acc[3], @acc[9] adc @acc[4], %rcx adc @acc[5], %rax mov @acc[6], 8*6($out_ptr) # store absolute value mov @acc[7], 8*7($out_ptr) mov @acc[8], 8*8($out_ptr) mov @acc[9], 8*9($out_ptr) mov %rcx, 8*10($out_ptr) mov %rax, 8*11($out_ptr) lea $frame(%rsp), %r8 # size optimization mov 8*0(%r8),%r15 .cfi_restore %r15 mov 8*1(%r8),%r14 .cfi_restore %r14 mov 8*2(%r8),%r13 .cfi_restore %r13 mov 8*3(%r8),%r12 .cfi_restore %r12 mov 8*4(%r8),%rbx .cfi_restore %rbx mov 8*5(%r8),%rbp .cfi_restore %rbp lea 8*6(%r8),%rsp .cfi_adjust_cfa_offset -$frame-8*6 .cfi_epilogue ret .cfi_endproc .size ct_inverse_mod_384,.-ct_inverse_mod_384 ___ ######################################################################## # see corresponding commentary in ctx_inverse_mod_384-x86_64... { my ($out_ptr, $in_ptr, $f0, $g0) = ("%rdi", "%rsi", "%rdx", "%rcx"); my @acc = map("%r$_",(8..15),"bx","bp","cx","di"); my $fx = @acc[9]; $code.=<<___; .type __smulq_768x63,\@abi-omnipotent .align 32 __smulq_768x63: mov 8*0($in_ptr), @acc[0] # load |u| mov 8*1($in_ptr), @acc[1] mov 8*2($in_ptr), @acc[2] mov 8*3($in_ptr), @acc[3] mov 8*4($in_ptr), @acc[4] mov 8*5($in_ptr), @acc[5] mov 8*6($in_ptr), @acc[6] # sign limb mov $f0, $fx sar \$63, $f0 # |f0|'s sign as mask xor %rax, %rax sub $f0, %rax # |f0|'s sign as bit mov $out_ptr, 8*1(%rsp) mov $in_ptr, 8*2(%rsp) lea 8*7($in_ptr), $in_ptr # pointer to |v| xor $f0, $fx # conditionally negate |f0| add %rax, $fx xor $f0, @acc[0] # conditionally negate |u| xor $f0, @acc[1] xor $f0, @acc[2] xor $f0, @acc[3] xor $f0, @acc[4] xor $f0, @acc[5] xor $f0, @acc[6] add @acc[0], %rax adc \$0, @acc[1] adc \$0, @acc[2] adc \$0, @acc[3] adc \$0, @acc[4] adc \$0, @acc[5] adc \$0, @acc[6] mulq $fx # |u|*|f0| mov %rax, 8*0($out_ptr) # offload |u|*|f0| mov @acc[1], %rax and $fx, @acc[6] neg @acc[6] mov %rdx, @acc[1] ___ for($i=1; $i<5; $i++) { $code.=<<___; mulq $fx add %rax, @acc[$i] mov @acc[$i+1], %rax adc \$0, %rdx mov %rdx, @acc[$i+1] mov @acc[$i], 8*$i($out_ptr) ___ } $code.=<<___; mulq $fx add %rax, @acc[$i] adc %rdx, @acc[6] mov @acc[5], 8*5($out_ptr) mov @acc[6], 8*6($out_ptr) sar \$63, @acc[6] # sign extension mov @acc[6], 8*7($out_ptr) ___ { my $fx=$in_ptr; $code.=<<___; mov $g0, $f0 # load |g0| mov 8*0($in_ptr), @acc[0] # load |v| mov 8*1($in_ptr), @acc[1] mov 8*2($in_ptr), @acc[2] mov 8*3($in_ptr), @acc[3] mov 8*4($in_ptr), @acc[4] mov 8*5($in_ptr), @acc[5] mov 8*6($in_ptr), @acc[6] mov 8*7($in_ptr), @acc[7] mov 8*8($in_ptr), @acc[8] mov 8*9($in_ptr), @acc[9] mov 8*10($in_ptr), @acc[10] mov 8*11($in_ptr), @acc[11] mov $f0, $fx # overrides in_ptr sar \$63, $f0 # |g0|'s sign as mask xor %rax, %rax sub $f0, %rax # |g0|'s sign as bit xor $f0, $fx # conditionally negate |g0| add %rax, $fx xor $f0, @acc[0] # conditionally negate |v| xor $f0, @acc[1] xor $f0, @acc[2] xor $f0, @acc[3] xor $f0, @acc[4] xor $f0, @acc[5] xor $f0, @acc[6] xor $f0, @acc[7] xor $f0, @acc[8] xor $f0, @acc[9] xor $f0, @acc[10] xor $f0, @acc[11] add 
@acc[0], %rax adc \$0, @acc[1] adc \$0, @acc[2] adc \$0, @acc[3] adc \$0, @acc[4] adc \$0, @acc[5] adc \$0, @acc[6] adc \$0, @acc[7] adc \$0, @acc[8] adc \$0, @acc[9] adc \$0, @acc[10] adc \$0, @acc[11] mulq $fx # |v|*|g0| mov %rax, @acc[0] mov @acc[1], %rax mov %rdx, @acc[1] ___ for($i=1; $i<11; $i++) { $code.=<<___; mulq $fx add %rax, @acc[$i] mov @acc[$i+1], %rax adc \$0, %rdx mov %rdx, @acc[$i+1] ___ } $code.=<<___; imulq $fx mov 8*1(%rsp), $in_ptr # borrow for out_ptr add @acc[11], %rax adc \$0, %rdx # used in the final step add 8*0($in_ptr), @acc[0] # accumulate |u|*|f0| adc 8*1($in_ptr), @acc[1] adc 8*2($in_ptr), @acc[2] adc 8*3($in_ptr), @acc[3] adc 8*4($in_ptr), @acc[4] adc 8*5($in_ptr), @acc[5] adc 8*6($in_ptr), @acc[6] mov 8*7($in_ptr), @acc[11] # sign extension adc @acc[11], @acc[7] adc @acc[11], @acc[8] adc @acc[11], @acc[9] adc @acc[11], @acc[10] adc @acc[11], %rax adc @acc[11], %rdx lea ($in_ptr), $out_ptr # restore original out_ptr mov 8*2(%rsp), $in_ptr # restore original in_ptr mov @acc[0], 8*0($out_ptr) mov @acc[1], 8*1($out_ptr) mov @acc[2], 8*2($out_ptr) mov @acc[3], 8*3($out_ptr) mov @acc[4], 8*4($out_ptr) mov @acc[5], 8*5($out_ptr) mov @acc[6], 8*6($out_ptr) mov @acc[7], 8*7($out_ptr) mov @acc[8], 8*8($out_ptr) mov @acc[9], 8*9($out_ptr) mov @acc[10], 8*10($out_ptr) mov %rax, 8*11($out_ptr) ret .size __smulq_768x63,.-__smulq_768x63 ___ } $code.=<<___; .type __smulq_384x63,\@abi-omnipotent .align 32 __smulq_384x63: ___ for($j=0; $j<2; $j++) { $code.=<<___; mov 8*0($in_ptr), @acc[0] # load |u| (or |v|) mov 8*1($in_ptr), @acc[1] mov 8*2($in_ptr), @acc[2] mov 8*3($in_ptr), @acc[3] mov 8*4($in_ptr), @acc[4] mov 8*5($in_ptr), @acc[5] mov 8*6($in_ptr), @acc[6] # sign/excess limb mov %rdx, $fx sar \$63, %rdx # |f0|'s sign as mask (or |g0|'s) xor %rax, %rax sub %rdx, %rax # |f0|'s sign as bit (or |g0|'s) xor %rdx, $fx # conditionally negate |f0| add %rax, $fx xor %rdx, @acc[0] # conditionally negate |u| (or |v|) xor %rdx, @acc[1] xor %rdx, @acc[2] xor %rdx, @acc[3] xor %rdx, @acc[4] xor %rdx, @acc[5] xor %rdx, @acc[6] add @acc[0], %rax adc \$0, @acc[1] adc \$0, @acc[2] adc \$0, @acc[3] adc \$0, @acc[4] adc \$0, @acc[5] adc \$0, @acc[6] mulq $fx # |u|*|f0| (or |v|*|g0|) mov %rax, @acc[0] mov @acc[1], %rax and $fx, @acc[6] neg @acc[6] mov %rdx, @acc[1] ___ for($i=1; $i<5; $i++) { $code.=<<___; mulq $fx add %rax, @acc[$i] mov @acc[$i+1], %rax adc \$0, %rdx mov %rdx, @acc[$i+1] ___ } $code.=<<___ if ($j==0); mulq $fx add %rax, @acc[5] adc %rdx, @acc[6] lea 8*7($in_ptr), $in_ptr # pointer to |v| mov $g0, %rdx mov @acc[0], 8*0($out_ptr) # offload |u|*|f0| mov @acc[1], 8*1($out_ptr) mov @acc[2], 8*2($out_ptr) mov @acc[3], 8*3($out_ptr) mov @acc[4], 8*4($out_ptr) mov @acc[5], @acc[7] mov @acc[6], @acc[8] ___ } $code.=<<___; mulq $fx add %rax, @acc[5] adc %rdx, @acc[6] lea -8*7($in_ptr), $in_ptr # restore original in_ptr add 8*0($out_ptr), @acc[0] # accumulate |u|*|f0| adc 8*1($out_ptr), @acc[1] adc 8*2($out_ptr), @acc[2] adc 8*3($out_ptr), @acc[3] adc 8*4($out_ptr), @acc[4] adc @acc[7], @acc[5] adc @acc[8], @acc[6] mov @acc[0], 8*0($out_ptr) mov @acc[1], 8*1($out_ptr) mov @acc[2], 8*2($out_ptr) mov @acc[3], 8*3($out_ptr) mov @acc[4], 8*4($out_ptr) mov @acc[5], 8*5($out_ptr) mov @acc[6], 8*6($out_ptr) ret .size __smulq_384x63,.-__smulq_384x63 ___ { $code.=<<___; .type __smulq_384_n_shift_by_62,\@abi-omnipotent .align 32 __smulq_384_n_shift_by_62: mov $f0, @acc[8] ___ my $f0 = @acc[8]; for($j=0; $j<2; $j++) { my $top = $j==0 ? 
@acc[6] : @acc[7]; $code.=<<___; mov 8*0($in_ptr), @acc[0] # load |a| (or |b|) mov 8*1($in_ptr), @acc[1] mov 8*2($in_ptr), @acc[2] mov 8*3($in_ptr), @acc[3] mov 8*4($in_ptr), @acc[4] mov 8*5($in_ptr), @acc[5] mov %rdx, $fx sar \$63, %rdx # |f0|'s sign as mask (or |g0|'s) xor %rax, %rax sub %rdx, %rax # |f0|'s sign as bit (or |g0|'s) xor %rdx, $fx # conditionally negate |f0| (or |g0|) add %rax, $fx xor %rdx, @acc[0] # conditionally negate |a| (or |b|) xor %rdx, @acc[1] xor %rdx, @acc[2] xor %rdx, @acc[3] xor %rdx, @acc[4] xor %rdx, @acc[5] mov %rdx, $top add @acc[0], %rax adc \$0, @acc[1] adc \$0, @acc[2] adc \$0, @acc[3] adc \$0, @acc[4] adc \$0, @acc[5] mulq $fx # |a|*|f0| (or |b|*|g0|) mov %rax, @acc[0] mov @acc[1], %rax and $fx, $top neg $top mov %rdx, @acc[1] ___ for($i=1; $i<5; $i++) { $code.=<<___; mulq $fx add %rax, @acc[$i] mov @acc[$i+1], %rax adc \$0, %rdx mov %rdx, @acc[$i+1] ___ } $code.=<<___ if ($j==0); mulq $fx add %rax, @acc[$i] adc %rdx, $top lea 8*6($in_ptr), $in_ptr # pointer to |b| mov $g0, %rdx mov @acc[0], 8*0($out_ptr) mov @acc[1], 8*1($out_ptr) mov @acc[2], 8*2($out_ptr) mov @acc[3], 8*3($out_ptr) mov @acc[4], 8*4($out_ptr) mov @acc[5], 8*5($out_ptr) ___ } $code.=<<___; mulq $fx add %rax, @acc[$i] adc %rdx, @acc[7] lea -8*6($in_ptr), $in_ptr # restore original in_ptr mov $f0, %rdx add 8*0($out_ptr), @acc[0] adc 8*1($out_ptr), @acc[1] adc 8*2($out_ptr), @acc[2] adc 8*3($out_ptr), @acc[3] adc 8*4($out_ptr), @acc[4] adc 8*5($out_ptr), @acc[5] adc @acc[7], @acc[6] shrd \$62, @acc[1], @acc[0] shrd \$62, @acc[2], @acc[1] shrd \$62, @acc[3], @acc[2] shrd \$62, @acc[4], @acc[3] shrd \$62, @acc[5], @acc[4] shrd \$62, @acc[6], @acc[5] sar \$63, @acc[6] # sign as mask xor $fx, $fx sub @acc[6], $fx # sign as bit xor @acc[6], @acc[0] # conditionally negate the result xor @acc[6], @acc[1] xor @acc[6], @acc[2] xor @acc[6], @acc[3] xor @acc[6], @acc[4] xor @acc[6], @acc[5] add $fx, @acc[0] adc \$0, @acc[1] adc \$0, @acc[2] adc \$0, @acc[3] adc \$0, @acc[4] adc \$0, @acc[5] mov @acc[0], 8*0($out_ptr) mov @acc[1], 8*1($out_ptr) mov @acc[2], 8*2($out_ptr) mov @acc[3], 8*3($out_ptr) mov @acc[4], 8*4($out_ptr) mov @acc[5], 8*5($out_ptr) xor @acc[6], %rdx # conditionally negate |f0| xor @acc[6], $g0 # conditionally negate |g0| add $fx, %rdx add $fx, $g0 ret # __SGX_LVI_HARDENING_CLOBBER__=@acc[0] .size __smulq_384_n_shift_by_62,.-__smulq_384_n_shift_by_62 ___ } } { my ($a_lo, $a_hi, $b_lo, $b_hi) = map("%r$_",(8..11)); my ($t0, $t1, $t2, $t3, $t4, $t5) = ("%rax","%rbx","%rbp","%r14","%r15","%rsi"); { my @a = ($a_lo, $t1, $a_hi); my @b = ($b_lo, $t2, $b_hi); $code.=<<___; .type __ab_approximation_62,\@abi-omnipotent .align 32 __ab_approximation_62: mov 8*5($in_ptr), @a[2] # load |a| in reverse order mov 8*11($in_ptr), @b[2] # load |b| in reverse order mov 8*4($in_ptr), @a[1] mov 8*10($in_ptr), @b[1] mov 8*3($in_ptr), @a[0] mov 8*9($in_ptr), @b[0] mov @a[2], $t0 or @b[2], $t0 # check top-most limbs, ... cmovz @a[1], @a[2] cmovz @b[1], @b[2] cmovz @a[0], @a[1] cmovz @b[0], @b[1] mov 8*2($in_ptr), @a[0] mov 8*8($in_ptr), @b[0] mov @a[2], $t0 or @b[2], $t0 # ... ones before top-most, ... cmovz @a[1], @a[2] cmovz @b[1], @b[2] cmovz @a[0], @a[1] cmovz @b[0], @b[1] mov 8*1($in_ptr), @a[0] mov 8*7($in_ptr), @b[0] mov @a[2], $t0 or @b[2], $t0 # ... and ones before that ... 
cmovz @a[1], @a[2] cmovz @b[1], @b[2] cmovz @a[0], @a[1] cmovz @b[0], @b[1] mov 8*0($in_ptr), @a[0] mov 8*6($in_ptr), @b[0] mov @a[2], $t0 or @b[2], $t0 bsr $t0, %rcx lea 1(%rcx), %rcx cmovz @a[1], @a[2] cmovz @b[1], @b[2] cmovz $t0, %rcx neg %rcx #and \$63, %rcx # debugging artefact shldq %cl, @a[1], @a[2] # align second limb to the left shldq %cl, @b[1], @b[2] jmp __inner_loop_62 ret .size __ab_approximation_62,.-__ab_approximation_62 ___ } $code.=<<___; .type __inner_loop_62,\@abi-omnipotent .align 8 .long 0 __inner_loop_62: mov \$1, $f0 # |f0|=1 xor $g0, $g0 # |g0|=0 xor $f1, $f1 # |f1|=0 mov \$1, $g1 # |g1|=1 mov $in_ptr, 8(%rsp) .Loop_62: xor $t0, $t0 xor $t1, $t1 test \$1, $a_lo # if |a_| is odd, then we'll be subtracting |b_| mov $b_lo, $t2 mov $b_hi, $t3 cmovnz $b_lo, $t0 cmovnz $b_hi, $t1 sub $a_lo, $t2 # |b_|-|a_| sbb $a_hi, $t3 mov $a_lo, $t4 mov $a_hi, $t5 sub $t0, $a_lo # |a_|-|b_| (or |a_|-0 if |a_| was even) sbb $t1, $a_hi cmovc $t2, $a_lo # borrow means |a_|<|b_|, replace with |b_|-|a_| cmovc $t3, $a_hi cmovc $t4, $b_lo # |b_| = |a_| cmovc $t5, $b_hi mov $f0, $t0 # exchange |f0| and |f1| cmovc $f1, $f0 cmovc $t0, $f1 mov $g0, $t1 # exchange |g0| and |g1| cmovc $g1, $g0 cmovc $t1, $g1 xor $t0, $t0 xor $t1, $t1 shrd \$1, $a_hi, $a_lo shr \$1, $a_hi test \$1, $t4 # if |a_| was odd, then we'll be subtracting... cmovnz $f1, $t0 cmovnz $g1, $t1 add $f1, $f1 # |f1|<<=1 add $g1, $g1 # |g1|<<=1 sub $t0, $f0 # |f0|-=|f1| (or |f0-=0| if |a_| was even) sub $t1, $g0 # |g0|-=|g1| (or |g0-=0| ...) sub \$1, $cnt jnz .Loop_62 mov 8(%rsp), $in_ptr ret # __SGX_LVI_HARDENING_CLOBBER__=$t0 .size __inner_loop_62,.-__inner_loop_62 ___ } print $code; close STDOUT; ================================================ FILE: src/asm/ctx_inverse_mod_384-x86_64.pl ================================================ #!/usr/bin/env perl # # Copyright Supranational LLC # Licensed under the Apache License, Version 2.0, see LICENSE for details. # SPDX-License-Identifier: Apache-2.0 # # Both constant-time and fast Euclidean inversion as suggested in # https://eprint.iacr.org/2020/972. Performance is >4x better than # modulus-specific FLT addition chain... 
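#
# This is the mulx-based flavour of ct_inverse_mod_384 (reached via the
# __blst_platform_cap dispatch in ctq_inverse_mod_384-x86_64.pl). It
# iterates in 31-bit chunks: 23 full iterations plus a fused 55-step
# tail (the last 31-bit iteration and the 768%31 = 24 remainder in one
# go) cover the same 768 divsteps. In the reference below the
# approximation keeps the bottom 31 and the top 33 bits of each operand:
#
#	a_ = (a & mask) | ((a >> (n-k-2)) << k)    # mask = 2**31 - 1
#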
# # void ct_inverse_mod_384(vec768 ret, const vec384 inp, const vec384 mod); # $python_ref.=<<'___'; def ct_inverse_mod_384(inp, mod): a, u = inp, 1 b, v = mod, 0 k = 31 mask = (1 << k) - 1 for i in range(0, 768 // k): # __ab_approximation_31 n = max(a.bit_length(), b.bit_length()) if n < 64: a_, b_ = a, b else: a_ = (a & mask) | ((a >> (n-k-2)) << k) b_ = (b & mask) | ((b >> (n-k-2)) << k) # __inner_loop_31 f0, g0, f1, g1 = 1, 0, 0, 1 for j in range(0, k): if a_ & 1: if a_ < b_: a_, b_, f0, g0, f1, g1 = b_, a_, f1, g1, f0, g0 a_, f0, g0 = a_-b_, f0-f1, g0-g1 a_, f1, g1 = a_ >> 1, f1 << 1, g1 << 1 # __smulx_384_n_shift_by_31 a, b = (a*f0 + b*g0) >> k, (a*f1 + b*g1) >> k if a < 0: a, f0, g0 = -a, -f0, -g0 if b < 0: b, f1, g1 = -b, -f1, -g1 # __smulx_768x63 u, v = u*f0 + v*g0, u*f1 + v*g1 if 768 % k: f0, g0, f1, g1 = 1, 0, 0, 1 for j in range(0, 768 % k): if a & 1: if a < b: a, b, f0, g0, f1, g1 = b, a, f1, g1, f0, g0 a, f0, g0 = a-b, f0-f1, g0-g1 a, f1, g1 = a >> 1, f1 << 1, g1 << 1 v = u*f1 + v*g1 mod <<= 768 - mod.bit_length() # align to the left if v < 0: v += mod if v < 0: v += mod elif v == 1<<768: v -= mod return v & (2**768 - 1) # to be reduced % mod ___ $flavour = shift; $output = shift; if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or die "can't locate x86_64-xlate.pl"; open STDOUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"" or die "can't call $xlate: $!"; $code.=<<___ if ($flavour =~ /masm/); .globl ct_inverse_mod_384\$1 ___ my ($out_ptr, $in_ptr, $n_ptr, $nx_ptr) = ("%rdi", "%rsi", "%rdx", "%rcx"); my @acc=(map("%r$_",(8..15)), "%rbx", "%rbp", $in_ptr, $out_ptr); my ($f0, $g0, $f1, $g1) = ("%rdx","%rcx","%r12","%r13"); my $cnt = "%edi"; $frame = 8*11+2*512; $code.=<<___; .text .globl ctx_inverse_mod_384 .hidden ctx_inverse_mod_384 .type ctx_inverse_mod_384,\@function,4,"unwind" .align 32 ctx_inverse_mod_384: .cfi_startproc ct_inverse_mod_384\$1: push %rbp .cfi_push %rbp push %rbx .cfi_push %rbx push %r12 .cfi_push %r12 push %r13 .cfi_push %r13 push %r14 .cfi_push %r14 push %r15 .cfi_push %r15 sub \$$frame, %rsp .cfi_adjust_cfa_offset $frame .cfi_end_prologue lea 8*11+511(%rsp), %rax # find closest 512-byte-aligned spot and \$-512, %rax # in the frame... 
mov $out_ptr, 8*4(%rsp) mov $nx_ptr, 8*5(%rsp) #ifdef __SGX_LVI_HARDENING__ lfence #endif mov 8*0($in_ptr), @acc[0] # load input mov 8*1($in_ptr), @acc[1] mov 8*2($in_ptr), @acc[2] mov 8*3($in_ptr), @acc[3] mov 8*4($in_ptr), @acc[4] mov 8*5($in_ptr), @acc[5] mov 8*0($n_ptr), @acc[6] # load modulus mov 8*1($n_ptr), @acc[7] mov 8*2($n_ptr), @acc[8] mov 8*3($n_ptr), @acc[9] mov 8*4($n_ptr), @acc[10] mov 8*5($n_ptr), @acc[11] mov @acc[0], 8*0(%rax) # copy input to |a| mov @acc[1], 8*1(%rax) mov @acc[2], 8*2(%rax) mov @acc[3], 8*3(%rax) mov @acc[4], 8*4(%rax) mov @acc[5], 8*5(%rax) mov @acc[6], 8*6(%rax) # copy modulus to |b| mov @acc[7], 8*7(%rax) mov @acc[8], 8*8(%rax) mov @acc[9], 8*9(%rax) mov @acc[10], 8*10(%rax) mov %rax, $in_ptr mov @acc[11], 8*11(%rax) ################################# first iteration mov \$31, $cnt call __ab_approximation_31 #mov $f0, 8*7(%rsp) #mov $g0, 8*8(%rsp) mov $f1, 8*9(%rsp) mov $g1, 8*10(%rsp) mov \$256, $out_ptr xor $in_ptr, $out_ptr # pointer to destination |a|b|u|v| call __smulx_384_n_shift_by_31 #mov $f0, 8*7(%rsp) # corrected |f0| #mov $g0, 8*8(%rsp) # corrected |g0| mov $f0, 8*12($out_ptr) # initialize |u| with |f0| mov 8*9(%rsp), $f0 # |f1| mov 8*10(%rsp), $g0 # |g1| lea 8*6($out_ptr), $out_ptr # pointer to destination |b| call __smulx_384_n_shift_by_31 #mov $f0, 8*9(%rsp) # corrected |f1| #mov $g0, 8*10(%rsp) # corrected |g1| mov $f0, 8*13($out_ptr) # initialize |v| with |f1| ################################# second iteration xor \$256, $in_ptr # flip-flop pointer to source |a|b|u|v| mov \$31, $cnt call __ab_approximation_31 #mov $f0, 8*7(%rsp) #mov $g0, 8*8(%rsp) mov $f1, 8*9(%rsp) mov $g1, 8*10(%rsp) mov \$256, $out_ptr xor $in_ptr, $out_ptr # pointer to destination |a|b|u|v| call __smulx_384_n_shift_by_31 mov $f0, 8*7(%rsp) # corrected |f0| mov $g0, 8*8(%rsp) # corrected |g0| mov 8*9(%rsp), $f0 # |f1| mov 8*10(%rsp), $g0 # |g1| lea 8*6($out_ptr), $out_ptr # pointer to destination |b| call __smulx_384_n_shift_by_31 #mov $f0, 8*9(%rsp) # corrected |f1| #mov $g0, 8*10(%rsp) # corrected |g1| mov 8*12($in_ptr), %rax # |u| mov 8*19($in_ptr), @acc[3] # |v| mov $f0, %rbx mov %rax, @acc[2] imulq 8*7(%rsp) # |u|*|f0| mov %rax, @acc[0] mov @acc[3], %rax mov %rdx, @acc[1] imulq 8*8(%rsp) # |v|*|g0| add %rax, @acc[0] adc %rdx, @acc[1] mov @acc[0], 8*6($out_ptr) # destination |u| mov @acc[1], 8*7($out_ptr) sar \$63, @acc[1] # sign extension mov @acc[1], 8*8($out_ptr) mov @acc[1], 8*9($out_ptr) mov @acc[1], 8*10($out_ptr) mov @acc[1], 8*11($out_ptr) mov @acc[1], 8*12($out_ptr) lea 8*12($in_ptr), $in_ptr # make in_ptr "rewindable" with xor mov @acc[2], %rax imulq %rbx # |u|*|f1| mov %rax, @acc[0] mov @acc[3], %rax mov %rdx, @acc[1] imulq %rcx # |v|*|g1| add %rax, @acc[0] adc %rdx, @acc[1] mov @acc[0], 8*13($out_ptr) # destination |v| mov @acc[1], 8*14($out_ptr) sar \$63, @acc[1] # sign extension mov @acc[1], 8*15($out_ptr) mov @acc[1], 8*16($out_ptr) mov @acc[1], 8*17($out_ptr) mov @acc[1], 8*18($out_ptr) mov @acc[1], 8*19($out_ptr) ___ for($i=2; $i<23; $i++) { my $smul_n_shift = $i<19 ? "__smulx_384_n_shift_by_31" : "__smulx_191_n_shift_by_31"; my $smul_768x63 = $i>11 ? 
"__smulx_768x63" : "__smulx_384x63"; $code.=<<___; xor \$256+8*12, $in_ptr # flip-flop pointer to source |a|b|u|v| mov \$31, $cnt call __ab_approximation_31 #mov $f0, 8*7(%rsp) #mov $g0, 8*8(%rsp) mov $f1, 8*9(%rsp) mov $g1, 8*10(%rsp) mov \$256, $out_ptr xor $in_ptr, $out_ptr # pointer to destination |a|b|u|v| call $smul_n_shift mov $f0, 8*7(%rsp) # corrected |f0| mov $g0, 8*8(%rsp) # corrected |g0| mov 8*9(%rsp), $f0 # |f1| mov 8*10(%rsp), $g0 # |g1| lea 8*6($out_ptr), $out_ptr # pointer to destination |b| call $smul_n_shift mov $f0, 8*9(%rsp) # corrected |f1| mov $g0, 8*10(%rsp) # corrected |g1| mov 8*7(%rsp), $f0 # |f0| mov 8*8(%rsp), $g0 # |g0| lea 8*12($in_ptr), $in_ptr # pointer to source |u|v| lea 8*6($out_ptr), $out_ptr # pointer to destination |u| call __smulx_384x63 mov 8*9(%rsp), $f0 # |f1| mov 8*10(%rsp), $g0 # |g1| lea 8*7($out_ptr),$out_ptr # pointer to destination |v| call $smul_768x63 ___ $code.=<<___ if ($i==11); mov @acc[6], 8*7($out_ptr) # sign extension mov @acc[6], 8*8($out_ptr) mov @acc[6], 8*9($out_ptr) mov @acc[6], 8*10($out_ptr) mov @acc[6], 8*11($out_ptr) ___ } $code.=<<___; ################################# two[!] last iterations in one go xor \$256+8*12, $in_ptr # flip-flop pointer to source |a|b|u|v| mov \$55, $cnt # 31 + 768 % 31 #call __ab_approximation_31 # |a| and |b| are exact, just load mov 8*0($in_ptr), @acc[0] # |a_lo| #xor @acc[1], @acc[1] # |a_hi| mov 8*6($in_ptr), @acc[2] # |b_lo| #xor @acc[3], @acc[3] # |b_hi| call __tail_loop_55 #mov $f0, 8*7(%rsp) #mov $g0, 8*8(%rsp) #mov $f1, 8*9(%rsp) #mov $g1, 8*10(%rsp) #mov 8*7(%rsp), $f0 # |f0| #mov 8*8(%rsp), $g0 # |g0| lea 8*12($in_ptr), $in_ptr # pointer to source |u|v| #lea 8*6($out_ptr), $out_ptr # pointer to destination |u| #call __smulx_384x63 #mov 8*9(%rsp), $f0 # |f1| #mov 8*10(%rsp), $g0 # |g1| mov $f1, $f0 mov $g1, $g0 mov 8*4(%rsp), $out_ptr # original out_ptr call __smulx_768x63 mov 8*5(%rsp), $in_ptr # original n_ptr mov %rdx, @acc[5] # the excess limb, -1, 0 or 1 sar \$63, @acc[5] # result's sign as mask mov @acc[5], @acc[0] # mask |modulus| mov @acc[5], @acc[1] mov @acc[5], @acc[2] #ifdef __SGX_LVI_HARDENING__ lfence #endif and 8*0($in_ptr), @acc[0] and 8*1($in_ptr), @acc[1] mov @acc[5], @acc[3] and 8*2($in_ptr), @acc[2] and 8*3($in_ptr), @acc[3] mov @acc[5], @acc[4] and 8*4($in_ptr), @acc[4] and 8*5($in_ptr), @acc[5] add @acc[0], @acc[6] # conditionally add |modulus|<<384 adc @acc[1], @acc[7] adc @acc[2], @acc[8] adc @acc[3], @acc[9] adc @acc[4], %rcx adc @acc[5], %rax adc \$0, %rdx mov %rdx, @acc[5] neg %rdx or %rdx, @acc[5] # excess bit or sign as mask sar \$63, %rdx # excess bit as mask mov @acc[5], @acc[0] # mask |modulus| mov @acc[5], @acc[1] mov @acc[5], @acc[2] and 8*0($in_ptr), @acc[0] and 8*1($in_ptr), @acc[1] mov @acc[5], @acc[3] and 8*2($in_ptr), @acc[2] and 8*3($in_ptr), @acc[3] mov @acc[5], @acc[4] and 8*4($in_ptr), @acc[4] and 8*5($in_ptr), @acc[5] xor %rdx, @acc[0] # conditionally negate |modulus| xor $in_ptr, $in_ptr xor %rdx, @acc[1] sub %rdx, $in_ptr xor %rdx, @acc[2] xor %rdx, @acc[3] xor %rdx, @acc[4] xor %rdx, @acc[5] add $in_ptr, @acc[0] adc \$0, @acc[1] adc \$0, @acc[2] adc \$0, @acc[3] adc \$0, @acc[4] adc \$0, @acc[5] add @acc[0], @acc[6] # final adjustment for |modulus|<<384 adc @acc[1], @acc[7] adc @acc[2], @acc[8] adc @acc[3], @acc[9] adc @acc[4], %rcx adc @acc[5], %rax mov @acc[6], 8*6($out_ptr) # store absolute value mov @acc[7], 8*7($out_ptr) mov @acc[8], 8*8($out_ptr) mov @acc[9], 8*9($out_ptr) mov %rcx, 8*10($out_ptr) mov %rax, 8*11($out_ptr) lea 
$frame(%rsp), %r8 # size optimization mov 8*0(%r8),%r15 .cfi_restore %r15 mov 8*1(%r8),%r14 .cfi_restore %r14 mov 8*2(%r8),%r13 .cfi_restore %r13 mov 8*3(%r8),%r12 .cfi_restore %r12 mov 8*4(%r8),%rbx .cfi_restore %rbx mov 8*5(%r8),%rbp .cfi_restore %rbp lea 8*6(%r8),%rsp .cfi_adjust_cfa_offset -$frame-8*6 .cfi_epilogue ret .cfi_endproc .size ctx_inverse_mod_384,.-ctx_inverse_mod_384 ___ ######################################################################## # Signed |u|*|f?|+|v|*|g?| subroutines. "NNN" in "NNNx63" suffix refers # to the maximum bit-length of the *result*, and "63" - to the maximum # bit-length of the |f?| and |g?| single-limb multiplicands. However! # The latter should not be taken literally, as they are always chosen so # that "bad things" don't happen. For example, there comes a point when # |v| grows beyond 384 bits, while |u| remains 384 bits wide. Yet, we # always call __smul_384x63 to perform |u|*|f0|+|v|*|g0| step. This is # because past that point |f0| is always 1 and |g0| is always 0. And, # since |u| never grows beyond 384 bits, __smul_768x63 doesn't have to # perform full-width |u|*|f1| multiplication, half-width one with sign # extension is sufficient... { my ($out_ptr, $in_ptr, $f0, $g0) = ("%rdi", "%rsi", "%rdx", "%rcx"); my @acc = map("%r$_",(8..15),"bx","bp","cx","di"); my $fx = @acc[9]; $code.=<<___; .type __smulx_768x63,\@abi-omnipotent .align 32 __smulx_768x63: mov 8*0($in_ptr), @acc[0] # load |u| mov 8*1($in_ptr), @acc[1] mov 8*2($in_ptr), @acc[2] mov 8*3($in_ptr), @acc[3] mov 8*4($in_ptr), @acc[4] mov 8*5($in_ptr), @acc[5] mov 8*6($in_ptr), @acc[6] # sign limb mov $f0, %rax sar \$63, %rax # |f0|'s sign as mask xor $fx, $fx # overrides in_ptr sub %rax, $fx # |f0|'s sign as bit mov $out_ptr, 8*1(%rsp) mov $in_ptr, 8*2(%rsp) lea 8*7($in_ptr), $in_ptr # pointer to |v| xor %rax, $f0 # conditionally negate |f0| add $fx, $f0 xor %rax, @acc[0] # conditionally negate |u| xor %rax, @acc[1] xor %rax, @acc[2] xor %rax, @acc[3] xor %rax, @acc[4] xor %rax, @acc[5] xor %rax, @acc[6] add $fx, @acc[0] adc \$0, @acc[1] adc \$0, @acc[2] adc \$0, @acc[3] adc \$0, @acc[4] adc \$0, @acc[5] adc \$0, @acc[6] and $f0, @acc[6] neg @acc[6] mulx @acc[0], @acc[0], $fx # |u|*|f0| mulx @acc[1], @acc[1], %rax add $fx, @acc[1] ___ for(my ($a,$b) = ($fx, "%rax"), $i=2; $i<=5; $i++) { $code.=<<___; mulx @acc[$i], @acc[$i], $a adc $b, @acc[$i] ___ ($a, $b) = ($b, $a); } $code.=<<___; adc %rax, @acc[6] mov @acc[0], 8*0($out_ptr) # offload |u|*|f0| mov @acc[1], 8*1($out_ptr) mov @acc[2], 8*2($out_ptr) mov @acc[3], 8*3($out_ptr) mov @acc[4], 8*4($out_ptr) mov @acc[5], 8*5($out_ptr) mov @acc[6], 8*6($out_ptr) sar \$63, @acc[6] # sign extension mov @acc[6], 8*7($out_ptr) ___ { my $fx=$in_ptr; $code.=<<___; mov $g0, $f0 # load |g0| mov $g0, %rax mov 8*0($in_ptr), @acc[0] # load |v| mov 8*1($in_ptr), @acc[1] mov 8*2($in_ptr), @acc[2] mov 8*3($in_ptr), @acc[3] mov 8*4($in_ptr), @acc[4] mov 8*5($in_ptr), @acc[5] mov 8*6($in_ptr), @acc[6] mov 8*7($in_ptr), @acc[7] mov 8*8($in_ptr), @acc[8] mov 8*9($in_ptr), @acc[9] mov 8*10($in_ptr), @acc[10] mov 8*11($in_ptr), @acc[11] sar \$63, %rax # |g0|'s sign as mask xor $fx, $fx # overrides in_ptr sub %rax, $fx # |g0|'s sign as bit xor %rax, $f0 # conditionally negate |g0| add $fx, $f0 xor %rax, @acc[0] # conditionally negate |v| xor %rax, @acc[1] xor %rax, @acc[2] xor %rax, @acc[3] xor %rax, @acc[4] xor %rax, @acc[5] xor %rax, @acc[6] xor %rax, @acc[7] xor %rax, @acc[8] xor %rax, @acc[9] xor %rax, @acc[10] xor @acc[11], %rax add $fx, @acc[0] adc \$0, 
@acc[1] adc \$0, @acc[2] adc \$0, @acc[3] adc \$0, @acc[4] adc \$0, @acc[5] adc \$0, @acc[6] adc \$0, @acc[7] adc \$0, @acc[8] adc \$0, @acc[9] adc \$0, @acc[10] adc \$0, %rax mulx @acc[0], @acc[0], $fx # |v|*|g0| mulx @acc[1], @acc[1], @acc[11] add $fx, @acc[1] ___ for(my ($a,$b) = ($fx, @acc[11]), $i=2; $i<11; $i++) { $code.=<<___; mulx @acc[$i], @acc[$i], $a adc $b, @acc[$i] ___ ($a, $b) = ($b, $a); } $code.=<<___; mov 8*1(%rsp), $out_ptr # restore original out_ptr adc \$0, $fx imulq %rdx add $fx, %rax adc \$0, %rdx # used in the final step add 8*0($out_ptr), @acc[0] # accumulate |u|*|f0| adc 8*1($out_ptr), @acc[1] adc 8*2($out_ptr), @acc[2] adc 8*3($out_ptr), @acc[3] adc 8*4($out_ptr), @acc[4] adc 8*5($out_ptr), @acc[5] adc 8*6($out_ptr), @acc[6] mov 8*7($out_ptr), $fx # sign extension adc $fx, @acc[7] adc $fx, @acc[8] adc $fx, @acc[9] adc $fx, @acc[10] adc $fx, %rax adc $fx, %rdx mov 8*2(%rsp), $in_ptr # restore original in_ptr mov @acc[0], 8*0($out_ptr) mov @acc[1], 8*1($out_ptr) mov @acc[2], 8*2($out_ptr) mov @acc[3], 8*3($out_ptr) mov @acc[4], 8*4($out_ptr) mov @acc[5], 8*5($out_ptr) mov @acc[6], 8*6($out_ptr) mov @acc[7], 8*7($out_ptr) mov @acc[8], 8*8($out_ptr) mov @acc[9], 8*9($out_ptr) mov @acc[10], 8*10($out_ptr) mov %rax, 8*11($out_ptr) ret # __SGX_LVI_HARDENING_CLOBBER__=@acc[0] .size __smulx_768x63,.-__smulx_768x63 ___ } $code.=<<___; .type __smulx_384x63,\@abi-omnipotent .align 32 __smulx_384x63: ___ for($j=0; $j<2; $j++) { my $k = 8*7*$j; $code.=<<___; mov $k+8*0($in_ptr), @acc[0] # load |u| (or |v|) mov $k+8*1($in_ptr), @acc[1] mov $k+8*2($in_ptr), @acc[2] mov $k+8*3($in_ptr), @acc[3] mov $k+8*4($in_ptr), @acc[4] mov $k+8*5($in_ptr), @acc[5] mov $k+8*6($in_ptr), @acc[6] # sign/excess limb mov $f0, $fx sar \$63, $fx # |f0|'s sign as mask (or |g0|'s) xor %rax, %rax sub $fx, %rax # |f0|'s sign as bit (or |g0|'s) xor $fx, $f0 # conditionally negate |f0| add %rax, $f0 xor $fx, @acc[0] # conditionally negate |u| (or |v|) xor $fx, @acc[1] xor $fx, @acc[2] xor $fx, @acc[3] xor $fx, @acc[4] xor $fx, @acc[5] xor $fx, @acc[6] add %rax, @acc[0] adc \$0, @acc[1] adc \$0, @acc[2] adc \$0, @acc[3] adc \$0, @acc[4] adc \$0, @acc[5] adc \$0, @acc[6] and $f0, @acc[6] neg @acc[6] mulx @acc[0], @acc[0], $fx # |u|*|f0| (or |v|*|g0|) mulx @acc[1], @acc[1], %rax add $fx, @acc[1] ___ for(my ($a,$b) = ($fx, "%rax"), $i=2; $i<5; $i++) { $code.=<<___; mulx @acc[$i], @acc[$i], $a adc $b, @acc[$i] ___ ($a, $b) = ($b, $a); } $code.=<<___ if ($j==0); mulx @acc[$i], @acc[$i], %rax mov $g0, $f0 adc $fx, @acc[$i] adc %rax, @acc[6] mov @acc[0], 8*0($out_ptr) # offload |u|*|f0| mov @acc[1], 8*1($out_ptr) mov @acc[2], 8*2($out_ptr) mov @acc[3], 8*3($out_ptr) mov @acc[4], 8*4($out_ptr) mov @acc[5], @acc[7] mov @acc[6], @acc[8] ___ } $code.=<<___; mulx @acc[$i], @acc[$i], %rax adc $fx, @acc[$i] adc %rax, @acc[6] add 8*0($out_ptr), @acc[0] # accumulate |u|*|f0| adc 8*1($out_ptr), @acc[1] adc 8*2($out_ptr), @acc[2] adc 8*3($out_ptr), @acc[3] adc 8*4($out_ptr), @acc[4] adc @acc[7], @acc[5] adc @acc[8], @acc[6] mov @acc[0], 8*0($out_ptr) mov @acc[1], 8*1($out_ptr) mov @acc[2], 8*2($out_ptr) mov @acc[3], 8*3($out_ptr) mov @acc[4], 8*4($out_ptr) mov @acc[5], 8*5($out_ptr) mov @acc[6], 8*6($out_ptr) ret # __SGX_LVI_HARDENING_CLOBBER__=@acc[0] .size __smulx_384x63,.-__smulx_384x63 ___ ######################################################################## # Signed abs(|a|*|f?|+|b|*|g?|)>>k subroutines. "NNN" in the middle of # the names refers to maximum bit-lengths of |a| and |b|. 
As already # mentioned, |f?| and |g?| can be viewed as 63 bits wide, but are always # chosen so that "bad things" don't happen. For example, so that the # sum of the products doesn't overflow, and that the final result is # never wider than inputs... { $code.=<<___; .type __smulx_384_n_shift_by_31,\@abi-omnipotent .align 32 __smulx_384_n_shift_by_31: mov $f0, @acc[8] # make backup copy ___ my $f0 = @acc[8]; for($j=0; $j<2; $j++) { my $k = 8*6*$j; $code.=<<___; mov $k+8*0($in_ptr), @acc[0] # load |a| (or |b|) mov $k+8*1($in_ptr), @acc[1] mov $k+8*2($in_ptr), @acc[2] mov $k+8*3($in_ptr), @acc[3] mov $k+8*4($in_ptr), @acc[4] mov $k+8*5($in_ptr), @acc[5] mov %rdx, %rax sar \$63, %rax # |f0|'s sign as mask (or |g0|'s) xor $fx, $fx sub %rax, $fx # |f0|'s sign as bit (or |g0|'s) xor %rax, %rdx # conditionally negate |f0| (or |g0|) add $fx, %rdx xor %rax, @acc[0] # conditionally negate |a| (or |b|) xor %rax, @acc[1] xor %rax, @acc[2] xor %rax, @acc[3] xor %rax, @acc[4] xor %rax, @acc[5] add $fx, @acc[0] adc \$0, @acc[1] adc \$0, @acc[2] adc \$0, @acc[3] adc \$0, @acc[4] adc \$0, @acc[5] and %rdx, %rax neg %rax mulx @acc[0], @acc[0], $fx # |a|*|f0| (or |b|*|g0|) mulx @acc[1], @acc[1], @acc[6] add $fx, @acc[1] ___ for(my ($a,$b) = ($fx, @acc[6]), $i=2; $i<5; $i++) { $code.=<<___; mulx @acc[$i], @acc[$i], $a adc $b, @acc[$i] ___ ($a, $b) = ($b, $a); } $code.=<<___ if ($j==0); mulx @acc[5], @acc[5], @acc[6] adc $fx, @acc[5] adc %rax, @acc[6] mov $g0, %rdx mov @acc[0], 8*0($out_ptr) mov @acc[1], 8*1($out_ptr) mov @acc[2], 8*2($out_ptr) mov @acc[3], 8*3($out_ptr) mov @acc[4], 8*4($out_ptr) mov @acc[5], 8*5($out_ptr) mov @acc[6], @acc[7] ___ } $code.=<<___; mulx @acc[5], @acc[5], @acc[6] adc $fx, @acc[5] adc %rax, @acc[6] add 8*0($out_ptr), @acc[0] adc 8*1($out_ptr), @acc[1] adc 8*2($out_ptr), @acc[2] adc 8*3($out_ptr), @acc[3] adc 8*4($out_ptr), @acc[4] adc 8*5($out_ptr), @acc[5] adc @acc[7], @acc[6] mov $f0, %rdx # restore the original value shrd \$31, @acc[1], @acc[0] shrd \$31, @acc[2], @acc[1] shrd \$31, @acc[3], @acc[2] shrd \$31, @acc[4], @acc[3] shrd \$31, @acc[5], @acc[4] shrd \$31, @acc[6], @acc[5] sar \$63, @acc[6] # sign as mask xor $fx, $fx sub @acc[6], $fx # sign as bit xor @acc[6], @acc[0] # conditionally negate the result xor @acc[6], @acc[1] xor @acc[6], @acc[2] xor @acc[6], @acc[3] xor @acc[6], @acc[4] xor @acc[6], @acc[5] add $fx, @acc[0] adc \$0, @acc[1] adc \$0, @acc[2] adc \$0, @acc[3] adc \$0, @acc[4] adc \$0, @acc[5] mov @acc[0], 8*0($out_ptr) mov @acc[1], 8*1($out_ptr) mov @acc[2], 8*2($out_ptr) mov @acc[3], 8*3($out_ptr) mov @acc[4], 8*4($out_ptr) mov @acc[5], 8*5($out_ptr) xor @acc[6], %rdx # conditionally negate |f0| xor @acc[6], $g0 # conditionally negate |g0| add $fx, %rdx add $fx, $g0 ret # __SGX_LVI_HARDENING_CLOBBER__=@acc[0] .size __smulx_384_n_shift_by_31,.-__smulx_384_n_shift_by_31 ___ } { $code.=<<___; .type __smulx_191_n_shift_by_31,\@abi-omnipotent .align 32 __smulx_191_n_shift_by_31: mov $f0, @acc[8] ___ my $f0 = @acc[8]; for($j=0; $j<2; $j++) { my $k = 8*6*$j; my @acc=@acc; @acc=@acc[3..5] if ($j); $code.=<<___; mov $k+8*0($in_ptr), @acc[0] # load |a| (or |b|) mov $k+8*1($in_ptr), @acc[1] mov $k+8*2($in_ptr), @acc[2] mov %rdx, %rax sar \$63, %rax # |f0|'s sign as mask (or |g0|'s) xor $fx, $fx sub %rax, $fx # |f0|'s sign as bit (or |g0|'s) xor %rax, %rdx # conditionally negate |f0| (or |g0|) add $fx, %rdx xor %rax, @acc[0] # conditionally negate |a| (or |b|) xor %rax, @acc[1] xor @acc[2], %rax add $fx, @acc[0] adc \$0, @acc[1] adc \$0, %rax mulx @acc[0], @acc[0], 
$fx # |a|*|f0| (or |b|*|g0|) mulx @acc[1], @acc[1], @acc[2] add $fx, @acc[1] adc \$0, @acc[2] imulq %rdx add %rax, @acc[2] adc \$0, %rdx ___ $code.=<<___ if ($j==0); mov %rdx, @acc[6] mov $g0, %rdx ___ } $code.=<<___; add @acc[0], @acc[3] adc @acc[1], @acc[4] adc @acc[2], @acc[5] adc %rdx, @acc[6] mov $f0, %rdx shrd \$31, @acc[4], @acc[3] shrd \$31, @acc[5], @acc[4] shrd \$31, @acc[6], @acc[5] sar \$63, @acc[6] # sign as mask xor $fx, $fx sub @acc[6], $fx # sign as bit xor @acc[6], @acc[3] # conditionally negate the result xor @acc[6], @acc[4] xor @acc[6], @acc[5] add $fx, @acc[3] adc \$0, @acc[4] adc \$0, @acc[5] mov @acc[3], 8*0($out_ptr) mov @acc[4], 8*1($out_ptr) mov @acc[5], 8*2($out_ptr) xor @acc[6], %rdx # conditionally negate |f0| xor @acc[6], $g0 # conditionally negate |g0| add $fx, %rdx add $fx, $g0 ret # __SGX_LVI_HARDENING_CLOBBER__=@acc[0] .size __smulx_191_n_shift_by_31,.-__smulx_191_n_shift_by_31 ___ } } { my ($a_lo, $a_hi, $b_lo, $b_hi) = map("%r$_",(8..11)); my ($t0, $t1, $t2, $t3, $t4) = ("%rax","%rbx","%rbp","%r14","%r15"); my ($fg0, $fg1, $bias) = ($g0, $g1, $t4); my ($a_, $b_) = ($a_lo, $b_lo); { my @a = ($a_lo, $t1, $a_hi); my @b = ($b_lo, $t2, $b_hi); $code.=<<___; .type __ab_approximation_31,\@abi-omnipotent .align 32 __ab_approximation_31: mov 8*5($in_ptr), @a[2] # load |a| in reverse order mov 8*11($in_ptr), @b[2] # load |b| in reverse order mov 8*4($in_ptr), @a[1] mov 8*10($in_ptr), @b[1] mov 8*3($in_ptr), @a[0] mov 8*9($in_ptr), @b[0] mov @a[2], $t0 or @b[2], $t0 # check top-most limbs, ... cmovz @a[1], @a[2] cmovz @b[1], @b[2] cmovz @a[0], @a[1] mov 8*2($in_ptr), @a[0] cmovz @b[0], @b[1] mov 8*8($in_ptr), @b[0] mov @a[2], $t0 or @b[2], $t0 # ... ones before top-most, ... cmovz @a[1], @a[2] cmovz @b[1], @b[2] cmovz @a[0], @a[1] mov 8*1($in_ptr), @a[0] cmovz @b[0], @b[1] mov 8*7($in_ptr), @b[0] mov @a[2], $t0 or @b[2], $t0 # ... and ones before that ... cmovz @a[1], @a[2] cmovz @b[1], @b[2] cmovz @a[0], @a[1] mov 8*0($in_ptr), @a[0] cmovz @b[0], @b[1] mov 8*6($in_ptr), @b[0] mov @a[2], $t0 or @b[2], $t0 # ... and ones before that ... 
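# What follows composes the actual 64-bit approximations: the 33 most
# significant bits of the left-aligned top limbs are spliced with the 31
# least significant bits of the bottom limbs, matching the reference
# model's a_ = (a & mask) | ((a >> (n-31-2)) << 31).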
cmovz @a[1], @a[2] cmovz @b[1], @b[2] cmovz @a[0], @a[1] cmovz @b[0], @b[1] mov @a[2], $t0 or @b[2], $t0 bsr $t0, %rcx lea 1(%rcx), %rcx cmovz @a[0], @a[2] cmovz @b[0], @b[2] cmovz $t0, %rcx neg %rcx #and \$63, %rcx # debugging artefact shldq %cl, @a[1], @a[2] # align second limb to the left shldq %cl, @b[1], @b[2] mov \$0x7FFFFFFF, %eax and %rax, @a[0] and %rax, @b[0] andn @a[2], %rax, @a[2] andn @b[2], %rax, @b[2] or @a[2], @a[0] or @b[2], @b[0] jmp __inner_loop_31 ret .size __ab_approximation_31,.-__ab_approximation_31 ___ } $code.=<<___; .type __inner_loop_31,\@abi-omnipotent .align 32 __inner_loop_31: ################# by Thomas Pornin mov \$0x7FFFFFFF80000000, $fg0 # |f0|=1, |g0|=0 mov \$0x800000007FFFFFFF, $fg1 # |f1|=0, |g1|=1 mov \$0x7FFFFFFF7FFFFFFF, $bias .Loop_31: cmp $b_, $a_ # if |a_|<|b_|, swap the variables mov $a_, $t0 mov $b_, $t1 mov $fg0, $t2 mov $fg1, $t3 cmovb $b_, $a_ cmovb $t0, $b_ cmovb $fg1, $fg0 cmovb $t2, $fg1 sub $b_, $a_ # |a_|-|b_| sub $fg1, $fg0 # |f0|-|f1|, |g0|-|g1| add $bias, $fg0 test \$1, $t0 # if |a_| was even, roll back cmovz $t0, $a_ cmovz $t1, $b_ cmovz $t2, $fg0 cmovz $t3, $fg1 shr \$1, $a_ # |a_|>>=1 add $fg1, $fg1 # |f1|<<=1, |g1|<<=1 sub $bias, $fg1 sub \$1, $cnt jnz .Loop_31 shr \$32, $bias mov %ecx, %edx # $fg0, $f0 mov ${fg1}d, ${f1}d shr \$32, $g0 shr \$32, $g1 sub $bias, $f0 # remove the bias sub $bias, $g0 sub $bias, $f1 sub $bias, $g1 ret # __SGX_LVI_HARDENING_CLOBBER__=$a_lo .size __inner_loop_31,.-__inner_loop_31 .type __tail_loop_55,\@abi-omnipotent .align 32 __tail_loop_55: mov \$1, $f0 # |f0|=1 xor $g0, $g0 # |g0|=0 xor $f1, $f1 # |f1|=0 mov \$1, $g1 # |g1|=1 .Loop_55: xor $t0, $t0 test \$1, $a_lo # if |a_| is odd, then we'll be subtracting |b_| mov $b_lo, $t1 cmovnz $b_lo, $t0 sub $a_lo, $t1 # |b_|-|a_| mov $a_lo, $t2 sub $t0, $a_lo # |a_|-|b_| (or |a_|-0 if |a_| was even) cmovc $t1, $a_lo # borrow means |a_|<|b_|, replace with |b_|-|a_| cmovc $t2, $b_lo # |b_| = |a_| mov $f0, $t0 # exchange |f0| and |f1| cmovc $f1, $f0 cmovc $t0, $f1 mov $g0, $t1 # exchange |g0| and |g1| cmovc $g1, $g0 cmovc $t1, $g1 xor $t0, $t0 xor $t1, $t1 shr \$1, $a_lo test \$1, $t2 # if |a_| was odd, then we'll be subtracting... cmovnz $f1, $t0 cmovnz $g1, $t1 add $f1, $f1 # |f1|<<=1 add $g1, $g1 # |g1|<<=1 sub $t0, $f0 # |f0|-=|f1| (or |f0-=0| if |a_| was even) sub $t1, $g0 # |g0|-=|g1| (or |g0-=0| ...) sub \$1, $cnt jnz .Loop_55 ret # __SGX_LVI_HARDENING_CLOBBER__=$a_lo .size __tail_loop_55,.-__tail_loop_55 ___ } print $code; close STDOUT; ================================================ FILE: src/asm/div3w-armv8.pl ================================================ #!/usr/bin/env perl # # Copyright Supranational LLC # Licensed under the Apache License, Version 2.0, see LICENSE for details. 
# SPDX-License-Identifier: Apache-2.0 $flavour = shift; $output = shift; if ($flavour && $flavour ne "void") { $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or die "can't locate arm-xlate.pl"; open STDOUT,"| \"$^X\" $xlate $flavour $output"; } else { open STDOUT,">$output"; } $code.=<<___; .text .globl div_3_limbs .hidden div_3_limbs .type div_3_limbs,%function .align 5 div_3_limbs: hint #34 ldp x4,x5,[x0] // load R eor x0,x0,x0 // Q = 0 mov x3,#64 // loop counter nop .Loop: subs x6,x4,x1 // R - D add x0,x0,x0 // Q <<= 1 sbcs x7,x5,x2 add x0,x0,#1 // Q + speculative bit csel x4,x4,x6,lo // select between R and R - D extr x1,x2,x1,#1 // D >>= 1 csel x5,x5,x7,lo lsr x2,x2,#1 sbc x0,x0,xzr // subtract speculative bit sub x3,x3,#1 cbnz x3,.Loop asr x3,x0,#63 // top bit -> mask add x0,x0,x0 // Q <<= 1 subs x6,x4,x1 // R - D add x0,x0,#1 // Q + speculative bit sbcs x7,x5,x2 sbc x0,x0,xzr // subtract speculative bit orr x0,x0,x3 // all ones if overflow ret .size div_3_limbs,.-div_3_limbs ___ { my ($div_rem, $divisor, $quot) = map("x$_",(0..2)); my @div = map("x$_",(3..4)); my @acc = map("x$_",(5..7)); my @t = map("x$_",(8..11)); $code.=<<___; .globl quot_rem_128 .hidden quot_rem_128 .type quot_rem_128,%function .align 5 quot_rem_128: hint #34 ldp @div[0],@div[1],[$divisor] mul @acc[0],@div[0],$quot // divisor[0:1] * quotient umulh @acc[1],@div[0],$quot mul @t[3], @div[1],$quot umulh @acc[2],@div[1],$quot ldp @t[0],@t[1],[$div_rem] // load 3 limbs of the dividend ldr @t[2],[$div_rem,#16] adds @acc[1],@acc[1],@t[3] adc @acc[2],@acc[2],xzr subs @t[0],@t[0],@acc[0] // dividend - divisor * quotient sbcs @t[1],@t[1],@acc[1] sbcs @t[2],@t[2],@acc[2] sbc @acc[0],xzr,xzr // borrow -> mask add $quot,$quot,@acc[0] // if borrowed, adjust the quotient ... and @div[0],@div[0],@acc[0] and @div[1],@div[1],@acc[0] adds @t[0],@t[0],@div[0] // ... and add divisor adc @t[1],@t[1],@div[1] stp @t[0],@t[1],[$div_rem] // save 2 limbs of the remainder str $quot,[$div_rem,#16] // and one limb of the quotient mov x0,$quot // return adjusted quotient ret .size quot_rem_128,.-quot_rem_128 .globl quot_rem_64 .hidden quot_rem_64 .type quot_rem_64,%function .align 5 quot_rem_64: hint #34 ldr @div[0],[$divisor] ldr @t[0],[$div_rem] // load 1 limb of the dividend mul @acc[0],@div[0],$quot // divisor * quotient sub @t[0],@t[0],@acc[0] // dividend - divisor * quotient stp @t[0],$quot,[$div_rem] // save remainder and quotient mov x0,$quot // return quotient ret .size quot_rem_64,.-quot_rem_64 ___ } print $code; close STDOUT; ================================================ FILE: src/asm/div3w-x86_64.pl ================================================ #!/usr/bin/env perl # # Copyright Supranational LLC # Licensed under the Apache License, Version 2.0, see LICENSE for details. # SPDX-License-Identifier: Apache-2.0 $flavour = shift; $output = shift; if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or die "can't locate x86_64-xlate.pl"; open STDOUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"" or die "can't call $xlate: $!"; $c_ref=<<'___'; /* * |div_top| points at two most significant limbs of the dividend, |d_hi| * and |d_lo| are two most significant limbs of the divisor.
If the divisor * is only one limb, it is to be passed in |d_hi| with zero in |d_lo|. * The divisor is required to be "bitwise left-aligned," and the dividend's * top limbs to be not larger than the divisor's. The latter limitation * can be problematic in the first iteration of multi-precision division, * where in the most general case the condition would have to be "smaller." * The subroutine considers four limbs, two of which are "overlapping," * hence the name... Another way to look at it is to think of the pair * of the dividend's limbs being suffixed with a zero: * +-------+-------+-------+ * R | | | 0 | * +-------+-------+-------+ * +-------+-------+ * D | | | * +-------+-------+ */ limb_t div_3_limbs(const limb_t *div_top, limb_t d_lo, limb_t d_hi) { llimb_t R = ((llimb_t)div_top[1] << LIMB_BITS) | div_top[0]; llimb_t D = ((llimb_t)d_hi << LIMB_BITS) | d_lo; limb_t Q = 0, mask; size_t i; for (i = 0; i < LIMB_BITS; i++) { Q <<= 1; mask = (R >= D); Q |= mask; R -= (D & ((llimb_t)0 - mask)); D >>= 1; } mask = 0 - (Q >> (LIMB_BITS - 1)); /* does it overflow? */ Q <<= 1; Q |= (R >= D); return (Q | mask); } ___ $code.=<<___; .text .globl div_3_limbs .hidden div_3_limbs .type div_3_limbs,\@function,3,"unwind" .align 32 div_3_limbs: .cfi_startproc .cfi_end_prologue #ifdef __SGX_LVI_HARDENING__ lfence #endif mov (%rdi),%r8 # load R.lo mov 8(%rdi),%r9 # load R.hi xor %rax,%rax # Q = 0 mov \$64,%ecx # loop counter .Loop: mov %r8,%r10 # put aside R sub %rsi,%r8 # R -= D mov %r9,%r11 sbb %rdx,%r9 lea 1(%rax,%rax),%rax # Q <<= 1 + speculative bit mov %rdx,%rdi cmovc %r10,%r8 # restore R if R - D borrowed cmovc %r11,%r9 sbb \$0,%rax # subtract speculative bit shl \$63,%rdi shr \$1,%rsi shr \$1,%rdx or %rdi,%rsi # D >>= 1 sub \$1,%ecx jnz .Loop lea 1(%rax,%rax),%rcx # Q <<= 1 + speculative bit sar \$63,%rax # top bit -> mask sub %rsi,%r8 # R -= D sbb %rdx,%r9 sbb \$0,%rcx # subtract speculative bit or %rcx,%rax # all ones if overflow .cfi_epilogue ret .cfi_endproc .size div_3_limbs,.-div_3_limbs ___ ######################################################################## # Calculate remainder and adjust the quotient, which can be off-by-one. # Then save the quotient in the limb next to the top limb of the remainder. There is # room, because the remainder/next-iteration-dividend gets shorter by # one limb. { my ($div_rem, $divisor, $quotient) = ("%rdi", "%rsi", "%rcx"); my @acc = ("%r8", "%r9", "%rdx"); my @tmp = ("%r10", "%r11", "%rax"); $code.=<<___; .globl quot_rem_128 .hidden quot_rem_128 .type quot_rem_128,\@function,3,"unwind" .align 32 quot_rem_128: .cfi_startproc .cfi_end_prologue #ifdef __SGX_LVI_HARDENING__ lfence #endif mov %rdx, %rax mov %rdx, $quotient mulq 0($divisor) # divisor[0:1] * quotient mov %rax, @acc[0] mov $quotient, %rax mov %rdx, @acc[1] mulq 8($divisor) add %rax, @acc[1] adc \$0, %rdx # %rdx is @acc[2] mov 0($div_rem), @tmp[0] # load 3 limbs of the dividend mov 8($div_rem), @tmp[1] mov 16($div_rem), @tmp[2] sub @acc[0], @tmp[0] # dividend - divisor * quotient sbb @acc[1], @tmp[1] sbb @acc[2], @tmp[2] sbb @acc[0], @acc[0] # borrow -> mask add @acc[0], $quotient # if borrowed, adjust the quotient ... mov @acc[0], @acc[1] and 0($divisor), @acc[0] and 8($divisor), @acc[1] add @acc[0], @tmp[0] # ... and add divisor adc @acc[1], @tmp[1] mov @tmp[0], 0($div_rem) # save 2 limbs of the remainder ... mov @tmp[1], 8($div_rem) mov $quotient, 16($div_rem) # ...
and 1 limb of the quotient mov $quotient, %rax # return adjusted quotient .cfi_epilogue ret .cfi_endproc .size quot_rem_128,.-quot_rem_128 ######################################################################## # Unlike the 128-bit case above, the quotient is exact. As a result, just one limb # of the dividend is sufficient to calculate the remainder... .globl quot_rem_64 .hidden quot_rem_64 .type quot_rem_64,\@function,3,"unwind" .align 32 quot_rem_64: .cfi_startproc .cfi_end_prologue #ifdef __SGX_LVI_HARDENING__ lfence #endif mov %rdx, %rax # return quotient imulq 0($divisor), %rdx # divisor[0] * quotient mov 0($div_rem), @tmp[0] # load 1 limb of the dividend sub %rdx, @tmp[0] # dividend - divisor * quotient mov @tmp[0], 0($div_rem) # save 1 limb of the remainder ... mov %rax, 8($div_rem) # ... and 1 limb of the quotient .cfi_epilogue ret .cfi_endproc .size quot_rem_64,.-quot_rem_64 ___ } print $code; close STDOUT; ================================================ FILE: src/asm/mul_mont_256-armv8.pl ================================================ #!/usr/bin/env perl # # Copyright Supranational LLC # Licensed under the Apache License, Version 2.0, see LICENSE for details. # SPDX-License-Identifier: Apache-2.0 # # As for "sparse" in subroutine names, see commentary in the # asm/mulx_mont_256-x86_64.pl module. $flavour = shift; $output = shift; if ($flavour && $flavour ne "void") { $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or die "can't locate arm-xlate.pl"; open STDOUT,"| \"$^X\" $xlate $flavour $output"; } else { open STDOUT,">$output"; } ($r_ptr,$a_ptr,$b_ptr,$n_ptr,$n0) = map("x$_", 0..4); @mod=map("x$_",(5..8)); $bi="x9"; @a=map("x$_",(10..13)); @tmp=map("x$_",(14..17)); @acc=map("x$_",(19..24)); $m0=$n_ptr; $code.=<<___; .text .globl mul_mont_sparse_256 .hidden mul_mont_sparse_256 .type mul_mont_sparse_256,%function .align 5 mul_mont_sparse_256: hint #34 stp c29,c30,[csp,#-8*__SIZEOF_POINTER__]!
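// Word-by-word Montgomery multiplication: for each 64-bit word of the
// multiplier, accumulate a times that word, derive m0 = n0*acc0 mod 2^64,
// add m0 times the modulus and drop the then-zero bottom word, so the
// final result is a*b*2^-256 mod the modulus. A rough Python model
// (illustration only, not part of the generator):
//
//   for i in range(4):
//       acc += a * b[i]
//       m0 = (acc * n0) % 2**64
//       acc = (acc + m0 * modulus) >> 64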
add c29,csp,#0 stp c19,c20,[csp,#2*__SIZEOF_POINTER__] stp c21,c22,[csp,#4*__SIZEOF_POINTER__] stp c23,c24,[csp,#6*__SIZEOF_POINTER__] ldp @a[0],@a[1],[$a_ptr] ldr $bi, [$b_ptr] ldp @a[2],@a[3],[$a_ptr,#16] mul @acc[0],@a[0],$bi ldp @mod[0],@mod[1],[$n_ptr] mul @acc[1],@a[1],$bi ldp @mod[2],@mod[3],[$n_ptr,#16] mul @acc[2],@a[2],$bi mul @acc[3],@a[3],$bi umulh @tmp[0],@a[0],$bi umulh @tmp[1],@a[1],$bi mul $m0,$n0,@acc[0] umulh @tmp[2],@a[2],$bi umulh @tmp[3],@a[3],$bi adds @acc[1],@acc[1],@tmp[0] //mul @tmp[0],@mod[0],$m0 adcs @acc[2],@acc[2],@tmp[1] mul @tmp[1],@mod[1],$m0 adcs @acc[3],@acc[3],@tmp[2] mul @tmp[2],@mod[2],$m0 adc @acc[4],xzr, @tmp[3] mul @tmp[3],@mod[3],$m0 ___ for ($i=1;$i<4;$i++) { $code.=<<___; ldr $bi,[$b_ptr,8*$i] subs xzr,@acc[0],#1 //adds @acc[0],@acc[0],@tmp[0] umulh @tmp[0],@mod[0],$m0 adcs @acc[1],@acc[1],@tmp[1] umulh @tmp[1],@mod[1],$m0 adcs @acc[2],@acc[2],@tmp[2] umulh @tmp[2],@mod[2],$m0 adcs @acc[3],@acc[3],@tmp[3] umulh @tmp[3],@mod[3],$m0 adc @acc[4],@acc[4],xzr adds @acc[0],@acc[1],@tmp[0] mul @tmp[0],@a[0],$bi adcs @acc[1],@acc[2],@tmp[1] mul @tmp[1],@a[1],$bi adcs @acc[2],@acc[3],@tmp[2] mul @tmp[2],@a[2],$bi adcs @acc[3],@acc[4],@tmp[3] mul @tmp[3],@a[3],$bi adc @acc[4],xzr,xzr adds @acc[0],@acc[0],@tmp[0] umulh @tmp[0],@a[0],$bi adcs @acc[1],@acc[1],@tmp[1] umulh @tmp[1],@a[1],$bi adcs @acc[2],@acc[2],@tmp[2] mul $m0,$n0,@acc[0] umulh @tmp[2],@a[2],$bi adcs @acc[3],@acc[3],@tmp[3] umulh @tmp[3],@a[3],$bi adc @acc[4],@acc[4],xzr adds @acc[1],@acc[1],@tmp[0] //mul @tmp[0],@mod[0],$m0 adcs @acc[2],@acc[2],@tmp[1] mul @tmp[1],@mod[1],$m0 adcs @acc[3],@acc[3],@tmp[2] mul @tmp[2],@mod[2],$m0 adc @acc[4],@acc[4],@tmp[3] mul @tmp[3],@mod[3],$m0 ___ } $code.=<<___; subs xzr,@acc[0],#1 //adds @acc[0],@acc[0],@tmp[0] umulh @tmp[0],@mod[0],$m0 adcs @acc[1],@acc[1],@tmp[1] umulh @tmp[1],@mod[1],$m0 adcs @acc[2],@acc[2],@tmp[2] umulh @tmp[2],@mod[2],$m0 adcs @acc[3],@acc[3],@tmp[3] umulh @tmp[3],@mod[3],$m0 adc @acc[4],@acc[4],xzr adds @acc[0],@acc[1],@tmp[0] adcs @acc[1],@acc[2],@tmp[1] adcs @acc[2],@acc[3],@tmp[2] adcs @acc[3],@acc[4],@tmp[3] adc @acc[4],xzr,xzr subs @tmp[0],@acc[0],@mod[0] sbcs @tmp[1],@acc[1],@mod[1] sbcs @tmp[2],@acc[2],@mod[2] sbcs @tmp[3],@acc[3],@mod[3] sbcs xzr, @acc[4],xzr csel @acc[0],@acc[0],@tmp[0],lo csel @acc[1],@acc[1],@tmp[1],lo csel @acc[2],@acc[2],@tmp[2],lo csel @acc[3],@acc[3],@tmp[3],lo stp @acc[0],@acc[1],[$r_ptr] stp @acc[2],@acc[3],[$r_ptr,#16] ldp c19,c20,[c29,#2*__SIZEOF_POINTER__] ldp c21,c22,[c29,#4*__SIZEOF_POINTER__] ldp c23,c24,[c29,#6*__SIZEOF_POINTER__] ldr c29,[csp],#8*__SIZEOF_POINTER__ ret .size mul_mont_sparse_256,.-mul_mont_sparse_256 ___ { my @acc = (@a,@acc[0..3]); my @a = @mod; $code.=<<___; .globl sqr_mont_sparse_256 .hidden sqr_mont_sparse_256 .type sqr_mont_sparse_256,%function .align 5 sqr_mont_sparse_256: paciasp stp c29,c30,[csp,#-6*__SIZEOF_POINTER__]! add c29,csp,#0 stp c19,c20,[csp,#2*__SIZEOF_POINTER__] stp c21,c22,[csp,#4*__SIZEOF_POINTER__] ldp @a[0],@a[1],[$a_ptr] ldp @a[2],@a[3],[$a_ptr,#16] mov $n0,$n_ptr //////////////////////////////////////////////////////////////// // | | | | | |a1*a0| | // | | | | |a2*a0| | | // | |a3*a2|a3*a0| | | | // | | | |a2*a1| | | | // | | |a3*a1| | | | | // *| | | | | | | | 2| // +|a3*a3|a2*a2|a1*a1|a0*a0| // |--+--+--+--+--+--+--+--| // |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax is @acc[x] // // "can't overflow" below mark carrying into high part of // multiplication result, which can't overflow, because it // can never be all ones. 
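// A rough Python model of the schoolbook squaring below (illustration
// only): every cross product is computed once, the accumulator is
// doubled, and the squares land on the even word positions:
//
//   acc = 0
//   for i in range(4):
//       for j in range(i+1, 4):
//           acc += (a[i]*a[j]) << (64*(i+j))
//   acc *= 2
//   for i in range(4):
//       acc += (a[i]*a[i]) << (128*i)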
mul @acc[1],@a[1],@a[0] // a[1]*a[0] umulh @tmp[1],@a[1],@a[0] mul @acc[2],@a[2],@a[0] // a[2]*a[0] umulh @tmp[2],@a[2],@a[0] mul @acc[3],@a[3],@a[0] // a[3]*a[0] umulh @acc[4],@a[3],@a[0] adds @acc[2],@acc[2],@tmp[1] // accumulate high parts of multiplication mul @tmp[0],@a[2],@a[1] // a[2]*a[1] umulh @tmp[1],@a[2],@a[1] adcs @acc[3],@acc[3],@tmp[2] mul @tmp[2],@a[3],@a[1] // a[3]*a[1] umulh @tmp[3],@a[3],@a[1] adc @acc[4],@acc[4],xzr // can't overflow mul @acc[5],@a[3],@a[2] // a[3]*a[2] umulh @acc[6],@a[3],@a[2] adds @tmp[1],@tmp[1],@tmp[2] // accumulate high parts of multiplication mul @acc[0],@a[0],@a[0] // a[0]*a[0] adc @tmp[2],@tmp[3],xzr // can't overflow adds @acc[3],@acc[3],@tmp[0] // accumulate low parts of multiplication umulh @a[0],@a[0],@a[0] adcs @acc[4],@acc[4],@tmp[1] mul @tmp[1],@a[1],@a[1] // a[1]*a[1] adcs @acc[5],@acc[5],@tmp[2] umulh @a[1],@a[1],@a[1] adc @acc[6],@acc[6],xzr // can't overflow adds @acc[1],@acc[1],@acc[1] // acc[1-6]*=2 mul @tmp[2],@a[2],@a[2] // a[2]*a[2] adcs @acc[2],@acc[2],@acc[2] umulh @a[2],@a[2],@a[2] adcs @acc[3],@acc[3],@acc[3] mul @tmp[3],@a[3],@a[3] // a[3]*a[3] adcs @acc[4],@acc[4],@acc[4] umulh @a[3],@a[3],@a[3] adcs @acc[5],@acc[5],@acc[5] adcs @acc[6],@acc[6],@acc[6] adc @acc[7],xzr,xzr adds @acc[1],@acc[1],@a[0] // +a[i]*a[i] adcs @acc[2],@acc[2],@tmp[1] adcs @acc[3],@acc[3],@a[1] adcs @acc[4],@acc[4],@tmp[2] adcs @acc[5],@acc[5],@a[2] adcs @acc[6],@acc[6],@tmp[3] adc @acc[7],@acc[7],@a[3] bl __mul_by_1_mont_256 ldr c30,[c29,#__SIZEOF_POINTER__] adds @acc[0],@acc[0],@acc[4] // accumulate upper half adcs @acc[1],@acc[1],@acc[5] adcs @acc[2],@acc[2],@acc[6] adcs @acc[3],@acc[3],@acc[7] adc @acc[4],xzr,xzr subs @tmp[0],@acc[0],@mod[0] sbcs @tmp[1],@acc[1],@mod[1] sbcs @tmp[2],@acc[2],@mod[2] sbcs @tmp[3],@acc[3],@mod[3] sbcs xzr, @acc[4],xzr csel @acc[0],@acc[0],@tmp[0],lo csel @acc[1],@acc[1],@tmp[1],lo csel @acc[2],@acc[2],@tmp[2],lo csel @acc[3],@acc[3],@tmp[3],lo stp @acc[0],@acc[1],[$r_ptr] stp @acc[2],@acc[3],[$r_ptr,#16] ldp c19,c20,[c29,#2*__SIZEOF_POINTER__] ldp c21,c22,[c29,#4*__SIZEOF_POINTER__] ldr c29,[csp],#6*__SIZEOF_POINTER__ autiasp ret .size sqr_mont_sparse_256,.-sqr_mont_sparse_256 ___ } { my @a = (@a, $bi); $code.=<<___; .globl from_mont_256 .hidden from_mont_256 .type from_mont_256,%function .align 5 from_mont_256: paciasp stp c29,c30,[csp,#-2*__SIZEOF_POINTER__]! add c29,csp,#0 mov $n0,$n_ptr ldp @a[0],@a[1],[$a_ptr] ldp @a[2],@a[3],[$a_ptr,#16] bl __mul_by_1_mont_256 ldr c30,[c29,#__SIZEOF_POINTER__] subs @tmp[0],@a[0],@mod[0] sbcs @tmp[1],@a[1],@mod[1] sbcs @tmp[2],@a[2],@mod[2] sbcs @tmp[3],@a[3],@mod[3] csel @a[0],@a[0],@tmp[0],lo csel @a[1],@a[1],@tmp[1],lo csel @a[2],@a[2],@tmp[2],lo csel @a[3],@a[3],@tmp[3],lo stp @a[0],@a[1],[$r_ptr] stp @a[2],@a[3],[$r_ptr,#16] ldr c29,[csp],#2*__SIZEOF_POINTER__ autiasp ret .size from_mont_256,.-from_mont_256 .globl redc_mont_256 .hidden redc_mont_256 .type redc_mont_256,%function .align 5 redc_mont_256: paciasp stp c29,c30,[csp,#-2*__SIZEOF_POINTER__]! 
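// Montgomery reduction of a 512-bit input: the bottom 256 bits are
// reduced by __mul_by_1_mont_256, then the top 256 bits are added in
// and the modulus conditionally subtracted.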
add c29,csp,#0 mov $n0,$n_ptr ldp @a[0],@a[1],[$a_ptr] ldp @a[2],@a[3],[$a_ptr,#16] bl __mul_by_1_mont_256 ldr c30,[c29,#__SIZEOF_POINTER__] ldp @tmp[0],@tmp[1],[$a_ptr,#32] ldp @tmp[2],@tmp[3],[$a_ptr,#48] adds @a[0],@a[0],@tmp[0] adcs @a[1],@a[1],@tmp[1] adcs @a[2],@a[2],@tmp[2] adcs @a[3],@a[3],@tmp[3] adc @a[4],xzr,xzr subs @tmp[0],@a[0],@mod[0] sbcs @tmp[1],@a[1],@mod[1] sbcs @tmp[2],@a[2],@mod[2] sbcs @tmp[3],@a[3],@mod[3] sbcs xzr, @a[4],xzr csel @a[0],@a[0],@tmp[0],lo csel @a[1],@a[1],@tmp[1],lo csel @a[2],@a[2],@tmp[2],lo csel @a[3],@a[3],@tmp[3],lo stp @a[0],@a[1],[$r_ptr] stp @a[2],@a[3],[$r_ptr,#16] ldr c29,[csp],#2*__SIZEOF_POINTER__ autiasp ret .size redc_mont_256,.-redc_mont_256 .type __mul_by_1_mont_256,%function .align 5 __mul_by_1_mont_256: mul $m0,$n0,@a[0] ldp @mod[0],@mod[1],[$b_ptr] ldp @mod[2],@mod[3],[$b_ptr,#16] ___ for ($i=1;$i<4;$i++) { $code.=<<___; //mul @tmp[0],@mod[0],$m0 mul @tmp[1],@mod[1],$m0 mul @tmp[2],@mod[2],$m0 mul @tmp[3],@mod[3],$m0 subs xzr,@a[0],#1 //adds @a[0],@a[0],@tmp[0] umulh @tmp[0],@mod[0],$m0 adcs @a[1],@a[1],@tmp[1] umulh @tmp[1],@mod[1],$m0 adcs @a[2],@a[2],@tmp[2] umulh @tmp[2],@mod[2],$m0 adcs @a[3],@a[3],@tmp[3] umulh @tmp[3],@mod[3],$m0 adc @a[4],xzr,xzr adds @a[0],@a[1],@tmp[0] adcs @a[1],@a[2],@tmp[1] adcs @a[2],@a[3],@tmp[2] mul $m0,$n0,@a[0] adc @a[3],@a[4],@tmp[3] ___ } $code.=<<___; //mul @tmp[0],@mod[0],$m0 mul @tmp[1],@mod[1],$m0 mul @tmp[2],@mod[2],$m0 mul @tmp[3],@mod[3],$m0 subs xzr,@a[0],#1 //adds @a[0],@a[0],@tmp[0] umulh @tmp[0],@mod[0],$m0 adcs @a[1],@a[1],@tmp[1] umulh @tmp[1],@mod[1],$m0 adcs @a[2],@a[2],@tmp[2] umulh @tmp[2],@mod[2],$m0 adcs @a[3],@a[3],@tmp[3] umulh @tmp[3],@mod[3],$m0 adc @a[4],xzr,xzr adds @a[0],@a[1],@tmp[0] adcs @a[1],@a[2],@tmp[1] adcs @a[2],@a[3],@tmp[2] adc @a[3],@a[4],@tmp[3] ret .size __mul_by_1_mont_256,.-__mul_by_1_mont_256 ___ } print $code; close STDOUT; ================================================ FILE: src/asm/mul_mont_384-armv8.pl ================================================ #!/usr/bin/env perl # # Copyright Supranational LLC # Licensed under the Apache License, Version 2.0, see LICENSE for details. # SPDX-License-Identifier: Apache-2.0 $flavour = shift; $output = shift; if ($flavour && $flavour ne "void") { $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or die "can't locate arm-xlate.pl"; open STDOUT,"| \"$^X\" $xlate $flavour $output"; } else { open STDOUT,">$output"; } ($r_ptr,$a_ptr,$b_ptr,$n_ptr,$n0) = map("x$_", 0..4); @mod = map("x$_",(5..10)); @a = map("x$_",(11..16)); $bi = "x17"; @acc = map("x$_",(19..25)); @tmp = map("x$_",(26..28,0,1,3)); $code.=<<___; .text .globl add_mod_384x384 .hidden add_mod_384x384 .type add_mod_384x384,%function .align 5 add_mod_384x384: paciasp stp c29,c30,[csp,#-8*__SIZEOF_POINTER__]! 
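// 768-bit addition in which only the upper 384 bits are reduced modulo
// the modulus; the carry out of the lower half simply propagates into
// the upper half (see __add_mod_384x384 below).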
add c29,csp,#0 stp c19,c20,[csp,#2*__SIZEOF_POINTER__] stp c21,c22,[csp,#4*__SIZEOF_POINTER__] stp c23,c24,[csp,#6*__SIZEOF_POINTER__] ldp @mod[0],@mod[1],[$n_ptr] ldp @mod[2],@mod[3],[$n_ptr,#16] ldp @mod[4],@mod[5],[$n_ptr,#32] bl __add_mod_384x384 ldr c30,[c29,#__SIZEOF_POINTER__] ldp c19,c20,[c29,#2*__SIZEOF_POINTER__] ldp c21,c22,[c29,#4*__SIZEOF_POINTER__] ldp c23,c24,[c29,#6*__SIZEOF_POINTER__] ldr c29,[csp],#8*__SIZEOF_POINTER__ autiasp ret .size add_mod_384x384,.-add_mod_384x384 .type __add_mod_384x384,%function .align 5 __add_mod_384x384: ldp @a[0], @a[1], [$a_ptr] ldp @acc[0],@acc[1],[$b_ptr] ldp @a[2], @a[3], [$a_ptr,#16] adds @a[0],@a[0],@acc[0] ldp @acc[2],@acc[3],[$b_ptr,#16] adcs @a[1],@a[1],@acc[1] ldp @a[4], @a[5], [$a_ptr,#32] adcs @a[2],@a[2],@acc[2] ldp @acc[4],@acc[5],[$b_ptr,#32] adcs @a[3],@a[3],@acc[3] stp @a[0], @a[1], [$r_ptr] adcs @a[4],@a[4],@acc[4] ldp @a[0], @a[1], [$a_ptr,#48] adcs @a[5],@a[5],@acc[5] ldp @acc[0],@acc[1],[$b_ptr,#48] stp @a[2], @a[3], [$r_ptr,#16] ldp @a[2], @a[3], [$a_ptr,#64] ldp @acc[2],@acc[3],[$b_ptr,#64] adcs @a[0],@a[0],@acc[0] stp @a[4], @a[5], [$r_ptr,#32] adcs @a[1],@a[1],@acc[1] ldp @a[4], @a[5], [$a_ptr,#80] adcs @a[2],@a[2],@acc[2] ldp @acc[4],@acc[5],[$b_ptr,#80] adcs @a[3],@a[3],@acc[3] adcs @a[4],@a[4],@acc[4] adcs @a[5],@a[5],@acc[5] adc $bi,xzr,xzr subs @acc[0],@a[0],@mod[0] sbcs @acc[1],@a[1],@mod[1] sbcs @acc[2],@a[2],@mod[2] sbcs @acc[3],@a[3],@mod[3] sbcs @acc[4],@a[4],@mod[4] sbcs @acc[5],@a[5],@mod[5] sbcs xzr,$bi,xzr csel @a[0],@a[0],@acc[0],lo csel @a[1],@a[1],@acc[1],lo csel @a[2],@a[2],@acc[2],lo csel @a[3],@a[3],@acc[3],lo stp @a[0],@a[1],[$r_ptr,#48] csel @a[4],@a[4],@acc[4],lo stp @a[2],@a[3],[$r_ptr,#64] csel @a[5],@a[5],@acc[5],lo stp @a[4],@a[5],[$r_ptr,#80] ret .size __add_mod_384x384,.-__add_mod_384x384 .globl sub_mod_384x384 .hidden sub_mod_384x384 .type sub_mod_384x384,%function .align 5 sub_mod_384x384: paciasp stp c29,c30,[csp,#-8*__SIZEOF_POINTER__]! 
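// 768-bit subtraction, the mirror image of add_mod_384x384 above: a
// borrow out of the upper half is neutralized by conditionally adding
// the modulus back to the upper 384 bits (see __sub_mod_384x384 below).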
add c29,csp,#0 stp c19,c20,[csp,#2*__SIZEOF_POINTER__] stp c21,c22,[csp,#4*__SIZEOF_POINTER__] stp c23,c24,[csp,#6*__SIZEOF_POINTER__] ldp @mod[0],@mod[1],[$n_ptr] ldp @mod[2],@mod[3],[$n_ptr,#16] ldp @mod[4],@mod[5],[$n_ptr,#32] bl __sub_mod_384x384 ldr c30,[c29,#__SIZEOF_POINTER__] ldp c19,c20,[c29,#2*__SIZEOF_POINTER__] ldp c21,c22,[c29,#4*__SIZEOF_POINTER__] ldp c23,c24,[c29,#6*__SIZEOF_POINTER__] ldr c29,[csp],#8*__SIZEOF_POINTER__ autiasp ret .size sub_mod_384x384,.-sub_mod_384x384 .type __sub_mod_384x384,%function .align 5 __sub_mod_384x384: ldp @a[0], @a[1], [$a_ptr] ldp @acc[0],@acc[1],[$b_ptr] ldp @a[2], @a[3], [$a_ptr,#16] subs @a[0],@a[0],@acc[0] ldp @acc[2],@acc[3],[$b_ptr,#16] sbcs @a[1],@a[1],@acc[1] ldp @a[4], @a[5], [$a_ptr,#32] sbcs @a[2],@a[2],@acc[2] ldp @acc[4],@acc[5],[$b_ptr,#32] sbcs @a[3],@a[3],@acc[3] stp @a[0], @a[1], [$r_ptr] sbcs @a[4],@a[4],@acc[4] ldp @a[0], @a[1], [$a_ptr,#48] sbcs @a[5],@a[5],@acc[5] ldp @acc[0],@acc[1],[$b_ptr,#48] stp @a[2], @a[3], [$r_ptr,#16] ldp @a[2], @a[3], [$a_ptr,#64] ldp @acc[2],@acc[3],[$b_ptr,#64] sbcs @a[0],@a[0],@acc[0] stp @a[4], @a[5], [$r_ptr,#32] sbcs @a[1],@a[1],@acc[1] ldp @a[4], @a[5], [$a_ptr,#80] sbcs @a[2],@a[2],@acc[2] ldp @acc[4],@acc[5],[$b_ptr,#80] sbcs @a[3],@a[3],@acc[3] sbcs @a[4],@a[4],@acc[4] sbcs @a[5],@a[5],@acc[5] sbc $bi,xzr,xzr and @acc[0],@mod[0],$bi and @acc[1],@mod[1],$bi adds @a[0],@a[0],@acc[0] and @acc[2],@mod[2],$bi adcs @a[1],@a[1],@acc[1] and @acc[3],@mod[3],$bi adcs @a[2],@a[2],@acc[2] and @acc[4],@mod[4],$bi adcs @a[3],@a[3],@acc[3] and @acc[5],@mod[5],$bi adcs @a[4],@a[4],@acc[4] stp @a[0],@a[1],[$r_ptr,#48] adc @a[5],@a[5],@acc[5] stp @a[2],@a[3],[$r_ptr,#64] stp @a[4],@a[5],[$r_ptr,#80] ret .size __sub_mod_384x384,.-__sub_mod_384x384 .type __add_mod_384,%function .align 5 __add_mod_384: ldp @a[0], @a[1], [$a_ptr] ldp @acc[0],@acc[1],[$b_ptr] ldp @a[2], @a[3], [$a_ptr,#16] adds @a[0],@a[0],@acc[0] ldp @acc[2],@acc[3],[$b_ptr,#16] adcs @a[1],@a[1],@acc[1] ldp @a[4], @a[5], [$a_ptr,#32] adcs @a[2],@a[2],@acc[2] ldp @acc[4],@acc[5],[$b_ptr,#32] adcs @a[3],@a[3],@acc[3] adcs @a[4],@a[4],@acc[4] adcs @a[5],@a[5],@acc[5] adc $bi,xzr,xzr subs @acc[0],@a[0],@mod[0] sbcs @acc[1],@a[1],@mod[1] sbcs @acc[2],@a[2],@mod[2] sbcs @acc[3],@a[3],@mod[3] sbcs @acc[4],@a[4],@mod[4] sbcs @acc[5],@a[5],@mod[5] sbcs xzr,$bi,xzr csel @a[0],@a[0],@acc[0],lo csel @a[1],@a[1],@acc[1],lo csel @a[2],@a[2],@acc[2],lo csel @a[3],@a[3],@acc[3],lo csel @a[4],@a[4],@acc[4],lo stp @a[0],@a[1],[$r_ptr] csel @a[5],@a[5],@acc[5],lo stp @a[2],@a[3],[$r_ptr,#16] stp @a[4],@a[5],[$r_ptr,#32] ret .size __add_mod_384,.-__add_mod_384 .type __sub_mod_384,%function .align 5 __sub_mod_384: ldp @a[0], @a[1], [$a_ptr] ldp @acc[0],@acc[1],[$b_ptr] ldp @a[2], @a[3], [$a_ptr,#16] subs @a[0],@a[0],@acc[0] ldp @acc[2],@acc[3],[$b_ptr,#16] sbcs @a[1],@a[1],@acc[1] ldp @a[4], @a[5], [$a_ptr,#32] sbcs @a[2],@a[2],@acc[2] ldp @acc[4],@acc[5],[$b_ptr,#32] sbcs @a[3],@a[3],@acc[3] sbcs @a[4],@a[4],@acc[4] sbcs @a[5],@a[5],@acc[5] sbc $bi,xzr,xzr and @acc[0],@mod[0],$bi and @acc[1],@mod[1],$bi adds @a[0],@a[0],@acc[0] and @acc[2],@mod[2],$bi adcs @a[1],@a[1],@acc[1] and @acc[3],@mod[3],$bi adcs @a[2],@a[2],@acc[2] and @acc[4],@mod[4],$bi adcs @a[3],@a[3],@acc[3] and @acc[5],@mod[5],$bi adcs @a[4],@a[4],@acc[4] stp @a[0],@a[1],[$r_ptr] adc @a[5],@a[5],@acc[5] stp @a[2],@a[3],[$r_ptr,#16] stp @a[4],@a[5],[$r_ptr,#32] ret .size __sub_mod_384,.-__sub_mod_384 .globl mul_mont_384x .hidden mul_mont_384x .type mul_mont_384x,%function .align 5 mul_mont_384x: 
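// Complex multiplication in Fp2 with three 384-bit multiplications in
// the Karatsuba manner: with t0 = a.re*b.re, t1 = a.im*b.im and
// t2 = (a.re+a.im)*(b.re+b.im), and because i^2 = -1,
// ret.re = redc(t0 - t1) and ret.im = redc(t2 - t0 - t1).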
paciasp stp c29,c30,[csp,#-16*__SIZEOF_POINTER__]! add c29,csp,#0 stp c19,c20,[csp,#2*__SIZEOF_POINTER__] stp c21,c22,[csp,#4*__SIZEOF_POINTER__] stp c23,c24,[csp,#6*__SIZEOF_POINTER__] stp c25,c26,[csp,#8*__SIZEOF_POINTER__] stp c27,c28,[csp,#10*__SIZEOF_POINTER__] sub csp,csp,#288 // space for 3 768-bit vectors cmov @tmp[0],$r_ptr // save r_ptr cmov @tmp[1],$a_ptr // save a_ptr cmov @tmp[2],$b_ptr // save b_ptr cadd $r_ptr,sp,#0 // mul_384(t0, a->re, b->re) bl __mul_384 cadd $a_ptr,$a_ptr,#48 // mul_384(t1, a->im, b->im) cadd $b_ptr,$b_ptr,#48 cadd $r_ptr,sp,#96 bl __mul_384 ldp @mod[0],@mod[1],[$n_ptr] ldp @mod[2],@mod[3],[$n_ptr,#16] ldp @mod[4],@mod[5],[$n_ptr,#32] csub $b_ptr,$a_ptr,#48 cadd $r_ptr,sp,#240 bl __add_mod_384 cadd $a_ptr,@tmp[2],#0 cadd $b_ptr,@tmp[2],#48 cadd $r_ptr,sp,#192 // t2 bl __add_mod_384 cadd $a_ptr,$r_ptr,#0 cadd $b_ptr,$r_ptr,#48 bl __mul_384 // mul_384(t2, a->re+a->im, b->re+b->im) ldp @mod[0],@mod[1],[$n_ptr] ldp @mod[2],@mod[3],[$n_ptr,#16] ldp @mod[4],@mod[5],[$n_ptr,#32] cmov $a_ptr,$r_ptr cadd $b_ptr,sp,#0 bl __sub_mod_384x384 cadd $b_ptr,sp,#96 bl __sub_mod_384x384 // t2 = t2-t0-t1 cadd $a_ptr,sp,#0 cadd $b_ptr,sp,#96 cadd $r_ptr,sp,#0 bl __sub_mod_384x384 // t0 = t0-t1 cadd $a_ptr,sp,#0 // ret->re = redc(t0) cadd $r_ptr,@tmp[0],#0 bl __mul_by_1_mont_384 bl __redc_tail_mont_384 cadd $a_ptr,sp,#192 // ret->im = redc(t2) cadd $r_ptr,$r_ptr,#48 bl __mul_by_1_mont_384 bl __redc_tail_mont_384 ldr c30,[c29,#__SIZEOF_POINTER__] add csp,csp,#288 ldp c19,c20,[c29,#2*__SIZEOF_POINTER__] ldp c21,c22,[c29,#4*__SIZEOF_POINTER__] ldp c23,c24,[c29,#6*__SIZEOF_POINTER__] ldp c25,c26,[c29,#8*__SIZEOF_POINTER__] ldp c27,c28,[c29,#10*__SIZEOF_POINTER__] ldr c29,[csp],#16*__SIZEOF_POINTER__ autiasp ret .size mul_mont_384x,.-mul_mont_384x .globl sqr_mont_384x .hidden sqr_mont_384x .type sqr_mont_384x,%function .align 5 sqr_mont_384x: paciasp stp c29,c30,[csp,#-16*__SIZEOF_POINTER__]!
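// Complex squaring in Fp2 takes only two multiplications:
// ret.re = (a.re+a.im)*(a.re-a.im) and ret.im = 2*a.re*a.im.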
add c29,csp,#0 stp c19,c20,[csp,#2*__SIZEOF_POINTER__] stp c21,c22,[csp,#4*__SIZEOF_POINTER__] stp c23,c24,[csp,#6*__SIZEOF_POINTER__] stp c25,c26,[csp,#8*__SIZEOF_POINTER__] stp c27,c28,[csp,#10*__SIZEOF_POINTER__] stp c3,c0,[csp,#12*__SIZEOF_POINTER__] // __mul_mont_384 wants them there sub csp,csp,#96 // space for 2 384-bit vectors mov $n0,$n_ptr // adjust for missing b_ptr ldp @mod[0],@mod[1],[$b_ptr] ldp @mod[2],@mod[3],[$b_ptr,#16] ldp @mod[4],@mod[5],[$b_ptr,#32] cadd $b_ptr,$a_ptr,#48 cadd $r_ptr,sp,#0 bl __add_mod_384 // t0 = a->re + a->im cadd $r_ptr,sp,#48 bl __sub_mod_384 // t1 = a->re - a->im ldp @a[0],@a[1],[$a_ptr] ldr $bi, [$b_ptr] ldp @a[2],@a[3],[$a_ptr,#16] ldp @a[4],@a[5],[$a_ptr,#32] bl __mul_mont_384 // mul_mont_384(ret->im, a->re, a->im) adds @a[0],@a[0],@a[0] // add with itself adcs @a[1],@a[1],@a[1] adcs @a[2],@a[2],@a[2] adcs @a[3],@a[3],@a[3] adcs @a[4],@a[4],@a[4] adcs @a[5],@a[5],@a[5] adc @acc[6],xzr,xzr subs @acc[0],@a[0],@mod[0] sbcs @acc[1],@a[1],@mod[1] sbcs @acc[2],@a[2],@mod[2] sbcs @acc[3],@a[3],@mod[3] sbcs @acc[4],@a[4],@mod[4] sbcs @acc[5],@a[5],@mod[5] sbcs xzr,@acc[6],xzr csel @acc[0],@a[0],@acc[0],lo csel @acc[1],@a[1],@acc[1],lo csel @acc[2],@a[2],@acc[2],lo ldp @a[0],@a[1],[sp] csel @acc[3],@a[3],@acc[3],lo ldr $bi, [sp,#48] csel @acc[4],@a[4],@acc[4],lo ldp @a[2],@a[3],[sp,#16] csel @acc[5],@a[5],@acc[5],lo ldp @a[4],@a[5],[sp,#32] stp @acc[0],@acc[1],[$b_ptr,#48] stp @acc[2],@acc[3],[$b_ptr,#64] stp @acc[4],@acc[5],[$b_ptr,#80] cadd $b_ptr,sp,#48 bl __mul_mont_384 // mul_mont_384(ret->re, t0, t1) ldr c30,[c29,#__SIZEOF_POINTER__] stp @a[0],@a[1],[$b_ptr] stp @a[2],@a[3],[$b_ptr,#16] stp @a[4],@a[5],[$b_ptr,#32] add csp,csp,#96 ldp c19,c20,[c29,#2*__SIZEOF_POINTER__] ldp c21,c22,[c29,#4*__SIZEOF_POINTER__] ldp c23,c24,[c29,#6*__SIZEOF_POINTER__] ldp c25,c26,[c29,#8*__SIZEOF_POINTER__] ldp c27,c28,[c29,#10*__SIZEOF_POINTER__] ldr c29,[csp],#16*__SIZEOF_POINTER__ autiasp ret .size sqr_mont_384x,.-sqr_mont_384x .globl mul_mont_384 .hidden mul_mont_384 .type mul_mont_384,%function .align 5 mul_mont_384: paciasp stp c29,c30,[csp,#-16*__SIZEOF_POINTER__]! 
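// Plain Montgomery multiplication, i.e. a*b*2^-384 mod n. r_ptr and n0
// are stashed in the frame up front because __mul_mont_384 consumes
// nearly every general-purpose register.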
add c29,csp,#0 stp c19,c20,[csp,#2*__SIZEOF_POINTER__] stp c21,c22,[csp,#4*__SIZEOF_POINTER__] stp c23,c24,[csp,#6*__SIZEOF_POINTER__] stp c25,c26,[csp,#8*__SIZEOF_POINTER__] stp c27,c28,[csp,#10*__SIZEOF_POINTER__] stp c4,c0,[csp,#12*__SIZEOF_POINTER__] // __mul_mont_384 wants them there ldp @a[0],@a[1],[$a_ptr] ldr $bi, [$b_ptr] ldp @a[2],@a[3],[$a_ptr,#16] ldp @a[4],@a[5],[$a_ptr,#32] ldp @mod[0],@mod[1],[$n_ptr] ldp @mod[2],@mod[3],[$n_ptr,#16] ldp @mod[4],@mod[5],[$n_ptr,#32] bl __mul_mont_384 ldr c30,[c29,#__SIZEOF_POINTER__] stp @a[0],@a[1],[$b_ptr] stp @a[2],@a[3],[$b_ptr,#16] stp @a[4],@a[5],[$b_ptr,#32] ldp c19,c20,[c29,#2*__SIZEOF_POINTER__] ldp c21,c22,[c29,#4*__SIZEOF_POINTER__] ldp c23,c24,[c29,#6*__SIZEOF_POINTER__] ldp c25,c26,[c29,#8*__SIZEOF_POINTER__] ldp c27,c28,[c29,#10*__SIZEOF_POINTER__] ldr c29,[csp],#16*__SIZEOF_POINTER__ autiasp ret .size mul_mont_384,.-mul_mont_384 .type __mul_mont_384,%function .align 5 __mul_mont_384: mul @acc[0],@a[0],$bi mul @acc[1],@a[1],$bi mul @acc[2],@a[2],$bi mul @acc[3],@a[3],$bi mul @acc[4],@a[4],$bi mul @acc[5],@a[5],$bi mul $n0,$n0,@acc[0] umulh @tmp[0],@a[0],$bi umulh @tmp[1],@a[1],$bi umulh @tmp[2],@a[2],$bi umulh @tmp[3],@a[3],$bi umulh @tmp[4],@a[4],$bi umulh @tmp[5],@a[5],$bi adds @acc[1],@acc[1],@tmp[0] // mul @tmp[0],@mod[0],$n0 adcs @acc[2],@acc[2],@tmp[1] mul @tmp[1],@mod[1],$n0 adcs @acc[3],@acc[3],@tmp[2] mul @tmp[2],@mod[2],$n0 adcs @acc[4],@acc[4],@tmp[3] mul @tmp[3],@mod[3],$n0 adcs @acc[5],@acc[5],@tmp[4] mul @tmp[4],@mod[4],$n0 adc @acc[6],xzr, @tmp[5] mul @tmp[5],@mod[5],$n0 mov $bi,xzr ___ for ($i=1;$i<6;$i++) { $code.=<<___; subs xzr,@acc[0],#1 // adds @acc[0],@acc[0],@tmp[0] umulh @tmp[0],@mod[0],$n0 adcs @acc[1],@acc[1],@tmp[1] umulh @tmp[1],@mod[1],$n0 adcs @acc[2],@acc[2],@tmp[2] umulh @tmp[2],@mod[2],$n0 adcs @acc[3],@acc[3],@tmp[3] umulh @tmp[3],@mod[3],$n0 adcs @acc[4],@acc[4],@tmp[4] umulh @tmp[4],@mod[4],$n0 adcs @acc[5],@acc[5],@tmp[5] umulh @tmp[5],@mod[5],$n0 adcs @acc[6],@acc[6],xzr adc $n0,$bi,xzr ldr $bi,[$b_ptr,8*$i] adds @acc[0],@acc[1],@tmp[0] mul @tmp[0],@a[0],$bi adcs @acc[1],@acc[2],@tmp[1] mul @tmp[1],@a[1],$bi adcs @acc[2],@acc[3],@tmp[2] mul @tmp[2],@a[2],$bi adcs @acc[3],@acc[4],@tmp[3] mul @tmp[3],@a[3],$bi adcs @acc[4],@acc[5],@tmp[4] mul @tmp[4],@a[4],$bi adcs @acc[5],@acc[6],@tmp[5] mul @tmp[5],@a[5],$bi adc @acc[6],$n0,xzr ldr $n0,[x29,#12*__SIZEOF_POINTER__] adds @acc[0],@acc[0],@tmp[0] umulh @tmp[0],@a[0],$bi adcs @acc[1],@acc[1],@tmp[1] umulh @tmp[1],@a[1],$bi adcs @acc[2],@acc[2],@tmp[2] mul $n0,$n0,@acc[0] umulh @tmp[2],@a[2],$bi adcs @acc[3],@acc[3],@tmp[3] umulh @tmp[3],@a[3],$bi adcs @acc[4],@acc[4],@tmp[4] umulh @tmp[4],@a[4],$bi adcs @acc[5],@acc[5],@tmp[5] umulh @tmp[5],@a[5],$bi adcs @acc[6],@acc[6],xzr adc $bi,xzr,xzr adds @acc[1],@acc[1],@tmp[0] // mul @tmp[0],@mod[0],$n0 adcs @acc[2],@acc[2],@tmp[1] mul @tmp[1],@mod[1],$n0 adcs @acc[3],@acc[3],@tmp[2] mul @tmp[2],@mod[2],$n0 adcs @acc[4],@acc[4],@tmp[3] mul @tmp[3],@mod[3],$n0 adcs @acc[5],@acc[5],@tmp[4] mul @tmp[4],@mod[4],$n0 adcs @acc[6],@acc[6],@tmp[5] mul @tmp[5],@mod[5],$n0 adc $bi,$bi,xzr ___ } $code.=<<___; subs xzr,@acc[0],#1 // adds @acc[0],@acc[0],@tmp[0] umulh @tmp[0],@mod[0],$n0 adcs @acc[1],@acc[1],@tmp[1] umulh @tmp[1],@mod[1],$n0 adcs @acc[2],@acc[2],@tmp[2] umulh @tmp[2],@mod[2],$n0 adcs @acc[3],@acc[3],@tmp[3] umulh @tmp[3],@mod[3],$n0 adcs @acc[4],@acc[4],@tmp[4] umulh @tmp[4],@mod[4],$n0 adcs @acc[5],@acc[5],@tmp[5] umulh @tmp[5],@mod[5],$n0 adcs @acc[6],@acc[6],xzr ldp 
c4,c2,[c29,#12*__SIZEOF_POINTER__] // pull r_ptr adc $bi,$bi,xzr adds @acc[0],@acc[1],@tmp[0] adcs @acc[1],@acc[2],@tmp[1] adcs @acc[2],@acc[3],@tmp[2] adcs @acc[3],@acc[4],@tmp[3] adcs @acc[4],@acc[5],@tmp[4] adcs @acc[5],@acc[6],@tmp[5] adc @acc[6],$bi,xzr subs @tmp[0],@acc[0],@mod[0] sbcs @tmp[1],@acc[1],@mod[1] sbcs @tmp[2],@acc[2],@mod[2] sbcs @tmp[3],@acc[3],@mod[3] sbcs @tmp[4],@acc[4],@mod[4] sbcs @tmp[5],@acc[5],@mod[5] sbcs xzr, @acc[6],xzr csel @a[0],@acc[0],@tmp[0],lo csel @a[1],@acc[1],@tmp[1],lo csel @a[2],@acc[2],@tmp[2],lo csel @a[3],@acc[3],@tmp[3],lo csel @a[4],@acc[4],@tmp[4],lo csel @a[5],@acc[5],@tmp[5],lo ret .size __mul_mont_384,.-__mul_mont_384 .globl sqr_mont_384 .hidden sqr_mont_384 .type sqr_mont_384,%function .align 5 sqr_mont_384: paciasp stp c29,c30,[csp,#-16*__SIZEOF_POINTER__]! add c29,csp,#0 stp c19,c20,[csp,#2*__SIZEOF_POINTER__] stp c21,c22,[csp,#4*__SIZEOF_POINTER__] stp c23,c24,[csp,#6*__SIZEOF_POINTER__] stp c25,c26,[csp,#8*__SIZEOF_POINTER__] stp c27,c28,[csp,#10*__SIZEOF_POINTER__] sub csp,csp,#96 // space for 768-bit vector cmov $n0,$n_ptr // adjust for missing b_ptr cmov $n_ptr,$r_ptr // save r_ptr cmov $r_ptr,sp ldp @a[0],@a[1],[$a_ptr] ldp @a[2],@a[3],[$a_ptr,#16] ldp @a[4],@a[5],[$a_ptr,#32] bl __sqr_384 ldp @mod[0],@mod[1],[$b_ptr] ldp @mod[2],@mod[3],[$b_ptr,#16] ldp @mod[4],@mod[5],[$b_ptr,#32] cmov $a_ptr,sp cmov $r_ptr,$n_ptr // restore r_ptr bl __mul_by_1_mont_384 bl __redc_tail_mont_384 ldr c30,[c29,#__SIZEOF_POINTER__] add csp,csp,#96 ldp c19,c20,[c29,#2*__SIZEOF_POINTER__] ldp c21,c22,[c29,#4*__SIZEOF_POINTER__] ldp c23,c24,[c29,#6*__SIZEOF_POINTER__] ldp c25,c26,[c29,#8*__SIZEOF_POINTER__] ldp c27,c28,[c29,#10*__SIZEOF_POINTER__] ldr c29,[csp],#16*__SIZEOF_POINTER__ autiasp ret .size sqr_mont_384,.-sqr_mont_384 .globl sqr_n_mul_mont_383 .hidden sqr_n_mul_mont_383 .type sqr_n_mul_mont_383,%function .align 5 sqr_n_mul_mont_383: paciasp stp c29,c30,[csp,#-16*__SIZEOF_POINTER__]! 
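// The requested number of squarings followed by one Montgomery
// multiplication, as used in inversion and exponentiation chains.
// Inside the loop the conditional final subtraction is omitted and the
// upper half merely accumulated, which keeps intermediate values below
// 2^384 provided the modulus is at most 383 bits wide (hence the _383
// suffix).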
add c29,csp,#0 stp c19,c20,[csp,#2*__SIZEOF_POINTER__] stp c21,c22,[csp,#4*__SIZEOF_POINTER__] stp c23,c24,[csp,#6*__SIZEOF_POINTER__] stp c25,c26,[csp,#8*__SIZEOF_POINTER__] stp c27,c28,[csp,#10*__SIZEOF_POINTER__] stp c4,c0,[csp,#12*__SIZEOF_POINTER__] // __mul_mont_384 wants them there sub csp,csp,#96 // space for 768-bit vector cmov $bi,x5 // save b_ptr ldp @a[0],@a[1],[$a_ptr] ldp @a[2],@a[3],[$a_ptr,#16] ldp @a[4],@a[5],[$a_ptr,#32] cmov $r_ptr,sp .Loop_sqr_383: bl __sqr_384 sub $b_ptr,$b_ptr,#1 // counter ldp @mod[0],@mod[1],[$n_ptr] ldp @mod[2],@mod[3],[$n_ptr,#16] ldp @mod[4],@mod[5],[$n_ptr,#32] cmov $a_ptr,sp bl __mul_by_1_mont_384 ldp @acc[0],@acc[1],[$a_ptr,#48] ldp @acc[2],@acc[3],[$a_ptr,#64] ldp @acc[4],@acc[5],[$a_ptr,#80] adds @a[0],@a[0],@acc[0] // just accumulate upper half adcs @a[1],@a[1],@acc[1] adcs @a[2],@a[2],@acc[2] adcs @a[3],@a[3],@acc[3] adcs @a[4],@a[4],@acc[4] adc @a[5],@a[5],@acc[5] cbnz $b_ptr,.Loop_sqr_383 cmov $b_ptr,$bi ldr $bi,[$bi] bl __mul_mont_384 ldr c30,[c29,#__SIZEOF_POINTER__] stp @a[0],@a[1],[$b_ptr] stp @a[2],@a[3],[$b_ptr,#16] stp @a[4],@a[5],[$b_ptr,#32] add csp,csp,#96 ldp c19,c20,[c29,#2*__SIZEOF_POINTER__] ldp c21,c22,[c29,#4*__SIZEOF_POINTER__] ldp c23,c24,[c29,#6*__SIZEOF_POINTER__] ldp c25,c26,[c29,#8*__SIZEOF_POINTER__] ldp c27,c28,[c29,#10*__SIZEOF_POINTER__] ldr c29,[csp],#16*__SIZEOF_POINTER__ autiasp ret .size sqr_n_mul_mont_383,.-sqr_n_mul_mont_383 ___ { my @acc=(@acc,@tmp[0..2]); $code.=<<___; .type __sqr_384,%function .align 5 __sqr_384: mul @acc[0],@a[1],@a[0] mul @acc[1],@a[2],@a[0] mul @acc[2],@a[3],@a[0] mul @acc[3],@a[4],@a[0] mul @acc[4],@a[5],@a[0] umulh @mod[1],@a[1],@a[0] umulh @mod[2],@a[2],@a[0] umulh @mod[3],@a[3],@a[0] umulh @mod[4],@a[4],@a[0] adds @acc[1],@acc[1],@mod[1] umulh @mod[5],@a[5],@a[0] adcs @acc[2],@acc[2],@mod[2] mul @mod[2],@a[2],@a[1] adcs @acc[3],@acc[3],@mod[3] mul @mod[3],@a[3],@a[1] adcs @acc[4],@acc[4],@mod[4] mul @mod[4],@a[4],@a[1] adc @acc[5],xzr, @mod[5] mul @mod[5],@a[5],@a[1] adds @acc[2],@acc[2],@mod[2] umulh @mod[2],@a[2],@a[1] adcs @acc[3],@acc[3],@mod[3] umulh @mod[3],@a[3],@a[1] adcs @acc[4],@acc[4],@mod[4] umulh @mod[4],@a[4],@a[1] adcs @acc[5],@acc[5],@mod[5] umulh @mod[5],@a[5],@a[1] adc @acc[6],xzr,xzr mul @mod[0],@a[0],@a[0] adds @acc[3],@acc[3],@mod[2] umulh @a[0], @a[0],@a[0] adcs @acc[4],@acc[4],@mod[3] mul @mod[3],@a[3],@a[2] adcs @acc[5],@acc[5],@mod[4] mul @mod[4],@a[4],@a[2] adc @acc[6],@acc[6],@mod[5] mul @mod[5],@a[5],@a[2] adds @acc[4],@acc[4],@mod[3] umulh @mod[3],@a[3],@a[2] adcs @acc[5],@acc[5],@mod[4] umulh @mod[4],@a[4],@a[2] adcs @acc[6],@acc[6],@mod[5] umulh @mod[5],@a[5],@a[2] adc @acc[7],xzr,xzr mul @mod[1],@a[1],@a[1] adds @acc[5],@acc[5],@mod[3] umulh @a[1], @a[1],@a[1] adcs @acc[6],@acc[6],@mod[4] mul @mod[4],@a[4],@a[3] adc @acc[7],@acc[7],@mod[5] mul @mod[5],@a[5],@a[3] adds @acc[6],@acc[6],@mod[4] umulh @mod[4],@a[4],@a[3] adcs @acc[7],@acc[7],@mod[5] umulh @mod[5],@a[5],@a[3] adc @acc[8],xzr,xzr mul @mod[2],@a[2],@a[2] adds @acc[7],@acc[7],@mod[4] umulh @a[2], @a[2],@a[2] adc @acc[8],@acc[8],@mod[5] mul @mod[3],@a[3],@a[3] mul @mod[5],@a[5],@a[4] umulh @a[3], @a[3],@a[3] adds @acc[8],@acc[8],@mod[5] umulh @mod[5],@a[5],@a[4] mul @mod[4],@a[4],@a[4] adc @acc[9],@mod[5],xzr adds @acc[0],@acc[0],@acc[0] adcs @acc[1],@acc[1],@acc[1] adcs @acc[2],@acc[2],@acc[2] adcs @acc[3],@acc[3],@acc[3] adcs @acc[4],@acc[4],@acc[4] adcs @acc[5],@acc[5],@acc[5] adcs @acc[6],@acc[6],@acc[6] adcs @acc[7],@acc[7],@acc[7] umulh @a[4], @a[4],@a[4] adcs @acc[8],@acc[8],@acc[8] 
mul @mod[5],@a[5],@a[5] adcs @acc[9],@acc[9],@acc[9] umulh @a[5], @a[5],@a[5] adc $a_ptr,xzr,xzr adds @acc[0],@acc[0],@a[0] adcs @acc[1],@acc[1],@mod[1] adcs @acc[2],@acc[2],@a[1] adcs @acc[3],@acc[3],@mod[2] adcs @acc[4],@acc[4],@a[2] adcs @acc[5],@acc[5],@mod[3] adcs @acc[6],@acc[6],@a[3] stp @mod[0],@acc[0],[$r_ptr] adcs @acc[7],@acc[7],@mod[4] stp @acc[1],@acc[2],[$r_ptr,#16] adcs @acc[8],@acc[8],@a[4] stp @acc[3],@acc[4],[$r_ptr,#32] adcs @acc[9],@acc[9],@mod[5] stp @acc[5],@acc[6],[$r_ptr,#48] adc @a[5],@a[5],$a_ptr stp @acc[7],@acc[8],[$r_ptr,#64] stp @acc[9],@a[5],[$r_ptr,#80] ret .size __sqr_384,.-__sqr_384 ___ } $code.=<<___; .globl sqr_384 .hidden sqr_384 .type sqr_384,%function .align 5 sqr_384: paciasp stp c29,c30,[csp,#-16*__SIZEOF_POINTER__]! add c29,csp,#0 stp c19,c20,[csp,#2*__SIZEOF_POINTER__] stp c21,c22,[csp,#4*__SIZEOF_POINTER__] stp c23,c24,[csp,#6*__SIZEOF_POINTER__] stp c25,c26,[csp,#8*__SIZEOF_POINTER__] stp c27,c28,[csp,#10*__SIZEOF_POINTER__] ldp @a[0],@a[1],[$a_ptr] ldp @a[2],@a[3],[$a_ptr,#16] ldp @a[4],@a[5],[$a_ptr,#32] bl __sqr_384 ldr c30,[c29,#__SIZEOF_POINTER__] ldp c19,c20,[c29,#2*__SIZEOF_POINTER__] ldp c21,c22,[c29,#4*__SIZEOF_POINTER__] ldp c23,c24,[c29,#6*__SIZEOF_POINTER__] ldp c25,c26,[c29,#8*__SIZEOF_POINTER__] ldp c27,c28,[c29,#10*__SIZEOF_POINTER__] ldr c29,[csp],#16*__SIZEOF_POINTER__ autiasp ret .size sqr_384,.-sqr_384 .globl redc_mont_384 .hidden redc_mont_384 .type redc_mont_384,%function .align 5 redc_mont_384: paciasp stp c29,c30,[csp,#-16*__SIZEOF_POINTER__]! add c29,csp,#0 stp c19,c20,[csp,#2*__SIZEOF_POINTER__] stp c21,c22,[csp,#4*__SIZEOF_POINTER__] stp c23,c24,[csp,#6*__SIZEOF_POINTER__] stp c25,c26,[csp,#8*__SIZEOF_POINTER__] stp c27,c28,[csp,#10*__SIZEOF_POINTER__] mov $n0,$n_ptr // adjust for missing b_ptr ldp @mod[0],@mod[1],[$b_ptr] ldp @mod[2],@mod[3],[$b_ptr,#16] ldp @mod[4],@mod[5],[$b_ptr,#32] bl __mul_by_1_mont_384 bl __redc_tail_mont_384 ldr c30,[c29,#__SIZEOF_POINTER__] ldp c19,c20,[c29,#2*__SIZEOF_POINTER__] ldp c21,c22,[c29,#4*__SIZEOF_POINTER__] ldp c23,c24,[c29,#6*__SIZEOF_POINTER__] ldp c25,c26,[c29,#8*__SIZEOF_POINTER__] ldp c27,c28,[c29,#10*__SIZEOF_POINTER__] ldr c29,[csp],#16*__SIZEOF_POINTER__ autiasp ret .size redc_mont_384,.-redc_mont_384 .globl from_mont_384 .hidden from_mont_384 .type from_mont_384,%function .align 5 from_mont_384: paciasp stp c29,c30,[csp,#-16*__SIZEOF_POINTER__]! 
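// from_mont_384: multiply by 1 in the Montgomery domain, i.e. by 2^-384
// modulo n, then bring the result into the canonical [0,n) range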
add c29,csp,#0 stp c19,c20,[csp,#2*__SIZEOF_POINTER__] stp c21,c22,[csp,#4*__SIZEOF_POINTER__] stp c23,c24,[csp,#6*__SIZEOF_POINTER__] stp c25,c26,[csp,#8*__SIZEOF_POINTER__] stp c27,c28,[csp,#10*__SIZEOF_POINTER__] mov $n0,$n_ptr // adjust for missing b_ptr ldp @mod[0],@mod[1],[$b_ptr] ldp @mod[2],@mod[3],[$b_ptr,#16] ldp @mod[4],@mod[5],[$b_ptr,#32] bl __mul_by_1_mont_384 ldr c30,[c29,#__SIZEOF_POINTER__] subs @acc[0],@a[0],@mod[0] sbcs @acc[1],@a[1],@mod[1] sbcs @acc[2],@a[2],@mod[2] sbcs @acc[3],@a[3],@mod[3] sbcs @acc[4],@a[4],@mod[4] sbcs @acc[5],@a[5],@mod[5] csel @a[0],@a[0],@acc[0],lo csel @a[1],@a[1],@acc[1],lo csel @a[2],@a[2],@acc[2],lo csel @a[3],@a[3],@acc[3],lo csel @a[4],@a[4],@acc[4],lo csel @a[5],@a[5],@acc[5],lo stp @a[0],@a[1],[$r_ptr] stp @a[2],@a[3],[$r_ptr,#16] stp @a[4],@a[5],[$r_ptr,#32] ldp c19,c20,[c29,#2*__SIZEOF_POINTER__] ldp c21,c22,[c29,#4*__SIZEOF_POINTER__] ldp c23,c24,[c29,#6*__SIZEOF_POINTER__] ldp c25,c26,[c29,#8*__SIZEOF_POINTER__] ldp c27,c28,[c29,#10*__SIZEOF_POINTER__] ldr c29,[csp],#16*__SIZEOF_POINTER__ autiasp ret .size from_mont_384,.-from_mont_384 .type __mul_by_1_mont_384,%function .align 5 __mul_by_1_mont_384: ldp @a[0],@a[1],[$a_ptr] ldp @a[2],@a[3],[$a_ptr,#16] mul @tmp[0],$n0,@a[0] ldp @a[4],@a[5],[$a_ptr,#32] // mul @acc[0],@mod[0],@tmp[0] mul @acc[1],@mod[1],@tmp[0] mul @acc[2],@mod[2],@tmp[0] mul @acc[3],@mod[3],@tmp[0] mul @acc[4],@mod[4],@tmp[0] mul @acc[5],@mod[5],@tmp[0] subs xzr,@a[0],#1 // adds @acc[0],@acc[0],@a[0] umulh @a[0],@mod[0],@tmp[0] adcs @acc[1],@acc[1],@a[1] umulh @a[1],@mod[1],@tmp[0] adcs @acc[2],@acc[2],@a[2] umulh @a[2],@mod[2],@tmp[0] adcs @acc[3],@acc[3],@a[3] umulh @a[3],@mod[3],@tmp[0] adcs @acc[4],@acc[4],@a[4] umulh @a[4],@mod[4],@tmp[0] adcs @acc[5],@acc[5],@a[5] umulh @a[5],@mod[5],@tmp[0] adc @acc[6],xzr,xzr ___ for ($i=1;$i<6;$i++) { $code.=<<___; adds @a[0],@a[0],@acc[1] adcs @a[1],@a[1],@acc[2] adcs @a[2],@a[2],@acc[3] mul @tmp[0],$n0,@a[0] adcs @a[3],@a[3],@acc[4] adcs @a[4],@a[4],@acc[5] adc @a[5],@a[5],@acc[6] // mul @acc[0],@mod[0],@tmp[0] mul @acc[1],@mod[1],@tmp[0] mul @acc[2],@mod[2],@tmp[0] mul @acc[3],@mod[3],@tmp[0] mul @acc[4],@mod[4],@tmp[0] mul @acc[5],@mod[5],@tmp[0] subs xzr,@a[0],#1 // adds @acc[0],@acc[0],@a[0] umulh @a[0],@mod[0],@tmp[0] adcs @acc[1],@acc[1],@a[1] umulh @a[1],@mod[1],@tmp[0] adcs @acc[2],@acc[2],@a[2] umulh @a[2],@mod[2],@tmp[0] adcs @acc[3],@acc[3],@a[3] umulh @a[3],@mod[3],@tmp[0] adcs @acc[4],@acc[4],@a[4] umulh @a[4],@mod[4],@tmp[0] adcs @acc[5],@acc[5],@a[5] umulh @a[5],@mod[5],@tmp[0] adc @acc[6],xzr,xzr ___ } $code.=<<___; adds @a[0],@a[0],@acc[1] adcs @a[1],@a[1],@acc[2] adcs @a[2],@a[2],@acc[3] adcs @a[3],@a[3],@acc[4] adcs @a[4],@a[4],@acc[5] adc @a[5],@a[5],@acc[6] ret .size __mul_by_1_mont_384,.-__mul_by_1_mont_384 .type __redc_tail_mont_384,%function .align 5 __redc_tail_mont_384: ldp @acc[0],@acc[1],[$a_ptr,#48] ldp @acc[2],@acc[3],[$a_ptr,#64] ldp @acc[4],@acc[5],[$a_ptr,#80] adds @a[0],@a[0],@acc[0] // accumulate upper half adcs @a[1],@a[1],@acc[1] adcs @a[2],@a[2],@acc[2] adcs @a[3],@a[3],@acc[3] adcs @a[4],@a[4],@acc[4] adcs @a[5],@a[5],@acc[5] adc @acc[6],xzr,xzr subs @acc[0],@a[0],@mod[0] sbcs @acc[1],@a[1],@mod[1] sbcs @acc[2],@a[2],@mod[2] sbcs @acc[3],@a[3],@mod[3] sbcs @acc[4],@a[4],@mod[4] sbcs @acc[5],@a[5],@mod[5] sbcs xzr,@acc[6],xzr csel @a[0],@a[0],@acc[0],lo csel @a[1],@a[1],@acc[1],lo csel @a[2],@a[2],@acc[2],lo csel @a[3],@a[3],@acc[3],lo csel @a[4],@a[4],@acc[4],lo csel @a[5],@a[5],@acc[5],lo stp @a[0],@a[1],[$r_ptr] stp 
@a[2],@a[3],[$r_ptr,#16] stp @a[4],@a[5],[$r_ptr,#32] ret .size __redc_tail_mont_384,.-__redc_tail_mont_384 .globl mul_384 .hidden mul_384 .type mul_384,%function .align 5 mul_384: paciasp stp c29,c30,[csp,#-16*__SIZEOF_POINTER__]! add c29,csp,#0 stp c19,c20,[csp,#2*__SIZEOF_POINTER__] stp c21,c22,[csp,#4*__SIZEOF_POINTER__] stp c23,c24,[csp,#6*__SIZEOF_POINTER__] stp c25,c26,[csp,#8*__SIZEOF_POINTER__] stp c27,c28,[csp,#10*__SIZEOF_POINTER__] bl __mul_384 ldr c30,[c29,#__SIZEOF_POINTER__] ldp c19,c20,[c29,#2*__SIZEOF_POINTER__] ldp c21,c22,[c29,#4*__SIZEOF_POINTER__] ldp c23,c24,[c29,#6*__SIZEOF_POINTER__] ldp c25,c26,[c29,#8*__SIZEOF_POINTER__] ldp c27,c28,[c29,#10*__SIZEOF_POINTER__] ldr c29,[csp],#16*__SIZEOF_POINTER__ autiasp ret .size mul_384,.-mul_384 .type __mul_384,%function .align 5 __mul_384: ldp @a[0],@a[1],[$a_ptr] ldr $bi, [$b_ptr] ldp @a[2],@a[3],[$a_ptr,#16] ldp @a[4],@a[5],[$a_ptr,#32] mul @acc[0],@a[0],$bi mul @acc[1],@a[1],$bi mul @acc[2],@a[2],$bi mul @acc[3],@a[3],$bi mul @acc[4],@a[4],$bi mul @acc[5],@a[5],$bi umulh @mod[0],@a[0],$bi umulh @mod[1],@a[1],$bi umulh @mod[2],@a[2],$bi umulh @mod[3],@a[3],$bi umulh @mod[4],@a[4],$bi umulh @mod[5],@a[5],$bi ldr $bi,[$b_ptr,8*1] str @acc[0],[$r_ptr] adds @acc[0],@acc[1],@mod[0] mul @mod[0],@a[0],$bi adcs @acc[1],@acc[2],@mod[1] mul @mod[1],@a[1],$bi adcs @acc[2],@acc[3],@mod[2] mul @mod[2],@a[2],$bi adcs @acc[3],@acc[4],@mod[3] mul @mod[3],@a[3],$bi adcs @acc[4],@acc[5],@mod[4] mul @mod[4],@a[4],$bi adc @acc[5],xzr, @mod[5] mul @mod[5],@a[5],$bi ___ for ($i=1;$i<5;$i++) { $code.=<<___; adds @acc[0],@acc[0],@mod[0] umulh @mod[0],@a[0],$bi adcs @acc[1],@acc[1],@mod[1] umulh @mod[1],@a[1],$bi adcs @acc[2],@acc[2],@mod[2] umulh @mod[2],@a[2],$bi adcs @acc[3],@acc[3],@mod[3] umulh @mod[3],@a[3],$bi adcs @acc[4],@acc[4],@mod[4] umulh @mod[4],@a[4],$bi adcs @acc[5],@acc[5],@mod[5] umulh @mod[5],@a[5],$bi ldr $bi,[$b_ptr,#8*($i+1)] adc @acc[6],xzr,xzr str @acc[0],[$r_ptr,8*$i] adds @acc[0],@acc[1],@mod[0] mul @mod[0],@a[0],$bi adcs @acc[1],@acc[2],@mod[1] mul @mod[1],@a[1],$bi adcs @acc[2],@acc[3],@mod[2] mul @mod[2],@a[2],$bi adcs @acc[3],@acc[4],@mod[3] mul @mod[3],@a[3],$bi adcs @acc[4],@acc[5],@mod[4] mul @mod[4],@a[4],$bi adc @acc[5],@acc[6],@mod[5] mul @mod[5],@a[5],$bi ___ } $code.=<<___; adds @acc[0],@acc[0],@mod[0] umulh @mod[0],@a[0],$bi adcs @acc[1],@acc[1],@mod[1] umulh @mod[1],@a[1],$bi adcs @acc[2],@acc[2],@mod[2] umulh @mod[2],@a[2],$bi adcs @acc[3],@acc[3],@mod[3] umulh @mod[3],@a[3],$bi adcs @acc[4],@acc[4],@mod[4] umulh @mod[4],@a[4],$bi adcs @acc[5],@acc[5],@mod[5] umulh @mod[5],@a[5],$bi adc @acc[6],xzr,xzr str @acc[0],[$r_ptr,8*$i] adds @acc[0],@acc[1],@mod[0] adcs @acc[1],@acc[2],@mod[1] adcs @acc[2],@acc[3],@mod[2] adcs @acc[3],@acc[4],@mod[3] adcs @acc[4],@acc[5],@mod[4] adc @acc[5],@acc[6],@mod[5] stp @acc[0],@acc[1],[$r_ptr,#48] stp @acc[2],@acc[3],[$r_ptr,#64] stp @acc[4],@acc[5],[$r_ptr,#80] ret .size __mul_384,.-__mul_384 .globl mul_382x .hidden mul_382x .type mul_382x,%function .align 5 mul_382x: paciasp stp c29,c30,[csp,#-16*__SIZEOF_POINTER__]! 
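// mul_382x: complex multiplication with lazy reduction; the three
// 768-bit products below stay double-width, and the subtractions are
// performed modulo n<<384 by __sub_mod_384x384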
add c29,csp,#0 stp c19,c20,[csp,#2*__SIZEOF_POINTER__] stp c21,c22,[csp,#4*__SIZEOF_POINTER__] stp c23,c24,[csp,#6*__SIZEOF_POINTER__] stp c25,c26,[csp,#8*__SIZEOF_POINTER__] stp c27,c28,[csp,#10*__SIZEOF_POINTER__] sub csp,csp,#96 // space for two 384-bit vectors ldp @a[0],@a[1],[$a_ptr] cmov @tmp[0],$r_ptr // save r_ptr ldp @acc[0],@acc[1],[$a_ptr,#48] cmov @tmp[1],$a_ptr // save a_ptr ldp @a[2],@a[3],[$a_ptr,#16] cmov @tmp[2],$b_ptr // save b_ptr ldp @acc[2],@acc[3],[$a_ptr,#64] ldp @a[4],@a[5],[$a_ptr,#32] adds @mod[0],$a[0],@acc[0] // t0 = a->re + a->im ldp @acc[4],@acc[5],[$a_ptr,#80] adcs @mod[1],$a[1],@acc[1] ldp @a[0],@a[1],[$b_ptr] adcs @mod[2],$a[2],@acc[2] ldp @acc[0],@acc[1],[$b_ptr,#48] adcs @mod[3],$a[3],@acc[3] ldp @a[2],@a[3],[$b_ptr,#16] adcs @mod[4],$a[4],@acc[4] ldp @acc[2],@acc[3],[$b_ptr,#64] adc @mod[5],$a[5],@acc[5] ldp @a[4],@a[5],[$b_ptr,#32] stp @mod[0],@mod[1],[sp] adds @mod[0],$a[0],@acc[0] // t1 = b->re + b->im ldp @acc[4],@acc[5],[$b_ptr,#80] adcs @mod[1],$a[1],@acc[1] stp @mod[2],@mod[3],[sp,#16] adcs @mod[2],$a[2],@acc[2] adcs @mod[3],$a[3],@acc[3] stp @mod[4],@mod[5],[sp,#32] adcs @mod[4],$a[4],@acc[4] stp @mod[0],@mod[1],[sp,#48] adc @mod[5],$a[5],@acc[5] stp @mod[2],@mod[3],[sp,#64] stp @mod[4],@mod[5],[sp,#80] bl __mul_384 // mul_384(ret->re, a->re, b->re) cadd $a_ptr,sp,#0 // mul_384(ret->im, t0, t1) cadd $b_ptr,sp,#48 cadd $r_ptr,@tmp[0],#96 bl __mul_384 cadd $a_ptr,@tmp[1],#48 // mul_384(tx, a->im, b->im) cadd $b_ptr,@tmp[2],#48 cadd $r_ptr,sp,#0 bl __mul_384 ldp @mod[0],@mod[1],[$n_ptr] ldp @mod[2],@mod[3],[$n_ptr,#16] ldp @mod[4],@mod[5],[$n_ptr,#32] cadd $a_ptr,@tmp[0],#96 // ret->im -= tx cadd $b_ptr,sp,#0 cadd $r_ptr,@tmp[0],#96 bl __sub_mod_384x384 cadd $b_ptr,@tmp[0],#0 // ret->im -= ret->re bl __sub_mod_384x384 cadd $a_ptr,@tmp[0],#0 // ret->re -= tx cadd $b_ptr,sp,#0 cadd $r_ptr,@tmp[0],#0 bl __sub_mod_384x384 ldr c30,[c29,#__SIZEOF_POINTER__] add csp,csp,#96 ldp c19,c20,[c29,#2*__SIZEOF_POINTER__] ldp c21,c22,[c29,#4*__SIZEOF_POINTER__] ldp c23,c24,[c29,#6*__SIZEOF_POINTER__] ldp c25,c26,[c29,#8*__SIZEOF_POINTER__] ldp c27,c28,[c29,#10*__SIZEOF_POINTER__] ldr c29,[csp],#16*__SIZEOF_POINTER__ autiasp ret .size mul_382x,.-mul_382x .globl sqr_382x .hidden sqr_382x .type sqr_382x,%function .align 5 sqr_382x: paciasp stp c29,c30,[csp,#-16*__SIZEOF_POINTER__]! 
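// sqr_382x: ret->re = (a->re+a->im)*(a->re-a->im) and ret->im = 2*a->re*a->im,
// both left as unreduced double-width products for the caller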
add c29,csp,#0 stp c19,c20,[csp,#2*__SIZEOF_POINTER__] stp c21,c22,[csp,#4*__SIZEOF_POINTER__] stp c23,c24,[csp,#6*__SIZEOF_POINTER__] stp c25,c26,[csp,#8*__SIZEOF_POINTER__] stp c27,c28,[csp,#10*__SIZEOF_POINTER__] ldp @a[0],@a[1],[$a_ptr] ldp @acc[0],@acc[1],[$a_ptr,#48] ldp @a[2],@a[3],[$a_ptr,#16] adds @mod[0],$a[0],@acc[0] // t0 = a->re + a->im ldp @acc[2],@acc[3],[$a_ptr,#64] adcs @mod[1],$a[1],@acc[1] ldp @a[4],@a[5],[$a_ptr,#32] adcs @mod[2],$a[2],@acc[2] ldp @acc[4],@acc[5],[$a_ptr,#80] adcs @mod[3],$a[3],@acc[3] stp @mod[0],@mod[1],[$r_ptr] adcs @mod[4],$a[4],@acc[4] ldp @mod[0],@mod[1],[$b_ptr] adc @mod[5],$a[5],@acc[5] stp @mod[2],@mod[3],[$r_ptr,#16] subs @a[0],$a[0],@acc[0] // t1 = a->re - a->im ldp @mod[2],@mod[3],[$b_ptr,#16] sbcs @a[1],$a[1],@acc[1] stp @mod[4],@mod[5],[$r_ptr,#32] sbcs @a[2],$a[2],@acc[2] ldp @mod[4],@mod[5],[$b_ptr,#32] sbcs @a[3],$a[3],@acc[3] sbcs @a[4],$a[4],@acc[4] sbcs @a[5],$a[5],@acc[5] sbc @acc[6],xzr,xzr and @acc[0],@mod[0],@acc[6] and @acc[1],@mod[1],@acc[6] adds @a[0],@a[0],@acc[0] and @acc[2],@mod[2],@acc[6] adcs @a[1],@a[1],@acc[1] and @acc[3],@mod[3],@acc[6] adcs @a[2],@a[2],@acc[2] and @acc[4],@mod[4],@acc[6] adcs @a[3],@a[3],@acc[3] and @acc[5],@mod[5],@acc[6] adcs @a[4],@a[4],@acc[4] stp @a[0],@a[1],[$r_ptr,#48] adc @a[5],@a[5],@acc[5] stp @a[2],@a[3],[$r_ptr,#64] stp @a[4],@a[5],[$r_ptr,#80] cmov $n0,$a_ptr // save a_ptr cadd $a_ptr,$r_ptr,#0 // mul_384(ret->re, t0, t1) cadd $b_ptr,$r_ptr,#48 bl __mul_384 cadd $a_ptr,$n0,#0 // mul_384(ret->im, a->re, a->im) cadd $b_ptr,$n0,#48 cadd $r_ptr,$r_ptr,#96 bl __mul_384 ldr c30,[c29,#__SIZEOF_POINTER__] ldp @a[0],@a[1],[$r_ptr] ldp @a[2],@a[3],[$r_ptr,#16] adds @a[0],@a[0],@a[0] // add with itself ldp @a[4],@a[5],[$r_ptr,#32] adcs @a[1],@a[1],@a[1] adcs @a[2],@a[2],@a[2] adcs @a[3],@a[3],@a[3] adcs @a[4],@a[4],@a[4] adcs @a[5],@a[5],@a[5] adcs @acc[0],@acc[0],@acc[0] adcs @acc[1],@acc[1],@acc[1] stp @a[0],@a[1],[$r_ptr] adcs @acc[2],@acc[2],@acc[2] stp @a[2],@a[3],[$r_ptr,#16] adcs @acc[3],@acc[3],@acc[3] stp @a[4],@a[5],[$r_ptr,#32] adcs @acc[4],@acc[4],@acc[4] stp @acc[0],@acc[1],[$r_ptr,#48] adc @acc[5],@acc[5],@acc[5] stp @acc[2],@acc[3],[$r_ptr,#64] stp @acc[4],@acc[5],[$r_ptr,#80] ldp c19,c20,[c29,#2*__SIZEOF_POINTER__] ldp c21,c22,[c29,#4*__SIZEOF_POINTER__] ldp c23,c24,[c29,#6*__SIZEOF_POINTER__] ldp c25,c26,[c29,#8*__SIZEOF_POINTER__] ldp c27,c28,[c29,#10*__SIZEOF_POINTER__] ldr c29,[csp],#16*__SIZEOF_POINTER__ autiasp ret .size sqr_382x,.-sqr_382x .globl sqr_mont_382x .hidden sqr_mont_382x .type sqr_mont_382x,%function .align 5 sqr_mont_382x: paciasp stp c29,c30,[csp,#-16*__SIZEOF_POINTER__]! 
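// sqr_mont_382x: t1 = a->re - a->im may underflow; the borrow mask is
// saved at [sp,#96] and used after the second multiply to correct the sign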
add c29,csp,#0 stp c19,c20,[csp,#2*__SIZEOF_POINTER__] stp c21,c22,[csp,#4*__SIZEOF_POINTER__] stp c23,c24,[csp,#6*__SIZEOF_POINTER__] stp c25,c26,[csp,#8*__SIZEOF_POINTER__] stp c27,c28,[csp,#10*__SIZEOF_POINTER__] stp c3,c0,[csp,#12*__SIZEOF_POINTER__] // __mul_mont_384 wants them there sub csp,csp,#112 // space for two 384-bit vectors + word mov $n0,$n_ptr // adjust for missing b_ptr ldp @a[0],@a[1],[$a_ptr] ldp @a[2],@a[3],[$a_ptr,#16] ldp @a[4],@a[5],[$a_ptr,#32] ldp $bi,@acc[1],[$a_ptr,#48] ldp @acc[2],@acc[3],[$a_ptr,#64] ldp @acc[4],@acc[5],[$a_ptr,#80] adds @mod[0],$a[0],$bi // t0 = a->re + a->im adcs @mod[1],$a[1],@acc[1] adcs @mod[2],$a[2],@acc[2] adcs @mod[3],$a[3],@acc[3] adcs @mod[4],$a[4],@acc[4] adc @mod[5],$a[5],@acc[5] subs @acc[0],$a[0],$bi // t1 = a->re - a->im sbcs @acc[1],$a[1],@acc[1] sbcs @acc[2],$a[2],@acc[2] sbcs @acc[3],$a[3],@acc[3] sbcs @acc[4],$a[4],@acc[4] sbcs @acc[5],$a[5],@acc[5] sbc @acc[6],xzr,xzr // borrow flag as mask stp @mod[0],@mod[1],[sp] stp @mod[2],@mod[3],[sp,#16] stp @mod[4],@mod[5],[sp,#32] stp @acc[0],@acc[1],[sp,#48] stp @acc[2],@acc[3],[sp,#64] stp @acc[4],@acc[5],[sp,#80] str @acc[6],[sp,#96] ldp @mod[0],@mod[1],[$b_ptr] ldp @mod[2],@mod[3],[$b_ptr,#16] ldp @mod[4],@mod[5],[$b_ptr,#32] cadd $b_ptr,$a_ptr,#48 bl __mul_mont_383_nonred // mul_mont_384(ret->im, a->re, a->im) adds @acc[0],@a[0],@a[0] // add with itself adcs @acc[1],@a[1],@a[1] adcs @acc[2],@a[2],@a[2] adcs @acc[3],@a[3],@a[3] adcs @acc[4],@a[4],@a[4] adc @acc[5],@a[5],@a[5] stp @acc[0],@acc[1],[$b_ptr,#48] stp @acc[2],@acc[3],[$b_ptr,#64] stp @acc[4],@acc[5],[$b_ptr,#80] ldp @a[0],@a[1],[sp] ldr $bi,[sp,#48] ldp @a[2],@a[3],[sp,#16] ldp @a[4],@a[5],[sp,#32] cadd $b_ptr,sp,#48 bl __mul_mont_383_nonred // mul_mont_384(ret->re, t0, t1) ldr c30,[c29,#__SIZEOF_POINTER__] ldr @acc[6],[sp,#96] // account for sign from a->re - a->im ldp @acc[0],@acc[1],[sp] ldp @acc[2],@acc[3],[sp,#16] ldp @acc[4],@acc[5],[sp,#32] and @acc[0],@acc[0],@acc[6] and @acc[1],@acc[1],@acc[6] and @acc[2],@acc[2],@acc[6] and @acc[3],@acc[3],@acc[6] and @acc[4],@acc[4],@acc[6] and @acc[5],@acc[5],@acc[6] subs @a[0],@a[0],@acc[0] sbcs @a[1],@a[1],@acc[1] sbcs @a[2],@a[2],@acc[2] sbcs @a[3],@a[3],@acc[3] sbcs @a[4],@a[4],@acc[4] sbcs @a[5],@a[5],@acc[5] sbc @acc[6],xzr,xzr and @acc[0],@mod[0],@acc[6] and @acc[1],@mod[1],@acc[6] and @acc[2],@mod[2],@acc[6] and @acc[3],@mod[3],@acc[6] and @acc[4],@mod[4],@acc[6] and @acc[5],@mod[5],@acc[6] adds @a[0],@a[0],@acc[0] adcs @a[1],@a[1],@acc[1] adcs @a[2],@a[2],@acc[2] adcs @a[3],@a[3],@acc[3] adcs @a[4],@a[4],@acc[4] adc @a[5],@a[5],@acc[5] stp @a[0],@a[1],[$b_ptr] stp @a[2],@a[3],[$b_ptr,#16] stp @a[4],@a[5],[$b_ptr,#32] add csp,csp,#112 ldp c19,c20,[c29,#2*__SIZEOF_POINTER__] ldp c21,c22,[c29,#4*__SIZEOF_POINTER__] ldp c23,c24,[c29,#6*__SIZEOF_POINTER__] ldp c25,c26,[c29,#8*__SIZEOF_POINTER__] ldp c27,c28,[c29,#10*__SIZEOF_POINTER__] ldr c29,[csp],#16*__SIZEOF_POINTER__ autiasp ret .size sqr_mont_382x,.-sqr_mont_382x .type __mul_mont_383_nonred,%function .align 5 __mul_mont_383_nonred: mul @acc[0],@a[0],$bi mul @acc[1],@a[1],$bi mul @acc[2],@a[2],$bi mul @acc[3],@a[3],$bi mul @acc[4],@a[4],$bi mul @acc[5],@a[5],$bi mul $n0,$n0,@acc[0] umulh @tmp[0],@a[0],$bi umulh @tmp[1],@a[1],$bi umulh @tmp[2],@a[2],$bi umulh @tmp[3],@a[3],$bi umulh @tmp[4],@a[4],$bi umulh @tmp[5],@a[5],$bi adds @acc[1],@acc[1],@tmp[0] mul @tmp[0],@mod[0],$n0 adcs @acc[2],@acc[2],@tmp[1] mul @tmp[1],@mod[1],$n0 adcs @acc[3],@acc[3],@tmp[2] mul @tmp[2],@mod[2],$n0 adcs @acc[4],@acc[4],@tmp[3] mul
@tmp[3],@mod[3],$n0 adcs @acc[5],@acc[5],@tmp[4] mul @tmp[4],@mod[4],$n0 adc @acc[6],xzr, @tmp[5] mul @tmp[5],@mod[5],$n0 ___ for ($i=1;$i<6;$i++) { $code.=<<___; ldr $bi,[$b_ptr,8*$i] adds @acc[0],@acc[0],@tmp[0] umulh @tmp[0],@mod[0],$n0 adcs @acc[1],@acc[1],@tmp[1] umulh @tmp[1],@mod[1],$n0 adcs @acc[2],@acc[2],@tmp[2] umulh @tmp[2],@mod[2],$n0 adcs @acc[3],@acc[3],@tmp[3] umulh @tmp[3],@mod[3],$n0 adcs @acc[4],@acc[4],@tmp[4] umulh @tmp[4],@mod[4],$n0 adcs @acc[5],@acc[5],@tmp[5] umulh @tmp[5],@mod[5],$n0 adc @acc[6],@acc[6],xzr ldr $n0,[x29,#12*__SIZEOF_POINTER__] adds @acc[0],@acc[1],@tmp[0] mul @tmp[0],@a[0],$bi adcs @acc[1],@acc[2],@tmp[1] mul @tmp[1],@a[1],$bi adcs @acc[2],@acc[3],@tmp[2] mul @tmp[2],@a[2],$bi adcs @acc[3],@acc[4],@tmp[3] mul @tmp[3],@a[3],$bi adcs @acc[4],@acc[5],@tmp[4] mul @tmp[4],@a[4],$bi adcs @acc[5],@acc[6],@tmp[5] mul @tmp[5],@a[5],$bi adc @acc[6],xzr,xzr adds @acc[0],@acc[0],@tmp[0] umulh @tmp[0],@a[0],$bi adcs @acc[1],@acc[1],@tmp[1] umulh @tmp[1],@a[1],$bi adcs @acc[2],@acc[2],@tmp[2] mul $n0,$n0,@acc[0] umulh @tmp[2],@a[2],$bi adcs @acc[3],@acc[3],@tmp[3] umulh @tmp[3],@a[3],$bi adcs @acc[4],@acc[4],@tmp[4] umulh @tmp[4],@a[4],$bi adcs @acc[5],@acc[5],@tmp[5] umulh @tmp[5],@a[5],$bi adc @acc[6],@acc[6],xzr adds @acc[1],@acc[1],@tmp[0] mul @tmp[0],@mod[0],$n0 adcs @acc[2],@acc[2],@tmp[1] mul @tmp[1],@mod[1],$n0 adcs @acc[3],@acc[3],@tmp[2] mul @tmp[2],@mod[2],$n0 adcs @acc[4],@acc[4],@tmp[3] mul @tmp[3],@mod[3],$n0 adcs @acc[5],@acc[5],@tmp[4] mul @tmp[4],@mod[4],$n0 adc @acc[6],@acc[6],@tmp[5] mul @tmp[5],@mod[5],$n0 ___ } $code.=<<___; adds @acc[0],@acc[0],@tmp[0] umulh @tmp[0],@mod[0],$n0 adcs @acc[1],@acc[1],@tmp[1] umulh @tmp[1],@mod[1],$n0 adcs @acc[2],@acc[2],@tmp[2] umulh @tmp[2],@mod[2],$n0 adcs @acc[3],@acc[3],@tmp[3] umulh @tmp[3],@mod[3],$n0 adcs @acc[4],@acc[4],@tmp[4] umulh @tmp[4],@mod[4],$n0 adcs @acc[5],@acc[5],@tmp[5] umulh @tmp[5],@mod[5],$n0 adc @acc[6],@acc[6],xzr ldp c4,c2,[c29,#12*__SIZEOF_POINTER__] // pull r_ptr adds @a[0],@acc[1],@tmp[0] adcs @a[1],@acc[2],@tmp[1] adcs @a[2],@acc[3],@tmp[2] adcs @a[3],@acc[4],@tmp[3] adcs @a[4],@acc[5],@tmp[4] adcs @a[5],@acc[6],@tmp[5] ret .size __mul_mont_383_nonred,.-__mul_mont_383_nonred .globl sgn0_pty_mont_384 .hidden sgn0_pty_mont_384 .type sgn0_pty_mont_384,%function .align 5 sgn0_pty_mont_384: paciasp stp c29,c30,[csp,#-16*__SIZEOF_POINTER__]! 
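// sgn0_pty_mont_384: convert out of the Montgomery domain, then report
// bit 0 = parity of the canonical value, bit 1 = sign, set when 2*x >= p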
add c29,csp,#0 stp c19,c20,[csp,#2*__SIZEOF_POINTER__] stp c21,c22,[csp,#4*__SIZEOF_POINTER__] stp c23,c24,[csp,#6*__SIZEOF_POINTER__] stp c25,c26,[csp,#8*__SIZEOF_POINTER__] stp c27,c28,[csp,#10*__SIZEOF_POINTER__] mov $n0,$b_ptr ldp @mod[0],@mod[1],[$a_ptr] ldp @mod[2],@mod[3],[$a_ptr,#16] ldp @mod[4],@mod[5],[$a_ptr,#32] cmov $a_ptr,$r_ptr bl __mul_by_1_mont_384 ldr c30,[c29,#__SIZEOF_POINTER__] and $r_ptr,@a[0],#1 adds @a[0],@a[0],@a[0] adcs @a[1],@a[1],@a[1] adcs @a[2],@a[2],@a[2] adcs @a[3],@a[3],@a[3] adcs @a[4],@a[4],@a[4] adcs @a[5],@a[5],@a[5] adc $bi,xzr,xzr subs @a[0],@a[0],@mod[0] sbcs @a[1],@a[1],@mod[1] sbcs @a[2],@a[2],@mod[2] sbcs @a[3],@a[3],@mod[3] sbcs @a[4],@a[4],@mod[4] sbcs @a[5],@a[5],@mod[5] sbc $bi,$bi,xzr mvn $bi,$bi and $bi,$bi,#2 orr $r_ptr,$r_ptr,$bi ldp c19,c20,[c29,#2*__SIZEOF_POINTER__] ldp c21,c22,[c29,#4*__SIZEOF_POINTER__] ldp c23,c24,[c29,#6*__SIZEOF_POINTER__] ldp c25,c26,[c29,#8*__SIZEOF_POINTER__] ldp c27,c28,[c29,#10*__SIZEOF_POINTER__] ldr c29,[csp],#16*__SIZEOF_POINTER__ autiasp ret .size sgn0_pty_mont_384,.-sgn0_pty_mont_384 .globl sgn0_pty_mont_384x .hidden sgn0_pty_mont_384x .type sgn0_pty_mont_384x,%function .align 5 sgn0_pty_mont_384x: paciasp stp c29,c30,[csp,#-16*__SIZEOF_POINTER__]! add c29,csp,#0 stp c19,c20,[csp,#2*__SIZEOF_POINTER__] stp c21,c22,[csp,#4*__SIZEOF_POINTER__] stp c23,c24,[csp,#6*__SIZEOF_POINTER__] stp c25,c26,[csp,#8*__SIZEOF_POINTER__] stp c27,c28,[csp,#10*__SIZEOF_POINTER__] mov $n0,$b_ptr ldp @mod[0],@mod[1],[$a_ptr] ldp @mod[2],@mod[3],[$a_ptr,#16] ldp @mod[4],@mod[5],[$a_ptr,#32] cmov $a_ptr,$r_ptr bl __mul_by_1_mont_384 cadd $a_ptr,$a_ptr,#48 and $b_ptr,@a[0],#1 orr $n_ptr,@a[0],@a[1] adds @a[0],@a[0],@a[0] orr $n_ptr,$n_ptr,@a[2] adcs @a[1],@a[1],@a[1] orr $n_ptr,$n_ptr,@a[3] adcs @a[2],@a[2],@a[2] orr $n_ptr,$n_ptr,@a[4] adcs @a[3],@a[3],@a[3] orr $n_ptr,$n_ptr,@a[5] adcs @a[4],@a[4],@a[4] adcs @a[5],@a[5],@a[5] adc $bi,xzr,xzr subs @a[0],@a[0],@mod[0] sbcs @a[1],@a[1],@mod[1] sbcs @a[2],@a[2],@mod[2] sbcs @a[3],@a[3],@mod[3] sbcs @a[4],@a[4],@mod[4] sbcs @a[5],@a[5],@mod[5] sbc $bi,$bi,xzr mvn $bi,$bi and $bi,$bi,#2 orr $b_ptr,$b_ptr,$bi bl __mul_by_1_mont_384 ldr c30,[c29,#__SIZEOF_POINTER__] and $r_ptr,@a[0],#1 orr $a_ptr,@a[0],@a[1] adds @a[0],@a[0],@a[0] orr $a_ptr,$a_ptr,@a[2] adcs @a[1],@a[1],@a[1] orr $a_ptr,$a_ptr,@a[3] adcs @a[2],@a[2],@a[2] orr $a_ptr,$a_ptr,@a[4] adcs @a[3],@a[3],@a[3] orr $a_ptr,$a_ptr,@a[5] adcs @a[4],@a[4],@a[4] adcs @a[5],@a[5],@a[5] adc $bi,xzr,xzr subs @a[0],@a[0],@mod[0] sbcs @a[1],@a[1],@mod[1] sbcs @a[2],@a[2],@mod[2] sbcs @a[3],@a[3],@mod[3] sbcs @a[4],@a[4],@mod[4] sbcs @a[5],@a[5],@mod[5] sbc $bi,$bi,xzr mvn $bi,$bi and $bi,$bi,#2 orr $r_ptr,$r_ptr,$bi cmp $n_ptr,#0 csel $n_ptr,$r_ptr,$b_ptr,eq // a->re==0? prty(a->im) : prty(a->re) cmp $a_ptr,#0 csel $a_ptr,$r_ptr,$b_ptr,ne // a->im!=0? 
sgn0(a->im) : sgn0(a->re) and $n_ptr,$n_ptr,#1 and $a_ptr,$a_ptr,#2 orr $r_ptr,$a_ptr,$n_ptr // pack sign and parity ldp c19,c20,[c29,#2*__SIZEOF_POINTER__] ldp c21,c22,[c29,#4*__SIZEOF_POINTER__] ldp c23,c24,[c29,#6*__SIZEOF_POINTER__] ldp c25,c26,[c29,#8*__SIZEOF_POINTER__] ldp c27,c28,[c29,#10*__SIZEOF_POINTER__] ldr c29,[csp],#16*__SIZEOF_POINTER__ autiasp ret .size sgn0_pty_mont_384x,.-sgn0_pty_mont_384x ___ if (0) { my @b = ($bi, @mod[0..4]); my @comba = @acc[4..6]; $code.=<<___; .type __mul_384_comba,%function .align 5 __mul_384_comba: ldp @a[0],@a[1],[$a_ptr] ldp @b[0],@b[1],[$b_ptr] ldp @a[2],@a[3],[$a_ptr,#16] ldp @a[4],@a[5],[$a_ptr,#32] ldp @b[2],@b[3],[$b_ptr,#16] ldp @b[4],@b[5],[$b_ptr,#32] mul @comba[0],@a[0],@b[0] umulh @comba[1],@a[0],@b[0] mul @acc[0],@a[1],@b[0] umulh @acc[1],@a[1],@b[0] str @comba[0],[$r_ptr] ___ push(@comba,shift(@comba)); $code.=<<___; mul @acc[2],@a[0],@b[1] umulh @acc[3],@a[0],@b[1] adds @comba[0],@comba[0],@acc[0] adcs @comba[1],xzr, @acc[1] adc @comba[2],xzr,xzr mul @acc[0],@a[2],@b[0] umulh @acc[1],@a[2],@b[0] adds @comba[0],@comba[0],@acc[2] adcs @comba[1],@comba[1],@acc[3] adc @comba[2],@comba[2],xzr str @comba[0],[$r_ptr,#8] ___ push(@comba,shift(@comba)); $code.=<<___; mul @acc[2],@a[1],@b[1] umulh @acc[3],@a[1],@b[1] adds @comba[0],@comba[0],@acc[0] adcs @comba[1],@comba[1],@acc[1] adc @comba[2],xzr,xzr mul @acc[0],@a[0],@b[2] umulh @acc[1],@a[0],@b[2] adds @comba[0],@comba[0],@acc[2] adcs @comba[1],@comba[1],@acc[3] adc @comba[2],@comba[2],xzr mul @acc[2],@a[3],@b[0] umulh @acc[3],@a[3],@b[0] adds @comba[0],@comba[0],@acc[0] adcs @comba[1],@comba[1],@acc[1] adc @comba[2],@comba[2],xzr str @comba[0],[$r_ptr,#16] ___ push(@comba,shift(@comba)); $code.=<<___; mul @acc[0],@a[2],@b[1] umulh @acc[1],@a[2],@b[1] adds @comba[0],@comba[0],@acc[2] adcs @comba[1],@comba[1],@acc[3] adc @comba[2],xzr,xzr mul @acc[2],@a[1],@b[2] umulh @acc[3],@a[1],@b[2] adds @comba[0],@comba[0],@acc[0] adcs @comba[1],@comba[1],@acc[1] adc @comba[2],@comba[2],xzr mul @acc[0],@a[0],@b[3] umulh @acc[1],@a[0],@b[3] adds @comba[0],@comba[0],@acc[2] adcs @comba[1],@comba[1],@acc[3] adc @comba[2],@comba[2],xzr mul @acc[2],@a[4],@b[0] umulh @acc[3],@a[4],@b[0] adds @comba[0],@comba[0],@acc[0] adcs @comba[1],@comba[1],@acc[1] adc @comba[2],@comba[2],xzr str @comba[0],[$r_ptr,#24] ___ push(@comba,shift(@comba)); $code.=<<___; mul @acc[0],@a[3],@b[1] umulh @acc[1],@a[3],@b[1] adds @comba[0],@comba[0],@acc[2] adcs @comba[1],@comba[1],@acc[3] adc @comba[2],xzr,xzr mul @acc[2],@a[2],@b[2] umulh @acc[3],@a[2],@b[2] adds @comba[0],@comba[0],@acc[0] adcs @comba[1],@comba[1],@acc[1] adc @comba[2],@comba[2],xzr mul @acc[0],@a[1],@b[3] umulh @acc[1],@a[1],@b[3] adds @comba[0],@comba[0],@acc[2] adcs @comba[1],@comba[1],@acc[3] adc @comba[2],@comba[2],xzr mul @acc[2],@a[0],@b[4] umulh @acc[3],@a[0],@b[4] adds @comba[0],@comba[0],@acc[0] adcs @comba[1],@comba[1],@acc[1] adc @comba[2],@comba[2],xzr mul @acc[0],@a[5],@b[0] umulh @acc[1],@a[5],@b[0] adds @comba[0],@comba[0],@acc[2] adcs @comba[1],@comba[1],@acc[3] adc @comba[2],@comba[2],xzr str @comba[0],[$r_ptr,#32] ___ push(@comba,shift(@comba)); $code.=<<___; mul @acc[2],@a[4],@b[1] umulh @acc[3],@a[4],@b[1] adds @comba[0],@comba[0],@acc[0] adcs @comba[1],@comba[1],@acc[1] adc @comba[2],xzr,xzr mul @acc[0],@a[3],@b[2] umulh @acc[1],@a[3],@b[2] adds @comba[0],@comba[0],@acc[2] adcs @comba[1],@comba[1],@acc[3] adc @comba[2],@comba[2],xzr mul @acc[2],@a[2],@b[3] umulh @acc[3],@a[2],@b[3] adds @comba[0],@comba[0],@acc[0] adcs 
@comba[1],@comba[1],@acc[1] adc @comba[2],@comba[2],xzr mul @acc[0],@a[1],@b[4] umulh @acc[1],@a[1],@b[4] adds @comba[0],@comba[0],@acc[2] adcs @comba[1],@comba[1],@acc[3] adc @comba[2],@comba[2],xzr mul @acc[2],@a[0],@b[5] umulh @acc[3],@a[0],@b[5] adds @comba[0],@comba[0],@acc[0] adcs @comba[1],@comba[1],@acc[1] adc @comba[2],@comba[2],xzr mul @acc[0],@a[5],@b[1] umulh @acc[1],@a[5],@b[1] adds @comba[0],@comba[0],@acc[2] adcs @comba[1],@comba[1],@acc[3] adc @comba[2],@comba[2],xzr str @comba[0],[$r_ptr,#40] ___ push(@comba,shift(@comba)); $code.=<<___; mul @acc[2],@a[4],@b[2] umulh @acc[3],@a[4],@b[2] adds @comba[0],@comba[0],@acc[0] adcs @comba[1],@comba[1],@acc[1] adc @comba[2],xzr,xzr mul @acc[0],@a[3],@b[3] umulh @acc[1],@a[3],@b[3] adds @comba[0],@comba[0],@acc[2] adcs @comba[1],@comba[1],@acc[3] adc @comba[2],@comba[2],xzr mul @acc[2],@a[2],@b[4] umulh @acc[3],@a[2],@b[4] adds @comba[0],@comba[0],@acc[0] adcs @comba[1],@comba[1],@acc[1] adc @comba[2],@comba[2],xzr mul @acc[0],@a[1],@b[5] umulh @acc[1],@a[1],@b[5] adds @comba[0],@comba[0],@acc[2] adcs @comba[1],@comba[1],@acc[3] adc @comba[2],@comba[2],xzr mul @acc[2],@a[5],@b[2] umulh @acc[3],@a[5],@b[2] adds @comba[0],@comba[0],@acc[0] adcs @comba[1],@comba[1],@acc[1] adc @comba[2],@comba[2],xzr str @comba[0],[$r_ptr,#48] ___ push(@comba,shift(@comba)); $code.=<<___; mul @acc[0],@a[4],@b[3] umulh @acc[1],@a[4],@b[3] adds @comba[0],@comba[0],@acc[2] adcs @comba[1],@comba[1],@acc[3] adc @comba[2],xzr,xzr mul @acc[2],@a[3],@b[4] umulh @acc[3],@a[3],@b[4] adds @comba[0],@comba[0],@acc[0] adcs @comba[1],@comba[1],@acc[1] adc @comba[2],@comba[2],xzr mul @acc[0],@a[2],@b[5] umulh @acc[1],@a[2],@b[5] adds @comba[0],@comba[0],@acc[2] adcs @comba[1],@comba[1],@acc[3] adc @comba[2],@comba[2],xzr mul @acc[2],@a[5],@b[3] umulh @acc[3],@a[5],@b[3] adds @comba[0],@comba[0],@acc[0] adcs @comba[1],@comba[1],@acc[1] adc @comba[2],@comba[2],xzr str @comba[0],[$r_ptr,#56] ___ push(@comba,shift(@comba)); $code.=<<___; mul @acc[0],@a[4],@b[4] umulh @acc[1],@a[4],@b[4] adds @comba[0],@comba[0],@acc[2] adcs @comba[1],@comba[1],@acc[3] adc @comba[2],xzr,xzr mul @acc[2],@a[3],@b[5] umulh @acc[3],@a[3],@b[5] adds @comba[0],@comba[0],@acc[0] adcs @comba[1],@comba[1],@acc[1] adc @comba[2],@comba[2],xzr mul @acc[0],@a[5],@b[4] umulh @acc[1],@a[5],@b[4] adds @comba[0],@comba[0],@acc[2] adcs @comba[1],@comba[1],@acc[3] adc @comba[2],@comba[2],xzr str @comba[0],[$r_ptr,#64] ___ push(@comba,shift(@comba)); $code.=<<___; mul @acc[2],@a[4],@b[5] umulh @acc[3],@a[4],@b[5] adds @comba[0],@comba[0],@acc[0] adcs @comba[1],@comba[1],@acc[1] adc @comba[2],xzr,xzr mul @acc[0],@a[5],@b[5] umulh @acc[1],@a[5],@b[5] adds @comba[0],@comba[0],@acc[2] adcs @comba[1],@comba[1],@acc[3] adc @comba[2],@comba[2],xzr str @comba[0],[$r_ptr,#72] ___ push(@comba,shift(@comba)); $code.=<<___; adds @comba[0],@comba[0],@acc[0] adc @comba[1],@comba[1],@acc[1] stp @comba[0],@comba[1],[$r_ptr,#80] ret .size __mul_384_comba,.-__mul_384_comba ___ } print $code; close STDOUT; ================================================ FILE: src/asm/mulq_mont_256-x86_64.pl ================================================ #!/usr/bin/env perl # # Copyright Supranational LLC # Licensed under the Apache License, Version 2.0, see LICENSE for details. # SPDX-License-Identifier: Apache-2.0 # # As for "sparse" in subroutine names, see commentary in the # asm/mulx_mont_256-x86_64.pl module. 
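# The entry points below implement word-by-word Montgomery arithmetic
# for a 256-bit modulus; on portable builds each one first tests
# __blst_platform_cap and branches to its $1-suffixed counterpart
# (presumably the mulx/ADX code path) when the CPU supports it.
#
# For reference, a minimal Math::BigInt sketch of the reduction these
# routines perform, guarded by if(0) like the other dead code in this
# tree. The modulus is the BLS12-381 group order, chosen purely as an
# example; none of this affects the generated assembly.
if (0) {
    require Math::BigInt;
    my $n  = Math::BigInt->from_hex("73eda753299d7d483339d80809a1d80553bda402fffe5bfeffffffff00000001");
    my $w  = Math::BigInt->new(2)->bpow(64);        # limb size
    my $n0 = $n->copy->bmodinv($w)->bneg->bmod($w); # -n^-1 mod 2^64, the "n0" argument
    my $t  = Math::BigInt->new(2)->bpow(300);       # arbitrary double-width input
    my $t0 = $t->copy;
    for (1..4) {                                    # one round per limb, R = 2^256
        my $m = ($t % $w * $n0) % $w;               # m = t[0]*n0 mod 2^64
        $t = ($t + $m * $n) / $w;                   # lowest limb cancels exactly
    }
    $t->bmod($n);          # stand-in for the final conditional subtraction
    # at this point $t == $t0 * 2^-256 mod $n
}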
$flavour = shift; $output = shift; if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or die "can't locate x86_64-xlate.pl"; open STDOUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"" or die "can't call $xlate: $!"; $code.=<<___ if ($flavour =~ /masm/); .extern mul_mont_sparse_256\$1 .extern sqr_mont_sparse_256\$1 .extern from_mont_256\$1 .extern redc_mont_256\$1 ___ # common argument layout ($r_ptr,$a_ptr,$b_org,$n_ptr,$n0) = ("%rdi","%rsi","%rdx","%rcx","%r8"); $b_ptr = "%rbx"; { ############################################################## 256 bits my @acc=map("%r$_",(9..15)); { ############################################################## mulq my ($hi, $a0) = ("%rbp", $r_ptr); $code.=<<___; .comm __blst_platform_cap,4 .text .globl mul_mont_sparse_256 .hidden mul_mont_sparse_256 .type mul_mont_sparse_256,\@function,5,"unwind" .align 32 mul_mont_sparse_256: .cfi_startproc #ifdef __BLST_PORTABLE__ testl \$1, __blst_platform_cap(%rip) jnz mul_mont_sparse_256\$1 #endif push %rbp .cfi_push %rbp push %rbx .cfi_push %rbx push %r12 .cfi_push %r12 push %r13 .cfi_push %r13 push %r14 .cfi_push %r14 push %r15 .cfi_push %r15 push $r_ptr .cfi_adjust_cfa_offset 8 .cfi_end_prologue mov 8*0($b_org), %rax mov 8*0($a_ptr), @acc[4] mov 8*1($a_ptr), @acc[5] mov 8*2($a_ptr), @acc[3] mov 8*3($a_ptr), $hi mov $b_org, $b_ptr # evacuate from %rdx mov %rax, @acc[6] mulq @acc[4] # a[0]*b[0] mov %rax, @acc[0] mov @acc[6], %rax mov %rdx, @acc[1] call __mulq_mont_sparse_256 mov 8(%rsp),%r15 .cfi_restore %r15 mov 16(%rsp),%r14 .cfi_restore %r14 mov 24(%rsp),%r13 .cfi_restore %r13 mov 32(%rsp),%r12 .cfi_restore %r12 mov 40(%rsp),%rbx .cfi_restore %rbx mov 48(%rsp),%rbp .cfi_restore %rbp lea 56(%rsp),%rsp .cfi_adjust_cfa_offset -56 .cfi_epilogue ret .cfi_endproc .size mul_mont_sparse_256,.-mul_mont_sparse_256 .globl sqr_mont_sparse_256 .hidden sqr_mont_sparse_256 .type sqr_mont_sparse_256,\@function,4,"unwind" .align 32 sqr_mont_sparse_256: .cfi_startproc #ifdef __BLST_PORTABLE__ testl \$1, __blst_platform_cap(%rip) jnz sqr_mont_sparse_256\$1 #endif push %rbp .cfi_push %rbp push %rbx .cfi_push %rbx push %r12 .cfi_push %r12 push %r13 .cfi_push %r13 push %r14 .cfi_push %r14 push %r15 .cfi_push %r15 push $r_ptr .cfi_adjust_cfa_offset 8 .cfi_end_prologue mov 8*0($a_ptr), %rax mov $n_ptr, $n0 mov 8*1($a_ptr), @acc[5] mov $b_org, $n_ptr mov 8*2($a_ptr), @acc[3] lea ($a_ptr), $b_ptr mov 8*3($a_ptr), $hi mov %rax, @acc[6] mulq %rax # a[0]*a[0] mov %rax, @acc[0] mov @acc[6], %rax mov %rdx, @acc[1] call __mulq_mont_sparse_256 mov 8(%rsp),%r15 .cfi_restore %r15 mov 16(%rsp),%r14 .cfi_restore %r14 mov 24(%rsp),%r13 .cfi_restore %r13 mov 32(%rsp),%r12 .cfi_restore %r12 mov 40(%rsp),%rbx .cfi_restore %rbx mov 48(%rsp),%rbp .cfi_restore %rbp lea 56(%rsp),%rsp .cfi_adjust_cfa_offset -56 .cfi_epilogue ret .cfi_endproc .size sqr_mont_sparse_256,.-sqr_mont_sparse_256 ___ { my @acc=@acc; $code.=<<___; .type __mulq_mont_sparse_256,\@abi-omnipotent .align 32 __mulq_mont_sparse_256: mulq @acc[5] # a[1]*b[0] add %rax, @acc[1] mov @acc[6], %rax adc \$0, %rdx mov %rdx, @acc[2] mulq @acc[3] # a[2]*b[0] add %rax, @acc[2] mov @acc[6], %rax adc \$0, %rdx mov %rdx, @acc[3] mulq $hi # a[3]*b[0] add %rax, @acc[3] mov 8($b_ptr), %rax adc \$0, %rdx xor @acc[5], @acc[5] mov %rdx, @acc[4] ___ for (my $i=1; $i<4; $i++) { 
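# Each unrolled iteration interleaves the multiplication by b[$i] with
# the Montgomery reduction of the previous partial product: m =
# acc[0]*n0 mod 2^64 is formed up front, a*b[$i] is accumulated, then
# m*n is added so that the lowest limb cancels and the window slides
# one limb to the right.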
my $b_next = $i<3 ? 8*($i+1)."($b_ptr)" : @acc[1]; $code.=<<___; mov @acc[0], $a0 imulq $n0, @acc[0] ################################# Multiply by b[$i] mov %rax, @acc[6] mulq 8*0($a_ptr) add %rax, @acc[1] mov @acc[6], %rax adc \$0, %rdx mov %rdx, $hi mulq 8*1($a_ptr) add %rax, @acc[2] mov @acc[6], %rax adc \$0, %rdx add $hi, @acc[2] adc \$0, %rdx mov %rdx, $hi mulq 8*2($a_ptr) add %rax, @acc[3] mov @acc[6], %rax adc \$0, %rdx add $hi, @acc[3] adc \$0, %rdx mov %rdx, $hi mulq 8*3($a_ptr) add %rax, @acc[4] mov @acc[0], %rax adc \$0, %rdx add $hi, @acc[4] adc %rdx, @acc[5] # can't overflow xor @acc[6], @acc[6] ################################# reduction mulq 8*0($n_ptr) add %rax, $a0 # guaranteed to be zero mov @acc[0], %rax adc %rdx, $a0 mulq 8*1($n_ptr) add %rax, @acc[1] mov @acc[0], %rax adc \$0, %rdx add $a0, @acc[1] adc \$0, %rdx mov %rdx, $hi mulq 8*2($n_ptr) add %rax, @acc[2] mov @acc[0], %rax adc \$0, %rdx add $hi, @acc[2] adc \$0, %rdx mov %rdx, $hi mulq 8*3($n_ptr) add %rax, @acc[3] mov $b_next, %rax adc \$0, %rdx add $hi, @acc[3] adc \$0, %rdx add %rdx, @acc[4] adc \$0, @acc[5] adc \$0, @acc[6] ___ push(@acc,shift(@acc)); } $code.=<<___; imulq $n0, %rax mov 8(%rsp), $a_ptr # restore $r_ptr ################################# last reduction mov %rax, @acc[6] mulq 8*0($n_ptr) add %rax, @acc[0] # guaranteed to be zero mov @acc[6], %rax adc %rdx, @acc[0] mulq 8*1($n_ptr) add %rax, @acc[1] mov @acc[6], %rax adc \$0, %rdx add @acc[0], @acc[1] adc \$0, %rdx mov %rdx, $hi mulq 8*2($n_ptr) add %rax, @acc[2] mov @acc[6], %rax adc \$0, %rdx add $hi, @acc[2] adc \$0, %rdx mov %rdx, $hi mulq 8*3($n_ptr) mov @acc[2], $b_ptr add $hi, @acc[3] adc \$0, %rdx add %rax, @acc[3] mov @acc[1], %rax adc \$0, %rdx add %rdx, @acc[4] adc \$0, @acc[5] ################################# # Branch-less conditional subtraction of modulus mov @acc[3], @acc[0] sub 8*0($n_ptr), @acc[1] sbb 8*1($n_ptr), @acc[2] sbb 8*2($n_ptr), @acc[3] mov @acc[4], $hi sbb 8*3($n_ptr), @acc[4] sbb \$0, @acc[5] cmovc %rax, @acc[1] cmovc $b_ptr, @acc[2] cmovc @acc[0], @acc[3] mov @acc[1], 8*0($a_ptr) cmovc $hi, @acc[4] mov @acc[2], 8*1($a_ptr) mov @acc[3], 8*2($a_ptr) mov @acc[4], 8*3($a_ptr) ret .cfi_endproc .size __mulq_mont_sparse_256,.-__mulq_mont_sparse_256 ___ } } { my ($n_ptr, $n0)=($b_ptr, $n_ptr); # arguments are "shifted" $code.=<<___; .globl from_mont_256 .hidden from_mont_256 .type from_mont_256,\@function,4,"unwind" .align 32 from_mont_256: .cfi_startproc #ifdef __BLST_PORTABLE__ testl \$1, __blst_platform_cap(%rip) jnz from_mont_256\$1 #endif push %rbp .cfi_push %rbp push %rbx .cfi_push %rbx push %r12 .cfi_push %r12 push %r13 .cfi_push %r13 push %r14 .cfi_push %r14 push %r15 .cfi_push %r15 sub \$8, %rsp .cfi_adjust_cfa_offset 8 .cfi_end_prologue mov $b_org, $n_ptr call __mulq_by_1_mont_256 ################################# # Branch-less conditional acc[0:3] - modulus #mov @acc[4], %rax # __mulq_by_1_mont_256 does it mov @acc[5], @acc[1] mov @acc[6], @acc[2] mov @acc[0], @acc[3] sub 8*0($n_ptr), @acc[4] sbb 8*1($n_ptr), @acc[5] sbb 8*2($n_ptr), @acc[6] sbb 8*3($n_ptr), @acc[0] cmovnc @acc[4], %rax cmovnc @acc[5], @acc[1] cmovnc @acc[6], @acc[2] mov %rax, 8*0($r_ptr) cmovnc @acc[0], @acc[3] mov @acc[1], 8*1($r_ptr) mov @acc[2], 8*2($r_ptr) mov @acc[3], 8*3($r_ptr) mov 8(%rsp),%r15 .cfi_restore %r15 mov 16(%rsp),%r14 .cfi_restore %r14 mov 24(%rsp),%r13 .cfi_restore %r13 mov 32(%rsp),%r12 .cfi_restore %r12 mov 40(%rsp),%rbx .cfi_restore %rbx mov 48(%rsp),%rbp .cfi_restore %rbp lea 56(%rsp),%rsp .cfi_adjust_cfa_offset -56 
.cfi_epilogue ret .cfi_endproc .size from_mont_256,.-from_mont_256 .globl redc_mont_256 .hidden redc_mont_256 .type redc_mont_256,\@function,4,"unwind" .align 32 redc_mont_256: .cfi_startproc #ifdef __BLST_PORTABLE__ testl \$1, __blst_platform_cap(%rip) jnz redc_mont_256\$1 #endif push %rbp .cfi_push %rbp push %rbx .cfi_push %rbx push %r12 .cfi_push %r12 push %r13 .cfi_push %r13 push %r14 .cfi_push %r14 push %r15 .cfi_push %r15 sub \$8, %rsp .cfi_adjust_cfa_offset 8 .cfi_end_prologue mov $b_org, $n_ptr call __mulq_by_1_mont_256 add 8*4($a_ptr), @acc[4] # accumulate upper half adc 8*5($a_ptr), @acc[5] mov @acc[4], %rax adc 8*6($a_ptr), @acc[6] mov @acc[5], @acc[1] adc 8*7($a_ptr), @acc[0] sbb $a_ptr, $a_ptr ################################# # Branch-less conditional acc[0:4] - modulus mov @acc[6], @acc[2] sub 8*0($n_ptr), @acc[4] sbb 8*1($n_ptr), @acc[5] sbb 8*2($n_ptr), @acc[6] mov @acc[0], @acc[3] sbb 8*3($n_ptr), @acc[0] sbb \$0, $a_ptr cmovnc @acc[4], %rax cmovnc @acc[5], @acc[1] cmovnc @acc[6], @acc[2] mov %rax, 8*0($r_ptr) cmovnc @acc[0], @acc[3] mov @acc[1], 8*1($r_ptr) mov @acc[2], 8*2($r_ptr) mov @acc[3], 8*3($r_ptr) mov 8(%rsp),%r15 .cfi_restore %r15 mov 16(%rsp),%r14 .cfi_restore %r14 mov 24(%rsp),%r13 .cfi_restore %r13 mov 32(%rsp),%r12 .cfi_restore %r12 mov 40(%rsp),%rbx .cfi_restore %rbx mov 48(%rsp),%rbp .cfi_restore %rbp lea 56(%rsp),%rsp .cfi_adjust_cfa_offset -56 .cfi_epilogue ret .cfi_endproc .size redc_mont_256,.-redc_mont_256 ___ { my @acc=@acc; $code.=<<___; .type __mulq_by_1_mont_256,\@abi-omnipotent .align 32 __mulq_by_1_mont_256: mov 8*0($a_ptr), %rax mov 8*1($a_ptr), @acc[1] mov 8*2($a_ptr), @acc[2] mov 8*3($a_ptr), @acc[3] mov %rax, @acc[4] imulq $n0, %rax mov %rax, @acc[0] ___ for (my $i=0; $i<4; $i++) { my $hi = @acc[4]; $code.=<<___; ################################# reduction $i mulq 8*0($n_ptr) add %rax, @acc[4] # guaranteed to be zero mov @acc[0], %rax adc %rdx, @acc[4] mulq 8*1($n_ptr) add %rax, @acc[1] mov @acc[0], %rax adc \$0, %rdx add @acc[4], @acc[1] adc \$0, %rdx mov %rdx, $hi mulq 8*2($n_ptr) ___ $code.=<<___ if ($i<3); mov @acc[1], @acc[5] imulq $n0, @acc[1] ___ $code.=<<___; add %rax, @acc[2] mov @acc[0], %rax adc \$0, %rdx add $hi, @acc[2] adc \$0, %rdx mov %rdx, $hi mulq 8*3($n_ptr) add %rax, @acc[3] mov @acc[1], %rax adc \$0, %rdx add $hi, @acc[3] adc \$0, %rdx mov %rdx, @acc[4] ___ push(@acc,shift(@acc)); } $code.=<<___; ret .size __mulq_by_1_mont_256,.-__mulq_by_1_mont_256 ___ } } } print $code; close STDOUT; ================================================ FILE: src/asm/mulq_mont_384-x86_64.pl ================================================ #!/usr/bin/env perl # # Copyright Supranational LLC # Licensed under the Apache License, Version 2.0, see LICENSE for details. 
# SPDX-License-Identifier: Apache-2.0 $flavour = shift; $output = shift; if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or die "can't locate x86_64-xlate.pl"; open STDOUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"" or die "can't call $xlate: $!"; $code.=<<___ if ($flavour =~ /masm/); .extern mul_mont_384x\$1 .extern sqr_mont_384x\$1 .extern mul_382x\$1 .extern sqr_382x\$1 .extern mul_384\$1 .extern sqr_384\$1 .extern redc_mont_384\$1 .extern from_mont_384\$1 .extern sgn0_pty_mont_384\$1 .extern sgn0_pty_mont_384x\$1 .extern mul_mont_384\$1 .extern sqr_mont_384\$1 .extern sqr_n_mul_mont_384\$1 .extern sqr_n_mul_mont_383\$1 .extern sqr_mont_382x\$1 ___ # common argument layout ($r_ptr,$a_ptr,$b_org,$n_ptr,$n0) = ("%rdi","%rsi","%rdx","%rcx","%r8"); $b_ptr = "%rbx"; # common accumulator layout @acc=map("%r$_",(8..15)); ######################################################################## { my @acc=(@acc,"%rax","%rbx","%rbp",$a_ptr); # all registers are affected # except for $n_ptr and $r_ptr $code.=<<___; .comm __blst_platform_cap,4 .text ######################################################################## # Double-width subtraction modulo n<<384, as opposite to naively # expected modulo n*n. It works because n<<384 is the actual # input boundary condition for Montgomery reduction, not n*n. # Just in case, this is duplicated, but only one module is # supposed to be linked... .type __subq_mod_384x384,\@abi-omnipotent .align 32 __subq_mod_384x384: mov 8*0($a_ptr), @acc[0] mov 8*1($a_ptr), @acc[1] mov 8*2($a_ptr), @acc[2] mov 8*3($a_ptr), @acc[3] mov 8*4($a_ptr), @acc[4] mov 8*5($a_ptr), @acc[5] mov 8*6($a_ptr), @acc[6] sub 8*0($b_org), @acc[0] mov 8*7($a_ptr), @acc[7] sbb 8*1($b_org), @acc[1] mov 8*8($a_ptr), @acc[8] sbb 8*2($b_org), @acc[2] mov 8*9($a_ptr), @acc[9] sbb 8*3($b_org), @acc[3] mov 8*10($a_ptr), @acc[10] sbb 8*4($b_org), @acc[4] mov 8*11($a_ptr), @acc[11] sbb 8*5($b_org), @acc[5] mov @acc[0], 8*0($r_ptr) sbb 8*6($b_org), @acc[6] mov 8*0($n_ptr), @acc[0] mov @acc[1], 8*1($r_ptr) sbb 8*7($b_org), @acc[7] mov 8*1($n_ptr), @acc[1] mov @acc[2], 8*2($r_ptr) sbb 8*8($b_org), @acc[8] mov 8*2($n_ptr), @acc[2] mov @acc[3], 8*3($r_ptr) sbb 8*9($b_org), @acc[9] mov 8*3($n_ptr), @acc[3] mov @acc[4], 8*4($r_ptr) sbb 8*10($b_org), @acc[10] mov 8*4($n_ptr), @acc[4] mov @acc[5], 8*5($r_ptr) sbb 8*11($b_org), @acc[11] mov 8*5($n_ptr), @acc[5] sbb $b_org, $b_org and $b_org, @acc[0] and $b_org, @acc[1] and $b_org, @acc[2] and $b_org, @acc[3] and $b_org, @acc[4] and $b_org, @acc[5] add @acc[0], @acc[6] adc @acc[1], @acc[7] mov @acc[6], 8*6($r_ptr) adc @acc[2], @acc[8] mov @acc[7], 8*7($r_ptr) adc @acc[3], @acc[9] mov @acc[8], 8*8($r_ptr) adc @acc[4], @acc[10] mov @acc[9], 8*9($r_ptr) adc @acc[5], @acc[11] mov @acc[10], 8*10($r_ptr) mov @acc[11], 8*11($r_ptr) ret .size __subq_mod_384x384,.-__subq_mod_384x384 .type __addq_mod_384,\@abi-omnipotent .align 32 __addq_mod_384: mov 8*0($a_ptr), @acc[0] mov 8*1($a_ptr), @acc[1] mov 8*2($a_ptr), @acc[2] mov 8*3($a_ptr), @acc[3] mov 8*4($a_ptr), @acc[4] mov 8*5($a_ptr), @acc[5] add 8*0($b_org), @acc[0] adc 8*1($b_org), @acc[1] adc 8*2($b_org), @acc[2] mov @acc[0], @acc[6] adc 8*3($b_org), @acc[3] mov @acc[1], @acc[7] adc 8*4($b_org), @acc[4] mov @acc[2], @acc[8] adc 8*5($b_org), @acc[5] mov @acc[3], @acc[9] sbb 
$b_org, $b_org sub 8*0($n_ptr), @acc[0] sbb 8*1($n_ptr), @acc[1] mov @acc[4], @acc[10] sbb 8*2($n_ptr), @acc[2] sbb 8*3($n_ptr), @acc[3] sbb 8*4($n_ptr), @acc[4] mov @acc[5], @acc[11] sbb 8*5($n_ptr), @acc[5] sbb \$0, $b_org cmovc @acc[6], @acc[0] cmovc @acc[7], @acc[1] cmovc @acc[8], @acc[2] mov @acc[0], 8*0($r_ptr) cmovc @acc[9], @acc[3] mov @acc[1], 8*1($r_ptr) cmovc @acc[10], @acc[4] mov @acc[2], 8*2($r_ptr) cmovc @acc[11], @acc[5] mov @acc[3], 8*3($r_ptr) mov @acc[4], 8*4($r_ptr) mov @acc[5], 8*5($r_ptr) ret .size __addq_mod_384,.-__addq_mod_384 .type __subq_mod_384,\@abi-omnipotent .align 32 __subq_mod_384: mov 8*0($a_ptr), @acc[0] mov 8*1($a_ptr), @acc[1] mov 8*2($a_ptr), @acc[2] mov 8*3($a_ptr), @acc[3] mov 8*4($a_ptr), @acc[4] mov 8*5($a_ptr), @acc[5] __subq_mod_384_a_is_loaded: sub 8*0($b_org), @acc[0] mov 8*0($n_ptr), @acc[6] sbb 8*1($b_org), @acc[1] mov 8*1($n_ptr), @acc[7] sbb 8*2($b_org), @acc[2] mov 8*2($n_ptr), @acc[8] sbb 8*3($b_org), @acc[3] mov 8*3($n_ptr), @acc[9] sbb 8*4($b_org), @acc[4] mov 8*4($n_ptr), @acc[10] sbb 8*5($b_org), @acc[5] mov 8*5($n_ptr), @acc[11] sbb $b_org, $b_org and $b_org, @acc[6] and $b_org, @acc[7] and $b_org, @acc[8] and $b_org, @acc[9] and $b_org, @acc[10] and $b_org, @acc[11] add @acc[6], @acc[0] adc @acc[7], @acc[1] mov @acc[0], 8*0($r_ptr) adc @acc[8], @acc[2] mov @acc[1], 8*1($r_ptr) adc @acc[9], @acc[3] mov @acc[2], 8*2($r_ptr) adc @acc[10], @acc[4] mov @acc[3], 8*3($r_ptr) adc @acc[11], @acc[5] mov @acc[4], 8*4($r_ptr) mov @acc[5], 8*5($r_ptr) ret .size __subq_mod_384,.-__subq_mod_384 ___ } ######################################################################## # "Complex" multiplication and squaring. Use vanilla multiplication when # possible to fold reductions. I.e. instead of mul_mont, mul_mont # followed by add/sub_mod, it calls mul, mul, double-width add/sub_mod # followed by *common* reduction... 
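#
# For reference, with i^2 == -1 the schedule implemented by
# mul_mont_384x below is:
#
#	t0 = a->re * b->re			# plain 768-bit products
#	t1 = a->im * b->im
#	t2 = (a->re + a->im) * (b->re + b->im)
#	ret->im = redc(t2 - t0 - t1)		# one reduction per half
#	ret->re = redc(t0 - t1)
#
# where the double-width subtractions only need to be correct modulo
# n<<384, which is exactly what __subq_mod_384x384 above provides.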
{ my $frame = 5*8 + # place for argument off-load + 3*768/8; # place for 3 768-bit temporary vectors $code.=<<___; .globl mul_mont_384x .hidden mul_mont_384x .type mul_mont_384x,\@function,5,"unwind" .align 32 mul_mont_384x: .cfi_startproc #ifdef __BLST_PORTABLE__ testl \$1, __blst_platform_cap(%rip) jnz mul_mont_384x\$1 #endif push %rbp .cfi_push %rbp push %rbx .cfi_push %rbx push %r12 .cfi_push %r12 push %r13 .cfi_push %r13 push %r14 .cfi_push %r14 push %r15 .cfi_push %r15 sub \$$frame, %rsp .cfi_adjust_cfa_offset $frame .cfi_end_prologue mov $b_org, $b_ptr mov $r_ptr, 8*4(%rsp) # offload arguments mov $a_ptr, 8*3(%rsp) mov $b_org, 8*2(%rsp) mov $n_ptr, 8*1(%rsp) mov $n0, 8*0(%rsp) ################################# mul_384(t0, a->re, b->re); #lea 0($b_btr), $b_ptr # b->re #lea 0($a_ptr), $a_ptr # a->re lea 40(%rsp), $r_ptr # t0 call __mulq_384 ################################# mul_384(t1, a->im, b->im); lea 48($b_ptr), $b_ptr # b->im lea 48($a_ptr), $a_ptr # a->im lea 40+96(%rsp), $r_ptr # t1 call __mulq_384 ################################# mul_384(t2, a->re+a->im, b->re+b->im); mov 8*1(%rsp), $n_ptr lea -48($a_ptr), $b_org lea 40+192+48(%rsp), $r_ptr call __addq_mod_384 mov 8*2(%rsp), $a_ptr lea 48($a_ptr), $b_org lea -48($r_ptr), $r_ptr call __addq_mod_384 lea ($r_ptr),$b_ptr lea 48($r_ptr),$a_ptr call __mulq_384 ################################# t2=t2-t0-t1 lea ($r_ptr), $a_ptr # t2 lea 40(%rsp), $b_org # t0 mov 8*1(%rsp), $n_ptr call __subq_mod_384x384 # t2=t2-t0 lea ($r_ptr), $a_ptr # t2 lea -96($r_ptr), $b_org # t1 call __subq_mod_384x384 # t2=t2-t1 ################################# t0=t0-t1 lea 40(%rsp), $a_ptr lea 40+96(%rsp), $b_org lea 40(%rsp), $r_ptr call __subq_mod_384x384 # t0-t1 mov $n_ptr, $b_ptr # n_ptr for redc_mont_384 ################################# redc_mont_384(ret->re, t0, mod, n0); lea 40(%rsp), $a_ptr # t0 mov 8*0(%rsp), %rcx # n0 for redc_mont_384 mov 8*4(%rsp), $r_ptr # ret->re call __mulq_by_1_mont_384 call __redq_tail_mont_384 ################################# redc_mont_384(ret->im, t2, mod, n0); lea 40+192(%rsp), $a_ptr # t2 mov 8*0(%rsp), %rcx # n0 for redc_mont_384 lea 48($r_ptr), $r_ptr # ret->im call __mulq_by_1_mont_384 call __redq_tail_mont_384 lea $frame(%rsp), %r8 # size optimization mov 8*0(%r8),%r15 .cfi_restore %r15 mov 8*1(%r8),%r14 .cfi_restore %r14 mov 8*2(%r8),%r13 .cfi_restore %r13 mov 8*3(%r8),%r12 .cfi_restore %r12 mov 8*4(%r8),%rbx .cfi_restore %rbx mov 8*5(%r8),%rbp .cfi_restore %rbp lea 8*6(%r8),%rsp .cfi_adjust_cfa_offset -$frame-8*6 .cfi_epilogue ret .cfi_endproc .size mul_mont_384x,.-mul_mont_384x ___ } { my $frame = 4*8 + # place for argument off-load + 2*384/8 + # place for 2 384-bit temporary vectors 8; # align $code.=<<___; .globl sqr_mont_384x .hidden sqr_mont_384x .type sqr_mont_384x,\@function,4,"unwind" .align 32 sqr_mont_384x: .cfi_startproc #ifdef __BLST_PORTABLE__ testl \$1, __blst_platform_cap(%rip) jnz sqr_mont_384x\$1 #endif push %rbp .cfi_push %rbp push %rbx .cfi_push %rbx push %r12 .cfi_push %r12 push %r13 .cfi_push %r13 push %r14 .cfi_push %r14 push %r15 .cfi_push %r15 sub \$$frame, %rsp .cfi_adjust_cfa_offset $frame .cfi_end_prologue mov $n_ptr, 8*0(%rsp) # n0 mov $b_org, $n_ptr # n_ptr mov $r_ptr, 8*1(%rsp) # to __mulq_mont_384 mov $a_ptr, 8*2(%rsp) ################################# add_mod_384(t0, a->re, a->im); lea 48($a_ptr), $b_org # a->im lea 32(%rsp), $r_ptr # t0 call __addq_mod_384 ################################# sub_mod_384(t1, a->re, a->im); mov 8*2(%rsp), $a_ptr # a->re lea 48($a_ptr), $b_org # a->im 
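# sqr_mont_384x specializes the same folding: ret->re = (a->re+a->im)*(a->re-a->im)
# and ret->im = 2*a->re*a->im, hence the t0/t1 sum and difference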
lea 32+48(%rsp), $r_ptr # t1 call __subq_mod_384 ################################# mul_mont_384(ret->im, a->re, a->im, mod, n0); mov 8*2(%rsp), $a_ptr # a->re lea 48($a_ptr), $b_ptr # a->im mov 48($a_ptr), %rax # a->im mov 8*0($a_ptr), @acc[6] # a->re mov 8*1($a_ptr), @acc[7] mov 8*2($a_ptr), @acc[4] mov 8*3($a_ptr), @acc[5] call __mulq_mont_384 ___ { my @acc = map("%r$_",14,15,8..11, # output from __mulq_mont_384 12,13,"ax","bx","bp","si"); $code.=<<___; add @acc[0], @acc[0] # add with itself adc @acc[1], @acc[1] adc @acc[2], @acc[2] mov @acc[0], @acc[6] adc @acc[3], @acc[3] mov @acc[1], @acc[7] adc @acc[4], @acc[4] mov @acc[2], @acc[8] adc @acc[5], @acc[5] mov @acc[3], @acc[9] sbb $b_org, $b_org sub 8*0($n_ptr), @acc[0] sbb 8*1($n_ptr), @acc[1] mov @acc[4], @acc[10] sbb 8*2($n_ptr), @acc[2] sbb 8*3($n_ptr), @acc[3] sbb 8*4($n_ptr), @acc[4] mov @acc[5], @acc[11] sbb 8*5($n_ptr), @acc[5] sbb \$0, $b_org cmovc @acc[6], @acc[0] cmovc @acc[7], @acc[1] cmovc @acc[8], @acc[2] mov @acc[0], 8*6($r_ptr) # ret->im cmovc @acc[9], @acc[3] mov @acc[1], 8*7($r_ptr) cmovc @acc[10], @acc[4] mov @acc[2], 8*8($r_ptr) cmovc @acc[11], @acc[5] mov @acc[3], 8*9($r_ptr) mov @acc[4], 8*10($r_ptr) mov @acc[5], 8*11($r_ptr) ___ } $code.=<<___; ################################# mul_mont_384(ret->re, t0, t1, mod, n0); lea 32(%rsp), $a_ptr # t0 lea 32+48(%rsp), $b_ptr # t1 mov 32+48(%rsp), %rax # t1[0] mov 32+8*0(%rsp), @acc[6] # t0[0..3] mov 32+8*1(%rsp), @acc[7] mov 32+8*2(%rsp), @acc[4] mov 32+8*3(%rsp), @acc[5] call __mulq_mont_384 lea $frame(%rsp), %r8 # size optimization mov 8*0(%r8),%r15 .cfi_restore %r15 mov 8*1(%r8),%r14 .cfi_restore %r14 mov 8*2(%r8),%r13 .cfi_restore %r13 mov 8*3(%r8),%r12 .cfi_restore %r12 mov 8*4(%r8),%rbx .cfi_restore %rbx mov 8*5(%r8),%rbp .cfi_restore %rbp lea 8*6(%r8),%rsp .cfi_adjust_cfa_offset -$frame-8*6 .cfi_epilogue ret .cfi_endproc .size sqr_mont_384x,.-sqr_mont_384x .globl mul_382x .hidden mul_382x .type mul_382x,\@function,4,"unwind" .align 32 mul_382x: .cfi_startproc #ifdef __BLST_PORTABLE__ testl \$1, __blst_platform_cap(%rip) jnz mul_382x\$1 #endif push %rbp .cfi_push %rbp push %rbx .cfi_push %rbx push %r12 .cfi_push %r12 push %r13 .cfi_push %r13 push %r14 .cfi_push %r14 push %r15 .cfi_push %r15 sub \$$frame, %rsp .cfi_adjust_cfa_offset $frame .cfi_end_prologue lea 96($r_ptr), $r_ptr # ret->im mov $a_ptr, 8*0(%rsp) mov $b_org, 8*1(%rsp) mov $r_ptr, 8*2(%rsp) # offload ret->im mov $n_ptr, 8*3(%rsp) ################################# t0 = a->re + a->im mov 8*0($a_ptr), @acc[0] mov 8*1($a_ptr), @acc[1] mov 8*2($a_ptr), @acc[2] mov 8*3($a_ptr), @acc[3] mov 8*4($a_ptr), @acc[4] mov 8*5($a_ptr), @acc[5] add 8*6($a_ptr), @acc[0] adc 8*7($a_ptr), @acc[1] adc 8*8($a_ptr), @acc[2] adc 8*9($a_ptr), @acc[3] adc 8*10($a_ptr), @acc[4] adc 8*11($a_ptr), @acc[5] mov @acc[0], 32+8*0(%rsp) mov @acc[1], 32+8*1(%rsp) mov @acc[2], 32+8*2(%rsp) mov @acc[3], 32+8*3(%rsp) mov @acc[4], 32+8*4(%rsp) mov @acc[5], 32+8*5(%rsp) ################################# t1 = b->re + b->im mov 8*0($b_org), @acc[0] mov 8*1($b_org), @acc[1] mov 8*2($b_org), @acc[2] mov 8*3($b_org), @acc[3] mov 8*4($b_org), @acc[4] mov 8*5($b_org), @acc[5] add 8*6($b_org), @acc[0] adc 8*7($b_org), @acc[1] adc 8*8($b_org), @acc[2] adc 8*9($b_org), @acc[3] adc 8*10($b_org), @acc[4] adc 8*11($b_org), @acc[5] mov @acc[0], 32+8*6(%rsp) mov @acc[1], 32+8*7(%rsp) mov @acc[2], 32+8*8(%rsp) mov @acc[3], 32+8*9(%rsp) mov @acc[4], 32+8*10(%rsp) mov @acc[5], 32+8*11(%rsp) ################################# mul_384(ret->im, t0, t1); lea 
32+8*0(%rsp), $a_ptr # t0 lea 32+8*6(%rsp), $b_ptr # t1 call __mulq_384 ################################# mul_384(ret->re, a->re, b->re); mov 8*0(%rsp), $a_ptr mov 8*1(%rsp), $b_ptr lea -96($r_ptr), $r_ptr # ret->re call __mulq_384 ################################# mul_384(tx, a->im, b->im); lea 48($a_ptr), $a_ptr lea 48($b_ptr), $b_ptr lea 32(%rsp), $r_ptr call __mulq_384 ################################# ret->im -= tx mov 8*2(%rsp), $a_ptr # restore ret->im lea 32(%rsp), $b_org mov 8*3(%rsp), $n_ptr mov $a_ptr, $r_ptr call __subq_mod_384x384 ################################# ret->im -= ret->re lea 0($r_ptr), $a_ptr lea -96($r_ptr), $b_org call __subq_mod_384x384 ################################# ret->re -= tx lea -96($r_ptr), $a_ptr lea 32(%rsp), $b_org lea -96($r_ptr), $r_ptr call __subq_mod_384x384 lea $frame(%rsp), %r8 # size optimization mov 8*0(%r8),%r15 .cfi_restore %r15 mov 8*1(%r8),%r14 .cfi_restore %r14 mov 8*2(%r8),%r13 .cfi_restore %r13 mov 8*3(%r8),%r12 .cfi_restore %r12 mov 8*4(%r8),%rbx .cfi_restore %rbx mov 8*5(%r8),%rbp .cfi_restore %rbp lea 8*6(%r8),%rsp .cfi_adjust_cfa_offset -$frame-8*6 .cfi_epilogue ret .cfi_endproc .size mul_382x,.-mul_382x ___ } { my @acc=(@acc,"%rax","%rbx","%rbp",$b_org); # all registers are affected # except for $n_ptr and $r_ptr $code.=<<___; .globl sqr_382x .hidden sqr_382x .type sqr_382x,\@function,3,"unwind" .align 32 sqr_382x: .cfi_startproc #ifdef __BLST_PORTABLE__ testl \$1, __blst_platform_cap(%rip) jnz sqr_382x\$1 #endif push %rbp .cfi_push %rbp push %rbx .cfi_push %rbx push %r12 .cfi_push %r12 push %r13 .cfi_push %r13 push %r14 .cfi_push %r14 push %r15 .cfi_push %r15 push $a_ptr .cfi_adjust_cfa_offset 8 .cfi_end_prologue mov $b_org, $n_ptr ################################# t0 = a->re + a->im mov 8*0($a_ptr), @acc[6] mov 8*1($a_ptr), @acc[7] mov 8*2($a_ptr), @acc[8] mov 8*3($a_ptr), @acc[9] mov 8*4($a_ptr), @acc[10] mov 8*5($a_ptr), @acc[11] mov @acc[6], @acc[0] add 8*6($a_ptr), @acc[6] mov @acc[7], @acc[1] adc 8*7($a_ptr), @acc[7] mov @acc[8], @acc[2] adc 8*8($a_ptr), @acc[8] mov @acc[9], @acc[3] adc 8*9($a_ptr), @acc[9] mov @acc[10], @acc[4] adc 8*10($a_ptr), @acc[10] mov @acc[11], @acc[5] adc 8*11($a_ptr), @acc[11] mov @acc[6], 8*0($r_ptr) mov @acc[7], 8*1($r_ptr) mov @acc[8], 8*2($r_ptr) mov @acc[9], 8*3($r_ptr) mov @acc[10], 8*4($r_ptr) mov @acc[11], 8*5($r_ptr) ################################# t1 = a->re - a->im lea 48($a_ptr), $b_org lea 48($r_ptr), $r_ptr call __subq_mod_384_a_is_loaded ################################# mul_384(ret->re, t0, t1); lea ($r_ptr), $a_ptr lea -48($r_ptr), $b_ptr lea -48($r_ptr), $r_ptr call __mulq_384 ################################# mul_384(ret->im, a->re, a->im); mov (%rsp), $a_ptr lea 48($a_ptr), $b_ptr lea 96($r_ptr), $r_ptr call __mulq_384 mov 8*0($r_ptr), @acc[0] # double ret->im mov 8*1($r_ptr), @acc[1] mov 8*2($r_ptr), @acc[2] mov 8*3($r_ptr), @acc[3] mov 8*4($r_ptr), @acc[4] mov 8*5($r_ptr), @acc[5] mov 8*6($r_ptr), @acc[6] mov 8*7($r_ptr), @acc[7] mov 8*8($r_ptr), @acc[8] mov 8*9($r_ptr), @acc[9] mov 8*10($r_ptr), @acc[10] add @acc[0], @acc[0] mov 8*11($r_ptr), @acc[11] adc @acc[1], @acc[1] mov @acc[0], 8*0($r_ptr) adc @acc[2], @acc[2] mov @acc[1], 8*1($r_ptr) adc @acc[3], @acc[3] mov @acc[2], 8*2($r_ptr) adc @acc[4], @acc[4] mov @acc[3], 8*3($r_ptr) adc @acc[5], @acc[5] mov @acc[4], 8*4($r_ptr) adc @acc[6], @acc[6] mov @acc[5], 8*5($r_ptr) adc @acc[7], @acc[7] mov @acc[6], 8*6($r_ptr) adc @acc[8], @acc[8] mov @acc[7], 8*7($r_ptr) adc @acc[9], @acc[9] mov @acc[8], 8*8($r_ptr) adc @acc[10], 
@acc[10] mov @acc[9], 8*9($r_ptr) adc @acc[11], @acc[11] mov @acc[10], 8*10($r_ptr) mov @acc[11], 8*11($r_ptr) mov 8*1(%rsp),%r15 .cfi_restore %r15 mov 8*2(%rsp),%r14 .cfi_restore %r14 mov 8*3(%rsp),%r13 .cfi_restore %r13 mov 8*4(%rsp),%r12 .cfi_restore %r12 mov 8*5(%rsp),%rbx .cfi_restore %rbx mov 8*6(%rsp),%rbp .cfi_restore %rbp lea 8*7(%rsp),%rsp .cfi_adjust_cfa_offset -8*7 .cfi_epilogue ret .cfi_endproc .size sqr_382x,.-sqr_382x ___ } { ########################################################## 384-bit mul my @acc=map("%r$_",("cx",8..12)); my $bi = "%rbp"; $code.=<<___; .globl mul_384 .hidden mul_384 .type mul_384,\@function,3,"unwind" .align 32 mul_384: .cfi_startproc #ifdef __BLST_PORTABLE__ testl \$1, __blst_platform_cap(%rip) jnz mul_384\$1 #endif push %rbp .cfi_push %rbp push %rbx .cfi_push %rbx push %r12 .cfi_push %r12 .cfi_end_prologue mov $b_org, $b_ptr call __mulq_384 mov 0(%rsp),%r12 .cfi_restore %r12 mov 8(%rsp),%rbx .cfi_restore %rbx mov 16(%rsp),%rbp .cfi_restore %rbp lea 24(%rsp),%rsp .cfi_adjust_cfa_offset -24 .cfi_epilogue ret .cfi_endproc .size mul_384,.-mul_384 .type __mulq_384,\@abi-omnipotent .align 32 __mulq_384: mov 8*0($b_ptr), %rax mov %rax, $bi mulq 8*0($a_ptr) mov %rax, 8*0($r_ptr) mov $bi, %rax mov %rdx, @acc[0] mulq 8*1($a_ptr) add %rax, @acc[0] mov $bi, %rax adc \$0, %rdx mov %rdx, @acc[1] mulq 8*2($a_ptr) add %rax, @acc[1] mov $bi, %rax adc \$0, %rdx mov %rdx, @acc[2] mulq 8*3($a_ptr) add %rax, @acc[2] mov $bi, %rax adc \$0, %rdx mov %rdx, @acc[3] mulq 8*4($a_ptr) add %rax, @acc[3] mov $bi, %rax adc \$0, %rdx mov %rdx, @acc[4] mulq 8*5($a_ptr) add %rax, @acc[4] mov 8*1($b_ptr), %rax adc \$0, %rdx mov %rdx, @acc[5] ___ for(my $i=1; $i<6; $i++) { my $b_next = $i<5 ? 8*($i+1)."($b_ptr)" : "%rax"; $code.=<<___; mov %rax, $bi mulq 8*0($a_ptr) add %rax, @acc[0] mov $bi, %rax adc \$0, %rdx mov @acc[0], 8*$i($r_ptr) mov %rdx, @acc[0] mulq 8*1($a_ptr) add %rax, @acc[1] mov $bi, %rax adc \$0, %rdx add @acc[1], @acc[0] adc \$0, %rdx mov %rdx, @acc[1] mulq 8*2($a_ptr) add %rax, @acc[2] mov $bi, %rax adc \$0, %rdx add @acc[2], @acc[1] adc \$0, %rdx mov %rdx, @acc[2] mulq 8*3($a_ptr) add %rax, @acc[3] mov $bi, %rax adc \$0, %rdx add @acc[3], @acc[2] adc \$0, %rdx mov %rdx, @acc[3] mulq 8*4($a_ptr) add %rax, @acc[4] mov $bi, %rax adc \$0, %rdx add @acc[4], @acc[3] adc \$0, %rdx mov %rdx, @acc[4] mulq 8*5($a_ptr) add %rax, @acc[5] mov $b_next, %rax adc \$0, %rdx add @acc[5], @acc[4] adc \$0, %rdx mov %rdx, @acc[5] ___ } $code.=<<___; mov @acc[0], 8*6($r_ptr) mov @acc[1], 8*7($r_ptr) mov @acc[2], 8*8($r_ptr) mov @acc[3], 8*9($r_ptr) mov @acc[4], 8*10($r_ptr) mov @acc[5], 8*11($r_ptr) ret .size __mulq_384,.-__mulq_384 ___ } if (0) { ############################################################## my @b=map("%r$_",(10..15)); my @a=reverse(@b); @b[5]=$b_ptr; my $bi = "%rbp"; my @comba=map("%r$_",("cx",8,9)); # a[0]*b[0] # a[1]*b[0] # a[0]*b[1] # a[2]*b[0] # a[1]*b[1] # a[0]*b[2] # a[3]*b[0] # a[2]*b[1] # a[1]*b[2] # a[0]*b[3] # a[4]*b[0] # a[3]*b[1] # a[2]*b[2] # a[1]*b[3] # a[0]*b[4] # a[5]*b[0] # a[4]*b[1] # a[3]*b[2] # a[2]*b[3] # a[1]*b[4] # a[0]*b[5] # a[5]*b[1] # a[4]*b[2] # a[3]*b[3] # a[2]*b[4] # a[1]*b[5] # a[5]*b[2] # a[4]*b[3] # a[3]*b[4] # a[2]*b[5] # a[5]*b[3] # a[4]*b[4] # a[3]*b[5] # a[5]*b[4] # a[4]*b[5] # a[5]*b[5] # # 13% less instructions give +15% on Core2, +10% on Goldmont, # -0% on Sandy Bridge, but -16% on Haswell:-( # [for reference +5% on Skylake, +11% on Ryzen] $code.=<<___; .type __mulq_comba_384,\@abi-omnipotent .align 32 __mulq_comba_384: mov 
8*0($b_ptr), %rax mov 8*0($a_ptr), @a[0] mov 8*1($a_ptr), @a[1] mov 8*1($b_ptr), @b[1] mov %rax, @b[0] mulq @a[0] # a[0]*b[0] mov %rax, 8*0($r_ptr) mov @b[0], %rax mov %rdx, @comba[0] ################################# mov 8*2($a_ptr), @a[2] xor @comba[2], @comba[2] mulq @a[1] # a[1]*b[0] add %rax, @comba[0] mov @b[1], %rax adc \$0, %rdx mov 8*2($b_ptr), @b[2] mov %rdx, @comba[1] mulq @a[0] # a[0]*b[1] add %rax, @comba[0] mov @b[0], %rax adc %rdx, @comba[1] adc \$0, @comba[2] mov @comba[0], 8*1($r_ptr) ___ push(@comba,shift(@comba)); $code.=<<___; xor @comba[2], @comba[2] mulq @a[2] # a[2]*b[0] add %rax, @comba[0] mov @b[1], %rax adc %rdx, @comba[1] adc \$0, @comba[2] mulq @a[1] # a[1]*b[1] add %rax, @comba[0] mov @b[2], %rax adc %rdx, @comba[1] adc \$0, @comba[2] mulq @a[0] # a[0]*b[2] add %rax, @comba[0] mov @b[0], %rax adc %rdx, @comba[1] adc \$0, @comba[2] mov @comba[0], 8*2($r_ptr) ___ push(@comba,shift(@comba)); $code.=<<___; xor @comba[2], @comba[2] mulq 8*3($a_ptr) # a[3]*b[0] add %rax, @comba[0] mov @b[1], %rax adc %rdx, @comba[1] adc \$0, @comba[2] mulq @a[2] # a[2]*b[1] add %rax, @comba[0] mov @b[2], %rax adc %rdx, @comba[1] adc \$0, @comba[2] mulq @a[1] # a[1]*b[2] add %rax, @comba[0] mov 8*3($b_ptr), %rax adc %rdx, @comba[1] adc \$0, @comba[2] mov %rax, @b[3] mulq @a[0] # a[0]*b[3] add %rax, @comba[0] mov @b[0], %rax adc %rdx, @comba[1] adc \$0, @comba[2] mov @comba[0], 8*3($r_ptr) ___ push(@comba,shift(@comba)); $code.=<<___; xor @comba[2], @comba[2] mulq 8*4($a_ptr) # a[4]*b[0] add %rax, @comba[0] mov @b[1], %rax adc %rdx, @comba[1] adc \$0, @comba[2] mulq 8*3($a_ptr) # a[3]*b[1] add %rax, @comba[0] mov @b[2], %rax adc %rdx, @comba[1] adc \$0, @comba[2] mulq 8*2($a_ptr) # a[2]*b[2] add %rax, @comba[0] mov @b[3], %rax adc %rdx, @comba[1] adc \$0, @comba[2] mulq @a[1] # a[1]*b[3] add %rax, @comba[0] mov 8*4($b_ptr), %rax adc %rdx, @comba[1] adc \$0, @comba[2] mov %rax, @b[4] mulq @a[0] # a[0]*b[4] add %rax, @comba[0] mov @b[0], %rax adc %rdx, @comba[1] mov 8*5($a_ptr), @a[5] adc \$0, @comba[2] mov @comba[0], 8*4($r_ptr) ___ push(@comba,shift(@comba)); $code.=<<___; xor @comba[2], @comba[2] mulq @a[5] # a[5]*b[0] add %rax, @comba[0] mov @b[1], %rax adc %rdx, @comba[1] adc \$0, @comba[2] mulq 8*4($a_ptr) # a[4]*b[1] add %rax, @comba[0] mov @b[2], %rax adc %rdx, @comba[1] adc \$0, @comba[2] mulq 8*3($a_ptr) # a[3]*b[2] add %rax, @comba[0] mov @b[3], %rax adc %rdx, @comba[1] adc \$0, @comba[2] mulq 8*2($a_ptr) # a[2]*b[3] add %rax, @comba[0] mov @b[4], %rax adc %rdx, @comba[1] adc \$0, @comba[2] mulq 8*1($a_ptr) # a[1]*b[4] add %rax, @comba[0] mov 8*5($b_ptr), %rax adc %rdx, @comba[1] adc \$0, @comba[2] mov %rax, @b[5] mulq @a[0] # a[0]*b[5] add %rax, @comba[0] mov @b[1], %rax adc %rdx, @comba[1] mov 8*4($a_ptr), @a[4] adc \$0, @comba[2] mov @comba[0], 8*5($r_ptr) ___ push(@comba,shift(@comba)); $code.=<<___; xor @comba[2], @comba[2] mulq @a[5] # a[5]*b[1] add %rax, @comba[0] mov @b[2], %rax adc %rdx, @comba[1] adc \$0, @comba[2] mulq @a[4] # a[4]*b[2] add %rax, @comba[0] mov @b[3], %rax adc %rdx, @comba[1] adc \$0, @comba[2] mulq 8*3($a_ptr) # a[3]*b[3] add %rax, @comba[0] mov @b[4], %rax adc %rdx, @comba[1] adc \$0, @comba[2] mulq 8*2($a_ptr) # a[2]*b[4] add %rax, @comba[0] mov @b[5], %rax adc %rdx, @comba[1] adc \$0, @comba[2] mulq 8*1($a_ptr) # a[1]*b[5] add %rax, @comba[0] mov $b[2], %rax adc %rdx, @comba[1] mov 8*3($a_ptr), @a[3] adc \$0, @comba[2] mov @comba[0], 8*6($r_ptr) ___ push(@comba,shift(@comba)); $code.=<<___; xor @comba[2], @comba[2] mulq @a[5] # a[5]*b[2] add 
%rax, @comba[0] mov @b[3], %rax adc %rdx, @comba[1] adc \$0, @comba[2] mulq @a[4] # a[4]*b[3] add %rax, @comba[0] mov @b[4], %rax adc %rdx, @comba[1] adc \$0, @comba[2] mulq @a[3] # a[3]*b[4] add %rax, @comba[0] mov @b[5], %rax adc %rdx, @comba[1] adc \$0, @comba[2] mulq 8*2($a_ptr) # a[2]*b[5] add %rax, @comba[0] mov @b[3], %rax adc %rdx, @comba[1] adc \$0, @comba[2] mov @comba[0], 8*7($r_ptr) ___ push(@comba,shift(@comba)); $code.=<<___; xor @comba[2], @comba[2] mulq @a[5] # a[5]*b[3] add %rax, @comba[0] mov @b[4], %rax adc %rdx, @comba[1] adc \$0, @comba[2] mulq @a[4] # a[4]*b[4] add %rax, @comba[0] mov @b[5], %rax adc %rdx, @comba[1] adc \$0, @comba[2] mulq @a[3] # a[3]*b[5] add %rax, @comba[0] mov @b[4], %rax adc %rdx, @comba[1] adc \$0, @comba[2] mov @comba[0], 8*8($r_ptr) ___ push(@comba,shift(@comba)); $code.=<<___; xor @comba[2], @comba[2] mulq @a[5] # a[5]*b[4] add %rax, @comba[0] mov @b[5], %rax adc %rdx, @comba[1] adc \$0, @comba[2] mulq @a[4] # a[4]*b[5] add %rax, @comba[0] mov @b[5], %rax adc %rdx, @comba[1] adc \$0, @comba[2] mov @comba[0], 8*9($r_ptr) ___ push(@comba,shift(@comba)); $code.=<<___; mulq @a[5] # a[5]*b[4] add %rax, @comba[0] adc %rdx, @comba[1] mov @comba[0], 8*10($r_ptr) mov @comba[1], 8*11($r_ptr) ret .size __mulq_comba_384,.-__mulq_comba_384 ___ } { ########################################################## 384-bit sqr my @acc=(@acc,"%rcx","%rbx","%rbp",$a_ptr); my $hi; $code.=<<___; .globl sqr_384 .hidden sqr_384 .type sqr_384,\@function,2,"unwind" .align 32 sqr_384: .cfi_startproc #ifdef __BLST_PORTABLE__ testl \$1, __blst_platform_cap(%rip) jnz sqr_384\$1 #endif push %rbp .cfi_push %rbp push %rbx .cfi_push %rbx push %r12 .cfi_push %r12 push %r13 .cfi_push %r13 push %r14 .cfi_push %r14 push %r15 .cfi_push %r15 sub \$8, %rsp .cfi_adjust_cfa_offset 8 .cfi_end_prologue call __sqrq_384 mov 8(%rsp),%r15 .cfi_restore %r15 mov 16(%rsp),%r14 .cfi_restore %r14 mov 24(%rsp),%r13 .cfi_restore %r13 mov 32(%rsp),%r12 .cfi_restore %r12 mov 40(%rsp),%rbx .cfi_restore %rbx mov 48(%rsp),%rbp .cfi_restore %rbp lea 56(%rsp),%rsp .cfi_adjust_cfa_offset -56 .cfi_epilogue ret .cfi_endproc .size sqr_384,.-sqr_384 .type __sqrq_384,\@abi-omnipotent .align 32 __sqrq_384: mov 8*0($a_ptr), %rax mov 8*1($a_ptr), @acc[7] mov 8*2($a_ptr), @acc[8] mov 8*3($a_ptr), @acc[9] ######################################### mov %rax, @acc[6] mulq @acc[7] # a[1]*a[0] mov %rax, @acc[1] mov @acc[6], %rax mov 8*4($a_ptr), @acc[10] mov %rdx, @acc[2] mulq @acc[8] # a[2]*a[0] add %rax, @acc[2] mov @acc[6], %rax adc \$0, %rdx mov 8*5($a_ptr), @acc[11] mov %rdx, @acc[3] mulq @acc[9] # a[3]*a[0] add %rax, @acc[3] mov @acc[6], %rax adc \$0, %rdx mov %rdx, @acc[4] mulq @acc[10] # a[4]*a[0] add %rax, @acc[4] mov @acc[6], %rax adc \$0, %rdx mov %rdx, @acc[5] mulq @acc[11] # a[5]*a[0] add %rax, @acc[5] mov @acc[6], %rax adc \$0, %rdx mov %rdx, @acc[6] mulq %rax # a[0]*a[0] xor @acc[0], @acc[0] mov %rax, 8*0($r_ptr) mov @acc[7], %rax add @acc[1], @acc[1] # double acc[1] adc \$0, @acc[0] add %rdx, @acc[1] # accumulate a[0]*a[0] adc \$0, @acc[0] # carries to a[1]*a[1] mov @acc[1], 8*1($r_ptr) ___ $hi=@acc[1]; $code.=<<___; ######################################### mulq @acc[8] # a[2]*a[1] add %rax, @acc[3] mov @acc[7], %rax adc \$0, %rdx mov %rdx, $hi mulq @acc[9] # a[3]*a[1] add %rax, @acc[4] mov @acc[7], %rax adc \$0, %rdx add $hi, @acc[4] adc \$0, %rdx mov %rdx, $hi mulq @acc[10] # a[4]*a[1] add %rax, @acc[5] mov @acc[7], %rax adc \$0, %rdx add $hi, @acc[5] adc \$0, %rdx mov %rdx, $hi mulq @acc[11] # a[5]*a[1] 
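___
# The rows of __sqrq_384 compute each off-diagonal product a[i]*a[j]
# (i<j) exactly once, double the running sum, and then fold in the
# diagonal a[i]*a[i] terms, which is why 21 mulq instructions suffice
# instead of the 36 a full schoolbook multiplication would need. Below
# is a reference model of that identity on small limbs of our choosing;
# like the other disabled variants in this file it is guarded with
# if (0) and never runs or emits code.
if (0) {
    use Math::BigInt;

    my @limb = map { Math::BigInt->new($_) } (3, 5, 7, 11, 13, 17);

    my $cross = Math::BigInt->bzero();  # sum of a[i]*a[j] for i<j
    my $diag  = Math::BigInt->bzero();  # sum of a[i]*a[i]
    my $value = Math::BigInt->bzero();  # the 384-bit input itself
    for (my $i = 0; $i < 6; $i++) {
        for (my $j = $i + 1; $j < 6; $j++) {
            $cross->badd($limb[$i]->copy()->bmul($limb[$j])
                                  ->blsft(64 * ($i + $j)));
        }
        $diag->badd($limb[$i]->copy()->bmul($limb[$i])->blsft(128 * $i));
        $value->badd($limb[$i]->copy()->blsft(64 * $i));
    }

    # 2*cross + diag == value^2 is the identity the doubling steps rely on
    die "squaring model mismatch"
        unless $cross->blsft(1)->badd($diag) == $value->copy()->bpow(2);
}
$code.=<<___;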
add %rax, @acc[6] mov @acc[7], %rax adc \$0, %rdx add $hi, @acc[6] adc \$0, %rdx mov %rdx, @acc[7] mulq %rax # a[1]*a[1] xor @acc[1], @acc[1] add %rax, @acc[0] # can't carry mov @acc[8], %rax add @acc[2], @acc[2] # double acc[2:3] adc @acc[3], @acc[3] adc \$0, @acc[1] add @acc[0], @acc[2] # accumulate a[1]*a[1] adc %rdx, @acc[3] adc \$0, @acc[1] # carries to a[2]*a[2] mov @acc[2], 8*2($r_ptr) ___ $hi=@acc[0]; $code.=<<___; ######################################### mulq @acc[9] # a[3]*a[2] add %rax, @acc[5] mov @acc[8], %rax adc \$0, %rdx mov @acc[3], 8*3($r_ptr) mov %rdx, $hi mulq @acc[10] # a[4]*a[2] add %rax, @acc[6] mov @acc[8], %rax adc \$0, %rdx add $hi, @acc[6] adc \$0, %rdx mov %rdx, $hi mulq @acc[11] # a[5]*a[2] add %rax, @acc[7] mov @acc[8], %rax adc \$0, %rdx add $hi, @acc[7] adc \$0, %rdx mov %rdx, @acc[8] mulq %rax # a[2]*a[2] xor @acc[3], @acc[3] add %rax, @acc[1] # can't carry mov @acc[9], %rax add @acc[4], @acc[4] # double acc[4:5] adc @acc[5], @acc[5] adc \$0, @acc[3] add @acc[1], @acc[4] # accumulate a[2]*a[2] adc %rdx, @acc[5] adc \$0, @acc[3] # carries to a[3]*a[3] mov @acc[4], 8*4($r_ptr) ######################################### mulq @acc[10] # a[4]*a[3] add %rax, @acc[7] mov @acc[9], %rax adc \$0, %rdx mov @acc[5], 8*5($r_ptr) mov %rdx, $hi mulq @acc[11] # a[5]*a[3] add %rax, @acc[8] mov @acc[9], %rax adc \$0, %rdx add $hi, @acc[8] adc \$0, %rdx mov %rdx, @acc[9] mulq %rax # a[3]*a[3] xor @acc[4], @acc[4] add %rax, @acc[3] # can't carry mov @acc[10], %rax add @acc[6], @acc[6] # double acc[6:7] adc @acc[7], @acc[7] adc \$0, @acc[4] add @acc[3], @acc[6] # accumulate a[3]*a[3] adc %rdx, @acc[7] mov @acc[6], 8*6($r_ptr) adc \$0, @acc[4] # carries to a[4]*a[4] mov @acc[7], 8*7($r_ptr) ######################################### mulq @acc[11] # a[5]*a[4] add %rax, @acc[9] mov @acc[10], %rax adc \$0, %rdx mov %rdx, @acc[10] mulq %rax # a[4]*a[4] xor @acc[5], @acc[5] add %rax, @acc[4] # can't carry mov @acc[11], %rax add @acc[8], @acc[8] # double acc[8:9] adc @acc[9], @acc[9] adc \$0, @acc[5] add @acc[4], @acc[8] # accumulate a[4]*a[4] adc %rdx, @acc[9] mov @acc[8], 8*8($r_ptr) adc \$0, @acc[5] # carries to a[5]*a[5] mov @acc[9], 8*9($r_ptr) ######################################### mulq %rax # a[5]*a[5] add @acc[5], %rax # can't carry add @acc[10], @acc[10] # double acc[10] adc \$0, %rdx add @acc[10], %rax # accumulate a[5]*a[5] adc \$0, %rdx mov %rax, 8*10($r_ptr) mov %rdx, 8*11($r_ptr) ret .size __sqrq_384,.-__sqrq_384 .globl sqr_mont_384 .hidden sqr_mont_384 .type sqr_mont_384,\@function,4,"unwind" .align 32 sqr_mont_384: .cfi_startproc #ifdef __BLST_PORTABLE__ testl \$1, __blst_platform_cap(%rip) jnz sqr_mont_384\$1 #endif push %rbp .cfi_push %rbp push %rbx .cfi_push %rbx push %r12 .cfi_push %r12 push %r13 .cfi_push %r13 push %r14 .cfi_push %r14 push %r15 .cfi_push %r15 sub \$8*15, %rsp .cfi_adjust_cfa_offset 8*15 .cfi_end_prologue mov $n_ptr, 8*12(%rsp) # n0 mov $b_org, 8*13(%rsp) # n_ptr mov $r_ptr, 8*14(%rsp) mov %rsp, $r_ptr call __sqrq_384 lea 0(%rsp), $a_ptr mov 8*12(%rsp), %rcx # n0 for mul_by_1 mov 8*13(%rsp), $b_ptr # n_ptr for mul_by_1 mov 8*14(%rsp), $r_ptr call __mulq_by_1_mont_384 call __redq_tail_mont_384 lea 8*15(%rsp), %r8 # size optimization mov 8*15(%rsp), %r15 .cfi_restore %r15 mov 8*1(%r8), %r14 .cfi_restore %r14 mov 8*2(%r8), %r13 .cfi_restore %r13 mov 8*3(%r8), %r12 .cfi_restore %r12 mov 8*4(%r8), %rbx .cfi_restore %rbx mov 8*5(%r8), %rbp .cfi_restore %rbp lea 8*6(%r8), %rsp .cfi_adjust_cfa_offset -8*21 .cfi_epilogue ret .cfi_endproc .size 
sqr_mont_384,.-sqr_mont_384 ___ } { ########################################################## 384-bit redc_mont my ($n_ptr, $n0)=($b_ptr, $n_ptr); # arguments are "shifted" $code.=<<___; ######################################################################## # void redc_mont_384(uint64_t ret[6], const uint64_t a[12], # uint64_t m[6], uint64_t n0); .globl redc_mont_384 .hidden redc_mont_384 .type redc_mont_384,\@function,4,"unwind" .align 32 redc_mont_384: .cfi_startproc #ifdef __BLST_PORTABLE__ testl \$1, __blst_platform_cap(%rip) jnz redc_mont_384\$1 #endif push %rbp .cfi_push %rbp push %rbx .cfi_push %rbx push %r12 .cfi_push %r12 push %r13 .cfi_push %r13 push %r14 .cfi_push %r14 push %r15 .cfi_push %r15 sub \$8, %rsp .cfi_adjust_cfa_offset 8 .cfi_end_prologue mov $b_org, $n_ptr call __mulq_by_1_mont_384 call __redq_tail_mont_384 mov 8(%rsp),%r15 .cfi_restore %r15 mov 16(%rsp),%r14 .cfi_restore %r14 mov 24(%rsp),%r13 .cfi_restore %r13 mov 32(%rsp),%r12 .cfi_restore %r12 mov 40(%rsp),%rbx .cfi_restore %rbx mov 48(%rsp),%rbp .cfi_restore %rbp lea 56(%rsp),%rsp .cfi_adjust_cfa_offset -56 .cfi_epilogue ret .cfi_endproc .size redc_mont_384,.-redc_mont_384 ######################################################################## # void from_mont_384(uint64_t ret[6], const uint64_t a[6], # uint64_t m[6], uint64_t n0); .globl from_mont_384 .hidden from_mont_384 .type from_mont_384,\@function,4,"unwind" .align 32 from_mont_384: .cfi_startproc #ifdef __BLST_PORTABLE__ testl \$1, __blst_platform_cap(%rip) jnz from_mont_384\$1 #endif push %rbp .cfi_push %rbp push %rbx .cfi_push %rbx push %r12 .cfi_push %r12 push %r13 .cfi_push %r13 push %r14 .cfi_push %r14 push %r15 .cfi_push %r15 sub \$8, %rsp .cfi_adjust_cfa_offset 8 .cfi_end_prologue mov $b_org, $n_ptr call __mulq_by_1_mont_384 ################################# # Branch-less conditional acc[0:6] - modulus #mov @acc[6], %rax # __mulq_by_1_mont_384 does it mov @acc[7], %rcx mov @acc[0], %rdx mov @acc[1], %rbp sub 8*0($n_ptr), @acc[6] sbb 8*1($n_ptr), @acc[7] mov @acc[2], @acc[5] sbb 8*2($n_ptr), @acc[0] sbb 8*3($n_ptr), @acc[1] sbb 8*4($n_ptr), @acc[2] mov @acc[3], $a_ptr sbb 8*5($n_ptr), @acc[3] cmovc %rax, @acc[6] cmovc %rcx, @acc[7] cmovc %rdx, @acc[0] mov @acc[6], 8*0($r_ptr) cmovc %rbp, @acc[1] mov @acc[7], 8*1($r_ptr) cmovc @acc[5], @acc[2] mov @acc[0], 8*2($r_ptr) cmovc $a_ptr, @acc[3] mov @acc[1], 8*3($r_ptr) mov @acc[2], 8*4($r_ptr) mov @acc[3], 8*5($r_ptr) mov 8(%rsp),%r15 .cfi_restore %r15 mov 16(%rsp),%r14 .cfi_restore %r14 mov 24(%rsp),%r13 .cfi_restore %r13 mov 32(%rsp),%r12 .cfi_restore %r12 mov 40(%rsp),%rbx .cfi_restore %rbx mov 48(%rsp),%rbp .cfi_restore %rbp lea 56(%rsp),%rsp .cfi_adjust_cfa_offset -56 .cfi_epilogue ret .cfi_endproc .size from_mont_384,.-from_mont_384 ___ { my @acc=@acc; # will be rotated locally $code.=<<___; .type __mulq_by_1_mont_384,\@abi-omnipotent .align 32 __mulq_by_1_mont_384: mov 8*0($a_ptr), %rax mov 8*1($a_ptr), @acc[1] mov 8*2($a_ptr), @acc[2] mov 8*3($a_ptr), @acc[3] mov 8*4($a_ptr), @acc[4] mov 8*5($a_ptr), @acc[5] mov %rax, @acc[6] imulq $n0, %rax mov %rax, @acc[0] ___ for (my $i=0; $i<6; $i++) { my $hi = @acc[6]; $code.=<<___; ################################# reduction $i mulq 8*0($n_ptr) add %rax, @acc[6] # guaranteed to be zero mov @acc[0], %rax adc %rdx, @acc[6] mulq 8*1($n_ptr) add %rax, @acc[1] mov @acc[0], %rax adc \$0, %rdx add @acc[6], @acc[1] adc \$0, %rdx mov %rdx, $hi mulq 8*2($n_ptr) add %rax, @acc[2] mov @acc[0], %rax adc \$0, %rdx add $hi, @acc[2] adc \$0, %rdx mov %rdx, $hi mulq 
8*3($n_ptr) add %rax, @acc[3] mov @acc[0], %rax adc \$0, %rdx ___ $code.=<<___ if ($i<5); mov @acc[1], @acc[7] imulq $n0, @acc[1] ___ $code.=<<___; add $hi, @acc[3] adc \$0, %rdx mov %rdx, $hi mulq 8*4($n_ptr) add %rax, @acc[4] mov @acc[0], %rax adc \$0, %rdx add $hi, @acc[4] adc \$0, %rdx mov %rdx, $hi mulq 8*5($n_ptr) add %rax, @acc[5] mov @acc[1], %rax adc \$0, %rdx add $hi, @acc[5] adc \$0, %rdx mov %rdx, @acc[6] ___ push(@acc,shift(@acc)); } $code.=<<___; ret .size __mulq_by_1_mont_384,.-__mulq_by_1_mont_384 .type __redq_tail_mont_384,\@abi-omnipotent .align 32 __redq_tail_mont_384: add 8*6($a_ptr), @acc[0] # accumulate upper half mov @acc[0], %rax adc 8*7($a_ptr), @acc[1] adc 8*8($a_ptr), @acc[2] adc 8*9($a_ptr), @acc[3] mov @acc[1], %rcx adc 8*10($a_ptr), @acc[4] adc 8*11($a_ptr), @acc[5] sbb @acc[6], @acc[6] ################################# # Branch-less conditional acc[0:6] - modulus mov @acc[2], %rdx mov @acc[3], %rbp sub 8*0($n_ptr), @acc[0] sbb 8*1($n_ptr), @acc[1] mov @acc[4], @acc[7] sbb 8*2($n_ptr), @acc[2] sbb 8*3($n_ptr), @acc[3] sbb 8*4($n_ptr), @acc[4] mov @acc[5], $a_ptr sbb 8*5($n_ptr), @acc[5] sbb \$0, @acc[6] cmovc %rax, @acc[0] cmovc %rcx, @acc[1] cmovc %rdx, @acc[2] mov @acc[0], 8*0($r_ptr) cmovc %rbp, @acc[3] mov @acc[1], 8*1($r_ptr) cmovc @acc[7], @acc[4] mov @acc[2], 8*2($r_ptr) cmovc $a_ptr, @acc[5] mov @acc[3], 8*3($r_ptr) mov @acc[4], 8*4($r_ptr) mov @acc[5], 8*5($r_ptr) ret .size __redq_tail_mont_384,.-__redq_tail_mont_384 .globl sgn0_pty_mont_384 .hidden sgn0_pty_mont_384 .type sgn0_pty_mont_384,\@function,3,"unwind" .align 32 sgn0_pty_mont_384: .cfi_startproc #ifdef __BLST_PORTABLE__ testl \$1, __blst_platform_cap(%rip) jnz sgn0_pty_mont_384\$1 #endif push %rbp .cfi_push %rbp push %rbx .cfi_push %rbx push %r12 .cfi_push %r12 push %r13 .cfi_push %r13 push %r14 .cfi_push %r14 push %r15 .cfi_push %r15 sub \$8, %rsp .cfi_adjust_cfa_offset 8 .cfi_end_prologue mov $a_ptr, $n_ptr lea 0($r_ptr), $a_ptr mov $b_org, $n0 call __mulq_by_1_mont_384 xor %rax, %rax mov @acc[0], @acc[7] add @acc[0], @acc[0] adc @acc[1], @acc[1] adc @acc[2], @acc[2] adc @acc[3], @acc[3] adc @acc[4], @acc[4] adc @acc[5], @acc[5] adc \$0, %rax sub 8*0($n_ptr), @acc[0] sbb 8*1($n_ptr), @acc[1] sbb 8*2($n_ptr), @acc[2] sbb 8*3($n_ptr), @acc[3] sbb 8*4($n_ptr), @acc[4] sbb 8*5($n_ptr), @acc[5] sbb \$0, %rax not %rax # 2*x > p, which means "negative" and \$1, @acc[7] and \$2, %rax or @acc[7], %rax # pack sign and parity mov 8(%rsp),%r15 .cfi_restore %r15 mov 16(%rsp),%r14 .cfi_restore %r14 mov 24(%rsp),%r13 .cfi_restore %r13 mov 32(%rsp),%r12 .cfi_restore %r12 mov 40(%rsp),%rbx .cfi_restore %rbx mov 48(%rsp),%rbp .cfi_restore %rbp lea 56(%rsp),%rsp .cfi_adjust_cfa_offset -56 .cfi_epilogue ret .cfi_endproc .size sgn0_pty_mont_384,.-sgn0_pty_mont_384 .globl sgn0_pty_mont_384x .hidden sgn0_pty_mont_384x .type sgn0_pty_mont_384x,\@function,3,"unwind" .align 32 sgn0_pty_mont_384x: .cfi_startproc #ifdef __BLST_PORTABLE__ testl \$1, __blst_platform_cap(%rip) jnz sgn0_pty_mont_384x\$1 #endif push %rbp .cfi_push %rbp push %rbx .cfi_push %rbx push %r12 .cfi_push %r12 push %r13 .cfi_push %r13 push %r14 .cfi_push %r14 push %r15 .cfi_push %r15 sub \$8, %rsp .cfi_adjust_cfa_offset 8 .cfi_end_prologue mov $a_ptr, $n_ptr lea 48($r_ptr), $a_ptr # sgn0(a->im) mov $b_org, $n0 call __mulq_by_1_mont_384 mov @acc[0], @acc[6] or @acc[1], @acc[0] or @acc[2], @acc[0] or @acc[3], @acc[0] or @acc[4], @acc[0] or @acc[5], @acc[0] lea 0($r_ptr), $a_ptr # sgn0(a->re) xor $r_ptr, $r_ptr mov @acc[6], @acc[7] add @acc[6], 
@acc[6] adc @acc[1], @acc[1] adc @acc[2], @acc[2] adc @acc[3], @acc[3] adc @acc[4], @acc[4] adc @acc[5], @acc[5] adc \$0, $r_ptr sub 8*0($n_ptr), @acc[6] sbb 8*1($n_ptr), @acc[1] sbb 8*2($n_ptr), @acc[2] sbb 8*3($n_ptr), @acc[3] sbb 8*4($n_ptr), @acc[4] sbb 8*5($n_ptr), @acc[5] sbb \$0, $r_ptr mov @acc[0], 0(%rsp) # a->im is zero or not not $r_ptr # 2*x > p, which means "negative" and \$1, @acc[7] and \$2, $r_ptr or @acc[7], $r_ptr # pack sign and parity call __mulq_by_1_mont_384 mov @acc[0], @acc[6] or @acc[1], @acc[0] or @acc[2], @acc[0] or @acc[3], @acc[0] or @acc[4], @acc[0] or @acc[5], @acc[0] xor %rax, %rax mov @acc[6], @acc[7] add @acc[6], @acc[6] adc @acc[1], @acc[1] adc @acc[2], @acc[2] adc @acc[3], @acc[3] adc @acc[4], @acc[4] adc @acc[5], @acc[5] adc \$0, %rax sub 8*0($n_ptr), @acc[6] sbb 8*1($n_ptr), @acc[1] sbb 8*2($n_ptr), @acc[2] sbb 8*3($n_ptr), @acc[3] sbb 8*4($n_ptr), @acc[4] sbb 8*5($n_ptr), @acc[5] sbb \$0, %rax mov 0(%rsp), @acc[6] not %rax # 2*x > p, which means "negative" test @acc[0], @acc[0] cmovz $r_ptr, @acc[7] # a->re==0? prty(a->im) : prty(a->re) test @acc[6], @acc[6] cmovnz $r_ptr, %rax # a->im!=0? sgn0(a->im) : sgn0(a->re) and \$1, @acc[7] and \$2, %rax or @acc[7], %rax # pack sign and parity mov 8(%rsp),%r15 .cfi_restore %r15 mov 16(%rsp),%r14 .cfi_restore %r14 mov 24(%rsp),%r13 .cfi_restore %r13 mov 32(%rsp),%r12 .cfi_restore %r12 mov 40(%rsp),%rbx .cfi_restore %rbx mov 48(%rsp),%rbp .cfi_restore %rbp lea 56(%rsp),%rsp .cfi_adjust_cfa_offset -56 .cfi_epilogue ret .cfi_endproc .size sgn0_pty_mont_384x,.-sgn0_pty_mont_384x ___ } } { ########################################################## mulq_mont my ($bi, $hi) = ("%rdi", "%rbp"); $code.=<<___; .globl mul_mont_384 .hidden mul_mont_384 .type mul_mont_384,\@function,5,"unwind" .align 32 mul_mont_384: .cfi_startproc #ifdef __BLST_PORTABLE__ testl \$1, __blst_platform_cap(%rip) jnz mul_mont_384\$1 #endif push %rbp .cfi_push %rbp push %rbx .cfi_push %rbx push %r12 .cfi_push %r12 push %r13 .cfi_push %r13 push %r14 .cfi_push %r14 push %r15 .cfi_push %r15 sub \$8*3, %rsp .cfi_adjust_cfa_offset 8*3 .cfi_end_prologue mov 8*0($b_org), %rax mov 8*0($a_ptr), @acc[6] mov 8*1($a_ptr), @acc[7] mov 8*2($a_ptr), @acc[4] mov 8*3($a_ptr), @acc[5] mov $b_org, $b_ptr # evacuate from %rdx mov $n0, 8*0(%rsp) mov $r_ptr, 8*1(%rsp) # to __mulq_mont_384 call __mulq_mont_384 mov 24(%rsp),%r15 .cfi_restore %r15 mov 32(%rsp),%r14 .cfi_restore %r14 mov 40(%rsp),%r13 .cfi_restore %r13 mov 48(%rsp),%r12 .cfi_restore %r12 mov 56(%rsp),%rbx .cfi_restore %rbx mov 64(%rsp),%rbp .cfi_restore %rbp lea 72(%rsp),%rsp .cfi_adjust_cfa_offset -72 .cfi_epilogue ret .cfi_endproc .size mul_mont_384,.-mul_mont_384 ___ { my @acc=@acc; # will be rotated locally $code.=<<___; .type __mulq_mont_384,\@abi-omnipotent .align 32 __mulq_mont_384: mov %rax, $bi mulq @acc[6] # a[0]*b[0] mov %rax, @acc[0] mov $bi, %rax mov %rdx, @acc[1] mulq @acc[7] # a[1]*b[0] add %rax, @acc[1] mov $bi, %rax adc \$0, %rdx mov %rdx, @acc[2] mulq @acc[4] # a[2]*b[0] add %rax, @acc[2] mov $bi, %rax adc \$0, %rdx mov %rdx, @acc[3] mov @acc[0], $hi imulq 8(%rsp), @acc[0] mulq @acc[5] # a[3]*b[0] add %rax, @acc[3] mov $bi, %rax adc \$0, %rdx mov %rdx, @acc[4] mulq 8*4($a_ptr) add %rax, @acc[4] mov $bi, %rax adc \$0, %rdx mov %rdx, @acc[5] mulq 8*5($a_ptr) add %rax, @acc[5] mov @acc[0], %rax adc \$0, %rdx xor @acc[7], @acc[7] mov %rdx, @acc[6] ___ for (my $i=0; $i<6;) { my $b_next = $i<5 ? 
8*($i+1)."($b_ptr)" : @acc[1]; $code.=<<___; ################################# reduction $i mulq 8*0($n_ptr) add %rax, $hi # guaranteed to be zero mov @acc[0], %rax adc %rdx, $hi mulq 8*1($n_ptr) add %rax, @acc[1] mov @acc[0], %rax adc \$0, %rdx add $hi, @acc[1] adc \$0, %rdx mov %rdx, $hi mulq 8*2($n_ptr) add %rax, @acc[2] mov @acc[0], %rax adc \$0, %rdx add $hi, @acc[2] adc \$0, %rdx mov %rdx, $hi mulq 8*3($n_ptr) add $hi, @acc[3] adc \$0, %rdx add %rax, @acc[3] mov @acc[0], %rax adc \$0, %rdx mov %rdx, $hi mulq 8*4($n_ptr) add %rax, @acc[4] mov @acc[0], %rax adc \$0, %rdx add $hi, @acc[4] adc \$0, %rdx mov %rdx, $hi mulq 8*5($n_ptr) add %rax, @acc[5] mov $b_next, %rax adc \$0, %rdx add $hi, @acc[5] adc %rdx, @acc[6] adc \$0, @acc[7] ___ push(@acc,shift(@acc)); $code.=<<___ if ($i++<5); ################################# Multiply by b[$i] mov %rax, $bi mulq 8*0($a_ptr) add %rax, @acc[0] mov $bi, %rax adc \$0, %rdx mov %rdx, @acc[7] mulq 8*1($a_ptr) add %rax, @acc[1] mov $bi, %rax adc \$0, %rdx add @acc[7], @acc[1] adc \$0, %rdx mov %rdx, @acc[7] mulq 8*2($a_ptr) add %rax, @acc[2] mov $bi, %rax adc \$0, %rdx add @acc[7], @acc[2] adc \$0, %rdx mov %rdx, @acc[7] mov @acc[0], $hi imulq 8(%rsp), @acc[0] mulq 8*3($a_ptr) add %rax, @acc[3] mov $bi, %rax adc \$0, %rdx add @acc[7], @acc[3] adc \$0, %rdx mov %rdx, @acc[7] mulq 8*4($a_ptr) add %rax, @acc[4] mov $bi, %rax adc \$0, %rdx add @acc[7], @acc[4] adc \$0, %rdx mov %rdx, @acc[7] mulq 8*5($a_ptr) add @acc[7], @acc[5] adc \$0, %rdx xor @acc[7], @acc[7] add %rax, @acc[5] mov @acc[0], %rax adc %rdx, @acc[6] adc \$0, @acc[7] ___ } $code.=<<___; ################################# # Branch-less conditional acc[0:6] - modulus #mov @acc[0], %rax mov 8*2(%rsp), $r_ptr # restore $r_ptr sub 8*0($n_ptr), @acc[0] mov @acc[1], %rdx sbb 8*1($n_ptr), @acc[1] mov @acc[2], $b_ptr sbb 8*2($n_ptr), @acc[2] mov @acc[3], $a_ptr sbb 8*3($n_ptr), @acc[3] mov @acc[4], $hi sbb 8*4($n_ptr), @acc[4] mov @acc[5], @acc[7] sbb 8*5($n_ptr), @acc[5] sbb \$0, @acc[6] cmovc %rax, @acc[0] cmovc %rdx, @acc[1] cmovc $b_ptr, @acc[2] mov @acc[0], 8*0($r_ptr) cmovc $a_ptr, @acc[3] mov @acc[1], 8*1($r_ptr) cmovc $hi, @acc[4] mov @acc[2], 8*2($r_ptr) cmovc @acc[7], @acc[5] mov @acc[3], 8*3($r_ptr) mov @acc[4], 8*4($r_ptr) mov @acc[5], 8*5($r_ptr) ret .size __mulq_mont_384,.-__mulq_mont_384 ___ } } $code.=<<___; .globl sqr_n_mul_mont_384 .hidden sqr_n_mul_mont_384 .type sqr_n_mul_mont_384,\@function,6,"unwind" .align 32 sqr_n_mul_mont_384: .cfi_startproc #ifdef __BLST_PORTABLE__ testl \$1, __blst_platform_cap(%rip) jnz sqr_n_mul_mont_384\$1 #endif push %rbp .cfi_push %rbp push %rbx .cfi_push %rbx push %r12 .cfi_push %r12 push %r13 .cfi_push %r13 push %r14 .cfi_push %r14 push %r15 .cfi_push %r15 sub \$8*17, %rsp .cfi_adjust_cfa_offset 8*17 .cfi_end_prologue mov $n0, 8*0(%rsp) mov $r_ptr, 8*1(%rsp) # to __mulq_mont_384 mov $n_ptr, 8*2(%rsp) lea 8*4(%rsp), $r_ptr mov %r9, 8*3(%rsp) # 6th, multiplicand argument movq (%r9), %xmm2 # prefetch b[0] .Loop_sqr_384: movd %edx, %xmm1 # loop counter call __sqrq_384 lea 0($r_ptr), $a_ptr mov 8*0(%rsp), %rcx # n0 for mul_by_1 mov 8*2(%rsp), $b_ptr # n_ptr for mul_by_1 call __mulq_by_1_mont_384 call __redq_tail_mont_384 movd %xmm1, %edx lea 0($r_ptr), $a_ptr dec %edx jnz .Loop_sqr_384 movq %xmm2, %rax # b[0] mov $b_ptr, $n_ptr mov 8*3(%rsp), $b_ptr # 6th, multiplicand argument #mov 8*0($b_ptr), %rax #mov 8*0($a_ptr), @acc[6] #mov 8*1($a_ptr), @acc[7] #mov 8*2($a_ptr), @acc[4] #mov 8*3($a_ptr), @acc[5] mov @acc[0], @acc[4] mov @acc[1], @acc[5] call 
__mulq_mont_384 lea 8*17(%rsp), %r8 # size optimization mov 8*17(%rsp), %r15 .cfi_restore %r15 mov 8*1(%r8), %r14 .cfi_restore %r14 mov 8*2(%r8), %r13 .cfi_restore %r13 mov 8*3(%r8), %r12 .cfi_restore %r12 mov 8*4(%r8), %rbx .cfi_restore %rbx mov 8*5(%r8), %rbp .cfi_restore %rbp lea 8*6(%r8), %rsp .cfi_adjust_cfa_offset -8*23 .cfi_epilogue ret .cfi_endproc .size sqr_n_mul_mont_384,.-sqr_n_mul_mont_384 .globl sqr_n_mul_mont_383 .hidden sqr_n_mul_mont_383 .type sqr_n_mul_mont_383,\@function,6,"unwind" .align 32 sqr_n_mul_mont_383: .cfi_startproc #ifdef __BLST_PORTABLE__ testl \$1, __blst_platform_cap(%rip) jnz sqr_n_mul_mont_383\$1 #endif push %rbp .cfi_push %rbp push %rbx .cfi_push %rbx push %r12 .cfi_push %r12 push %r13 .cfi_push %r13 push %r14 .cfi_push %r14 push %r15 .cfi_push %r15 sub \$8*17, %rsp .cfi_adjust_cfa_offset 8*17 .cfi_end_prologue mov $n0, 8*0(%rsp) mov $r_ptr, 8*1(%rsp) # to __mulq_mont_384 mov $n_ptr, 8*2(%rsp) lea 8*4(%rsp), $r_ptr mov %r9, 8*3(%rsp) # 6th, multiplicand argument movq (%r9), %xmm2 # prefetch b[0] .Loop_sqr_383: movd %edx, %xmm1 # loop counter call __sqrq_384 lea 0($r_ptr), $a_ptr mov 8*0(%rsp), %rcx # n0 for mul_by_1 mov 8*2(%rsp), $b_ptr # n_ptr for mul_by_1 call __mulq_by_1_mont_384 movd %xmm1, %edx # loop counter add 8*6($a_ptr), @acc[6] # just accumulate upper half adc 8*7($a_ptr), @acc[7] adc 8*8($a_ptr), @acc[0] adc 8*9($a_ptr), @acc[1] adc 8*10($a_ptr), @acc[2] adc 8*11($a_ptr), @acc[3] lea 0($r_ptr), $a_ptr mov @acc[6], 8*0($r_ptr) # omitting full reduction gives ~5% mov @acc[7], 8*1($r_ptr) # in addition-chains mov @acc[0], 8*2($r_ptr) mov @acc[1], 8*3($r_ptr) mov @acc[2], 8*4($r_ptr) mov @acc[3], 8*5($r_ptr) dec %edx jnz .Loop_sqr_383 movq %xmm2, %rax # b[0] mov $b_ptr, $n_ptr mov 8*3(%rsp), $b_ptr # 6th, multiplicand argument #movq 8*0($b_ptr), %rax #mov 8*0($a_ptr), @acc[6] #mov 8*1($a_ptr), @acc[7] #mov 8*2($a_ptr), @acc[4] #mov 8*3($a_ptr), @acc[5] mov @acc[0], @acc[4] mov @acc[1], @acc[5] call __mulq_mont_384 # formally one can omit full reduction # even after multiplication... lea 8*17(%rsp), %r8 # size optimization mov 8*17(%rsp), %r15 .cfi_restore %r15 mov 8*1(%r8), %r14 .cfi_restore %r14 mov 8*2(%r8), %r13 .cfi_restore %r13 mov 8*3(%r8), %r12 .cfi_restore %r12 mov 8*4(%r8), %rbx .cfi_restore %rbx mov 8*5(%r8), %rbp .cfi_restore %rbp lea 8*6(%r8), %rsp .cfi_adjust_cfa_offset -8*23 .cfi_epilogue ret .cfi_endproc .size sqr_n_mul_mont_383,.-sqr_n_mul_mont_383 ___ { my @acc=@acc; # will be rotated locally my $bi = "%rbp"; $code.=<<___; .type __mulq_mont_383_nonred,\@abi-omnipotent .align 32 __mulq_mont_383_nonred: mov %rax, $bi mulq @acc[6] # a[0]*b[0] mov %rax, @acc[0] mov $bi, %rax mov %rdx, @acc[1] mulq @acc[7] # a[1]*b[0] add %rax, @acc[1] mov $bi, %rax adc \$0, %rdx mov %rdx, @acc[2] mulq @acc[4] # a[2]*b[0] add %rax, @acc[2] mov $bi, %rax adc \$0, %rdx mov %rdx, @acc[3] mov @acc[0], @acc[7] imulq 8(%rsp), @acc[0] mulq @acc[5] # a[3]*b[0] add %rax, @acc[3] mov $bi, %rax adc \$0, %rdx mov %rdx, @acc[4] mulq 8*4($a_ptr) add %rax, @acc[4] mov $bi, %rax adc \$0, %rdx mov %rdx, @acc[5] mulq 8*5($a_ptr) add %rax, @acc[5] mov @acc[0], %rax adc \$0, %rdx mov %rdx, @acc[6] ___ for (my $i=0; $i<6;) { my $b_next = $i<5 ? 
8*($i+1)."($b_ptr)" : @acc[1]; $code.=<<___; ################################# reduction $i mulq 8*0($n_ptr) add %rax, @acc[7] # guaranteed to be zero mov @acc[0], %rax adc %rdx, @acc[7] mulq 8*1($n_ptr) add %rax, @acc[1] mov @acc[0], %rax adc \$0, %rdx add @acc[7], @acc[1] adc \$0, %rdx mov %rdx, @acc[7] mulq 8*2($n_ptr) add %rax, @acc[2] mov @acc[0], %rax adc \$0, %rdx add @acc[7], @acc[2] adc \$0, %rdx mov %rdx, @acc[7] mulq 8*3($n_ptr) add @acc[7], @acc[3] adc \$0, %rdx add %rax, @acc[3] mov @acc[0], %rax adc \$0, %rdx mov %rdx, @acc[7] mulq 8*4($n_ptr) add %rax, @acc[4] mov @acc[0], %rax adc \$0, %rdx add @acc[7], @acc[4] adc \$0, %rdx mov %rdx, @acc[7] mulq 8*5($n_ptr) add %rax, @acc[5] mov $b_next, %rax adc \$0, %rdx add @acc[7], @acc[5] adc %rdx, @acc[6] ___ push(@acc,shift(@acc)); $code.=<<___ if ($i++<5); ################################# Multiply by b[$i] mov %rax, $bi mulq 8*0($a_ptr) add %rax, @acc[0] mov $bi, %rax adc \$0, %rdx mov %rdx, @acc[6] mulq 8*1($a_ptr) add %rax, @acc[1] mov $bi, %rax adc \$0, %rdx add @acc[6], @acc[1] adc \$0, %rdx mov %rdx, @acc[6] mulq 8*2($a_ptr) add %rax, @acc[2] mov $bi, %rax adc \$0, %rdx add @acc[6], @acc[2] adc \$0, %rdx mov %rdx, @acc[6] mov @acc[0], @acc[7] imulq 8(%rsp), @acc[0] mulq 8*3($a_ptr) add %rax, @acc[3] mov $bi, %rax adc \$0, %rdx add @acc[6], @acc[3] adc \$0, %rdx mov %rdx, @acc[6] mulq 8*4($a_ptr) add %rax, @acc[4] mov $bi, %rax adc \$0, %rdx add @acc[6], @acc[4] adc \$0, %rdx mov %rdx, @acc[6] mulq 8*5($a_ptr) add @acc[6], @acc[5] adc \$0, %rdx add %rax, @acc[5] mov @acc[0], %rax adc \$0, %rdx mov %rdx, @acc[6] ___ } $code.=<<___; ret .size __mulq_mont_383_nonred,.-__mulq_mont_383_nonred ___ } { my $frame = 4*8 + # place for argument off-load + 2*384/8 + # place for 2 384-bit temporary vectors 8; # align my @acc = (@acc,"%rax","%rdx","%rbx","%rbp"); # omitting 3 reductions gives 8-11% better performance in add-chains $code.=<<___; .globl sqr_mont_382x .hidden sqr_mont_382x .type sqr_mont_382x,\@function,4,"unwind" .align 32 sqr_mont_382x: .cfi_startproc #ifdef __BLST_PORTABLE__ testl \$1, __blst_platform_cap(%rip) jnz sqr_mont_382x\$1 #endif push %rbp .cfi_push %rbp push %rbx .cfi_push %rbx push %r12 .cfi_push %r12 push %r13 .cfi_push %r13 push %r14 .cfi_push %r14 push %r15 .cfi_push %r15 sub \$$frame, %rsp .cfi_adjust_cfa_offset $frame .cfi_end_prologue mov $n_ptr, 8*0(%rsp) # n0 mov $b_org, $n_ptr # n_ptr mov $a_ptr, 8*2(%rsp) mov $r_ptr, 8*3(%rsp) ################################# mov 8*0($a_ptr), @acc[0] # a->re mov 8*1($a_ptr), @acc[1] mov 8*2($a_ptr), @acc[2] mov 8*3($a_ptr), @acc[3] mov 8*4($a_ptr), @acc[4] mov 8*5($a_ptr), @acc[5] mov @acc[0], @acc[6] add 8*6($a_ptr), @acc[0] # a->re + a->im mov @acc[1], @acc[7] adc 8*7($a_ptr), @acc[1] mov @acc[2], @acc[8] adc 8*8($a_ptr), @acc[2] mov @acc[3], @acc[9] adc 8*9($a_ptr), @acc[3] mov @acc[4], @acc[10] adc 8*10($a_ptr), @acc[4] mov @acc[5], @acc[11] adc 8*11($a_ptr), @acc[5] sub 8*6($a_ptr), @acc[6] # a->re - a->im sbb 8*7($a_ptr), @acc[7] sbb 8*8($a_ptr), @acc[8] sbb 8*9($a_ptr), @acc[9] sbb 8*10($a_ptr), @acc[10] sbb 8*11($a_ptr), @acc[11] sbb $r_ptr, $r_ptr # borrow flag as mask mov @acc[0], 32+8*0(%rsp) # t0 mov @acc[1], 32+8*1(%rsp) mov @acc[2], 32+8*2(%rsp) mov @acc[3], 32+8*3(%rsp) mov @acc[4], 32+8*4(%rsp) mov @acc[5], 32+8*5(%rsp) mov @acc[6], 32+8*6(%rsp) # t1 mov @acc[7], 32+8*7(%rsp) mov @acc[8], 32+8*8(%rsp) mov @acc[9], 32+8*9(%rsp) mov @acc[10], 32+8*10(%rsp) mov @acc[11], 32+8*11(%rsp) mov $r_ptr, 32+8*12(%rsp) ################################# 
mul_mont_384(ret->im, a->re, a->im, mod, n0); #mov 8*2(%rsp), $a_ptr # a->re lea 48($a_ptr), $b_ptr # a->im mov 48($a_ptr), %rax # a->im mov 8*0($a_ptr), @acc[6] # a->re mov 8*1($a_ptr), @acc[7] mov 8*2($a_ptr), @acc[4] mov 8*3($a_ptr), @acc[5] mov 8*3(%rsp), $r_ptr call __mulq_mont_383_nonred ___ { my @acc = map("%r$_",14,15,8..11, # output from __mulq_mont_384 12,13,"ax","bx","bp","si"); $code.=<<___; add @acc[0], @acc[0] # add with itself adc @acc[1], @acc[1] adc @acc[2], @acc[2] adc @acc[3], @acc[3] adc @acc[4], @acc[4] adc @acc[5], @acc[5] mov @acc[0], 8*6($r_ptr) # ret->im mov @acc[1], 8*7($r_ptr) mov @acc[2], 8*8($r_ptr) mov @acc[3], 8*9($r_ptr) mov @acc[4], 8*10($r_ptr) mov @acc[5], 8*11($r_ptr) ___ } $code.=<<___; ################################# mul_mont_384(ret->re, t0, t1, mod, n0); lea 32(%rsp), $a_ptr # t0 lea 32+8*6(%rsp), $b_ptr # t1 mov 32+8*6(%rsp), %rax # t1[0] mov 32+8*0(%rsp), @acc[6] # t0[0..3] mov 32+8*1(%rsp), @acc[7] mov 32+8*2(%rsp), @acc[4] mov 32+8*3(%rsp), @acc[5] call __mulq_mont_383_nonred ___ { my @acc = map("%r$_",14,15,8..11, # output from __mulq_mont_384 12,13,"ax","bx","bp","si"); $code.=<<___; mov 32+8*12(%rsp), @acc[11] # account for sign from a->re - a->im mov 32+8*0(%rsp), @acc[6] mov 32+8*1(%rsp), @acc[7] and @acc[11], @acc[6] mov 32+8*2(%rsp), @acc[8] and @acc[11], @acc[7] mov 32+8*3(%rsp), @acc[9] and @acc[11], @acc[8] mov 32+8*4(%rsp), @acc[10] and @acc[11], @acc[9] and @acc[11], @acc[10] and 32+8*5(%rsp), @acc[11] sub @acc[6], @acc[0] mov 8*0($n_ptr), @acc[6] sbb @acc[7], @acc[1] mov 8*1($n_ptr), @acc[7] sbb @acc[8], @acc[2] mov 8*2($n_ptr), @acc[8] sbb @acc[9], @acc[3] mov 8*3($n_ptr), @acc[9] sbb @acc[10], @acc[4] mov 8*4($n_ptr), @acc[10] sbb @acc[11], @acc[5] sbb @acc[11], @acc[11] and @acc[11], @acc[6] and @acc[11], @acc[7] and @acc[11], @acc[8] and @acc[11], @acc[9] and @acc[11], @acc[10] and 8*5($n_ptr), @acc[11] add @acc[6], @acc[0] adc @acc[7], @acc[1] adc @acc[8], @acc[2] adc @acc[9], @acc[3] adc @acc[10], @acc[4] adc @acc[11], @acc[5] mov @acc[0], 8*0($r_ptr) # ret->re mov @acc[1], 8*1($r_ptr) mov @acc[2], 8*2($r_ptr) mov @acc[3], 8*3($r_ptr) mov @acc[4], 8*4($r_ptr) mov @acc[5], 8*5($r_ptr) ___ } $code.=<<___; lea $frame(%rsp), %r8 # size optimization mov 8*0(%r8),%r15 .cfi_restore %r15 mov 8*1(%r8),%r14 .cfi_restore %r14 mov 8*2(%r8),%r13 .cfi_restore %r13 mov 8*3(%r8),%r12 .cfi_restore %r12 mov 8*4(%r8),%rbx .cfi_restore %rbx mov 8*5(%r8),%rbp .cfi_restore %rbp lea 8*6(%r8),%rsp .cfi_adjust_cfa_offset -$frame-8*6 .cfi_epilogue ret .cfi_endproc .size sqr_mont_382x,.-sqr_mont_382x ___ } print $code; close STDOUT; ================================================ FILE: src/asm/mulx_mont_256-x86_64.pl ================================================ #!/usr/bin/env perl # # Copyright Supranational LLC # Licensed under the Apache License, Version 2.0, see LICENSE for details. # SPDX-License-Identifier: Apache-2.0 # # "Sparse" in subroutine names refers to the most significant limb of the # modulus. Though "sparse" is a bit of a misnomer, because the limitation is # just not-all-ones. Or in other words, not larger than 2^256-2^192-1.
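# (As a concrete illustration of the bound: the BLS12-381 group order,
# 0x73eda753299d7d483339d80809a1d80553bde402fffe5bfeffffffff00000001,
# has 0x73eda753299d7d48 as its most significant limb, comfortably below
# the not-all-ones threshold. A minimal Math::BigInt check of that claim
# is sketched below; the if (0) guard keeps it out of the build, in the
# same spirit as the disabled reference variants elsewhere in this code.)
if (0) {
    use Math::BigInt;

    my $r = Math::BigInt->new("0x73eda753299d7d483339d80809a1d805".
                              "53bde402fffe5bfeffffffff00000001");
    my $bound = Math::BigInt->new(2)->bpow(256)
                            ->bsub(Math::BigInt->new(2)->bpow(192))
                            ->bsub(1);
    die "modulus is not \"sparse\" enough" if $r->bcmp($bound) > 0;
}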
# In general Montgomery multiplication algorithm can handle one of the # inputs being non-reduced and capped by 1<re, b->re); #lea 0($b_btr), $b_ptr # b->re #lea 0($a_ptr), $a_ptr # a->re lea 40(%rsp), $r_ptr # t0 #ifdef __SGX_LVI_HARDENING__ lfence #endif call __mulx_384 ################################# mul_384(t1, a->im, b->im); lea 48($b_ptr), $b_ptr # b->im lea 128+48($a_ptr), $a_ptr # a->im lea 96($r_ptr), $r_ptr # t1 call __mulx_384 ################################# mul_384(t2, a->re+a->im, b->re+b->im); mov 8*1(%rsp), $n_ptr lea ($b_ptr), $a_ptr # b->re lea -48($b_ptr), $b_org # b->im lea 40+192+48(%rsp), $r_ptr #ifdef __SGX_LVI_HARDENING__ lfence #endif call __addx_mod_384 mov 8*3(%rsp), $a_ptr # a->re lea 48($a_ptr), $b_org # a->im lea -48($r_ptr), $r_ptr #ifdef __SGX_LVI_HARDENING__ lfence #endif call __addx_mod_384 lea ($r_ptr),$b_ptr lea 48($r_ptr),$a_ptr call __mulx_384 ################################# t2=t2-t0-t1 lea ($r_ptr), $a_ptr # t2 lea 40(%rsp), $b_org # t0 mov 8*1(%rsp), $n_ptr #ifdef __SGX_LVI_HARDENING__ lfence #endif call __subx_mod_384x384 # t2-t0 lea ($r_ptr), $a_ptr # t2 lea -96($r_ptr), $b_org # t1 call __subx_mod_384x384 # t2-t0-t1 ################################# t0=t0-t1 lea 40(%rsp), $a_ptr lea 40+96(%rsp), $b_org lea 40(%rsp), $r_ptr call __subx_mod_384x384 # t0-t1 lea ($n_ptr), $b_ptr # n_ptr for redc_mont_384 ################################# redc_mont_384(ret->re, t0, mod, n0); lea 40(%rsp), $a_ptr # t0 mov 8*0(%rsp), %rcx # n0 for redc_mont_384 mov 8*4(%rsp), $r_ptr # ret->re call __mulx_by_1_mont_384 call __redx_tail_mont_384 ################################# redc_mont_384(ret->im, t2, mod, n0); lea 40+192(%rsp), $a_ptr # t2 mov 8*0(%rsp), %rcx # n0 for redc_mont_384 lea 48($r_ptr), $r_ptr # ret->im call __mulx_by_1_mont_384 call __redx_tail_mont_384 lea $frame(%rsp), %r8 # size optimization mov 8*0(%r8),%r15 .cfi_restore %r15 mov 8*1(%r8),%r14 .cfi_restore %r14 mov 8*2(%r8),%r13 .cfi_restore %r13 mov 8*3(%r8),%r12 .cfi_restore %r12 mov 8*4(%r8),%rbx .cfi_restore %rbx mov 8*5(%r8),%rbp .cfi_restore %rbp lea 8*6(%r8),%rsp .cfi_adjust_cfa_offset -$frame-8*6 .cfi_epilogue ret .cfi_endproc .size mulx_mont_384x,.-mulx_mont_384x ___ } { my $frame = 4*8 + # place for argument off-load + 2*384/8 + # place for 2 384-bit temporary vectors 8; # alignment $code.=<<___; .globl sqrx_mont_384x .hidden sqrx_mont_384x .type sqrx_mont_384x,\@function,4,"unwind" .align 32 sqrx_mont_384x: .cfi_startproc sqr_mont_384x\$1: push %rbp .cfi_push %rbp push %rbx .cfi_push %rbx push %r12 .cfi_push %r12 push %r13 .cfi_push %r13 push %r14 .cfi_push %r14 push %r15 .cfi_push %r15 sub \$$frame, %rsp .cfi_adjust_cfa_offset $frame .cfi_end_prologue mov $n_ptr, 8*0(%rsp) # n0 mov $b_org, $n_ptr # n_ptr # gap for __mulx_mont_384 mov $r_ptr, 8*2(%rsp) mov $a_ptr, 8*3(%rsp) ################################# add_mod_384(t0, a->re, a->im); lea 48($a_ptr), $b_org # a->im lea 32(%rsp), $r_ptr # t0 call __addx_mod_384 ################################# sub_mod_384(t1, a->re, a->im); mov 8*3(%rsp), $a_ptr # a->re lea 48($a_ptr), $b_org # a->im lea 32+48(%rsp), $r_ptr # t1 call __subx_mod_384 ################################# mul_mont_384(ret->im, a->re, a->im, mod, n0); mov 8*3(%rsp), $a_ptr # a->re lea 48($a_ptr), $b_ptr # a->im #ifdef __SGX_LVI_HARDENING__ lfence #endif mov 48($a_ptr), %rdx mov 8*0($a_ptr), %r14 # @acc[6] mov 8*1($a_ptr), %r15 # @acc[7] mov 8*2($a_ptr), %rax # @acc[8] mov 8*3($a_ptr), %r12 # @acc[4] mov 8*4($a_ptr), %rdi # $lo mov 8*5($a_ptr), %rbp # $hi lea -128($a_ptr), 
$a_ptr # control u-op density lea -128($n_ptr), $n_ptr # control u-op density mulx %r14, %r8, %r9 call __mulx_mont_384 ___ { my @acc = map("%r$_","dx",15,"ax",12,"di","bp", # output from __mulx_mont_384 8..11,13,14); $code.=<<___; add @acc[0], @acc[0] # add with itself adc @acc[1], @acc[1] adc @acc[2], @acc[2] mov @acc[0], @acc[6] adc @acc[3], @acc[3] mov @acc[1], @acc[7] adc @acc[4], @acc[4] mov @acc[2], @acc[8] adc @acc[5], @acc[5] mov @acc[3], @acc[9] sbb $a_ptr, $a_ptr sub 8*0($n_ptr), @acc[0] sbb 8*1($n_ptr), @acc[1] mov @acc[4], @acc[10] sbb 8*2($n_ptr), @acc[2] sbb 8*3($n_ptr), @acc[3] sbb 8*4($n_ptr), @acc[4] mov @acc[5], @acc[11] sbb 8*5($n_ptr), @acc[5] sbb \$0, $a_ptr cmovc @acc[6], @acc[0] cmovc @acc[7], @acc[1] cmovc @acc[8], @acc[2] mov @acc[0], 8*6($b_ptr) # ret->im cmovc @acc[9], @acc[3] mov @acc[1], 8*7($b_ptr) cmovc @acc[10], @acc[4] mov @acc[2], 8*8($b_ptr) cmovc @acc[11], @acc[5] mov @acc[3], 8*9($b_ptr) mov @acc[4], 8*10($b_ptr) mov @acc[5], 8*11($b_ptr) ___ } $code.=<<___; ################################# mul_mont_384(ret->re, t0, t1, mod, n0); lea 32(%rsp), $a_ptr # t0 lea 32+48(%rsp), $b_ptr # t1 mov 32+48(%rsp), %rdx # t1[0] mov 32+8*0(%rsp), %r14 # @acc[6] mov 32+8*1(%rsp), %r15 # @acc[7] mov 32+8*2(%rsp), %rax # @acc[8] mov 32+8*3(%rsp), %r12 # @acc[4] mov 32+8*4(%rsp), %rdi # $lo mov 32+8*5(%rsp), %rbp # $hi lea -128($a_ptr), $a_ptr # control u-op density lea -128($n_ptr), $n_ptr # control u-op density mulx %r14, %r8, %r9 call __mulx_mont_384 lea $frame(%rsp), %r8 # size optimization mov 8*0(%r8),%r15 .cfi_restore %r15 mov 8*1(%r8),%r14 .cfi_restore %r14 mov 8*2(%r8),%r13 .cfi_restore %r13 mov 8*3(%r8),%r12 .cfi_restore %r12 mov 8*4(%r8),%rbx .cfi_restore %rbx mov 8*5(%r8),%rbp .cfi_restore %rbp lea 8*6(%r8),%rsp .cfi_adjust_cfa_offset -$frame-8*6 .cfi_epilogue ret .cfi_endproc .size sqrx_mont_384x,.-sqrx_mont_384x .globl mulx_382x .hidden mulx_382x .type mulx_382x,\@function,4,"unwind" .align 32 mulx_382x: .cfi_startproc mul_382x\$1: push %rbp .cfi_push %rbp push %rbx .cfi_push %rbx push %r12 .cfi_push %r12 push %r13 .cfi_push %r13 push %r14 .cfi_push %r14 push %r15 .cfi_push %r15 sub \$$frame, %rsp .cfi_adjust_cfa_offset $frame .cfi_end_prologue lea 96($r_ptr), $r_ptr # ret->im mov $a_ptr, 8*0(%rsp) mov $b_org, 8*1(%rsp) mov $r_ptr, 8*2(%rsp) # offload ret->im mov $n_ptr, 8*3(%rsp) ################################# t0 = a->re + a->im #ifdef __SGX_LVI_HARDENING__ lfence #endif mov 8*0($a_ptr), @acc[0] mov 8*1($a_ptr), @acc[1] mov 8*2($a_ptr), @acc[2] mov 8*3($a_ptr), @acc[3] mov 8*4($a_ptr), @acc[4] mov 8*5($a_ptr), @acc[5] add 8*6($a_ptr), @acc[0] adc 8*7($a_ptr), @acc[1] adc 8*8($a_ptr), @acc[2] adc 8*9($a_ptr), @acc[3] adc 8*10($a_ptr), @acc[4] adc 8*11($a_ptr), @acc[5] mov @acc[0], 32+8*0(%rsp) mov @acc[1], 32+8*1(%rsp) mov @acc[2], 32+8*2(%rsp) mov @acc[3], 32+8*3(%rsp) mov @acc[4], 32+8*4(%rsp) mov @acc[5], 32+8*5(%rsp) ################################# t1 = b->re + b->im mov 8*0($b_org), @acc[0] mov 8*1($b_org), @acc[1] mov 8*2($b_org), @acc[2] mov 8*3($b_org), @acc[3] mov 8*4($b_org), @acc[4] mov 8*5($b_org), @acc[5] add 8*6($b_org), @acc[0] adc 8*7($b_org), @acc[1] adc 8*8($b_org), @acc[2] adc 8*9($b_org), @acc[3] adc 8*10($b_org), @acc[4] adc 8*11($b_org), @acc[5] mov @acc[0], 32+8*6(%rsp) mov @acc[1], 32+8*7(%rsp) mov @acc[2], 32+8*8(%rsp) mov @acc[3], 32+8*9(%rsp) mov @acc[4], 32+8*10(%rsp) mov @acc[5], 32+8*11(%rsp) ################################# mul_384(ret->im, t0, t1); lea 32+8*0(%rsp), $a_ptr # t0 lea 32+8*6(%rsp), $b_ptr # t1 call 
__mulx_384 ################################# mul_384(ret->re, a->re, b->re); mov 8*0(%rsp), $a_ptr mov 8*1(%rsp), $b_ptr lea -96($r_ptr), $r_ptr # ret->re #ifdef __SGX_LVI_HARDENING__ lfence #endif call __mulx_384 ################################# mul_384(tx, a->im, b->im); lea 48+128($a_ptr), $a_ptr lea 48($b_ptr), $b_ptr lea 32(%rsp), $r_ptr call __mulx_384 ################################# ret->im -= tx mov 8*2(%rsp), $a_ptr # restore ret->im lea 32(%rsp), $b_org mov 8*3(%rsp), $n_ptr mov $a_ptr, $r_ptr #ifdef __SGX_LVI_HARDENING__ lfence #endif call __subx_mod_384x384 ################################# ret->im -= ret->re lea 0($r_ptr), $a_ptr lea -96($r_ptr), $b_org call __subx_mod_384x384 ################################# ret->re -= tx lea -96($r_ptr), $a_ptr lea 32(%rsp), $b_org lea -96($r_ptr), $r_ptr call __subx_mod_384x384 lea $frame(%rsp), %r8 # size optimization mov 8*0(%r8),%r15 .cfi_restore %r15 mov 8*1(%r8),%r14 .cfi_restore %r14 mov 8*2(%r8),%r13 .cfi_restore %r13 mov 8*3(%r8),%r12 .cfi_restore %r12 mov 8*4(%r8),%rbx .cfi_restore %rbx mov 8*5(%r8),%rbp .cfi_restore %rbp lea 8*6(%r8),%rsp .cfi_adjust_cfa_offset -$frame-8*6 .cfi_epilogue ret .cfi_endproc .size mulx_382x,.-mulx_382x ___ } { my @acc=(@acc,"%rax","%rbx","%rbp",$b_org); # all registers are affected # except for $n_ptr and $r_ptr $code.=<<___; .globl sqrx_382x .hidden sqrx_382x .type sqrx_382x,\@function,3,"unwind" .align 32 sqrx_382x: .cfi_startproc sqr_382x\$1: push %rbp .cfi_push %rbp push %rbx .cfi_push %rbx push %r12 .cfi_push %r12 push %r13 .cfi_push %r13 push %r14 .cfi_push %r14 push %r15 .cfi_push %r15 push $a_ptr .cfi_adjust_cfa_offset 8 .cfi_end_prologue mov $b_org, $n_ptr ################################# t0 = a->re + a->im #ifdef __SGX_LVI_HARDENING__ lfence #endif mov 8*0($a_ptr), @acc[6] mov 8*1($a_ptr), @acc[7] mov 8*2($a_ptr), @acc[8] mov 8*3($a_ptr), @acc[9] mov 8*4($a_ptr), @acc[10] mov 8*5($a_ptr), @acc[11] mov @acc[6], @acc[0] add 8*6($a_ptr), @acc[6] mov @acc[7], @acc[1] adc 8*7($a_ptr), @acc[7] mov @acc[8], @acc[2] adc 8*8($a_ptr), @acc[8] mov @acc[9], @acc[3] adc 8*9($a_ptr), @acc[9] mov @acc[10], @acc[4] adc 8*10($a_ptr), @acc[10] mov @acc[11], @acc[5] adc 8*11($a_ptr), @acc[11] mov @acc[6], 8*0($r_ptr) mov @acc[7], 8*1($r_ptr) mov @acc[8], 8*2($r_ptr) mov @acc[9], 8*3($r_ptr) mov @acc[10], 8*4($r_ptr) mov @acc[11], 8*5($r_ptr) ################################# t1 = a->re - a->im lea 48($a_ptr), $b_org lea 48($r_ptr), $r_ptr call __subx_mod_384_a_is_loaded ################################# mul_384(ret->re, t0, t1); lea ($r_ptr), $a_ptr lea -48($r_ptr), $b_ptr lea -48($r_ptr), $r_ptr call __mulx_384 ################################# mul_384(ret->im, a->re, a->im); mov (%rsp), $a_ptr lea 48($a_ptr), $b_ptr lea 96($r_ptr), $r_ptr #ifdef __SGX_LVI_HARDENING__ lfence #endif call __mulx_384 mov 8*0($r_ptr), @acc[0] # double ret->im mov 8*1($r_ptr), @acc[1] mov 8*2($r_ptr), @acc[2] mov 8*3($r_ptr), @acc[3] mov 8*4($r_ptr), @acc[4] mov 8*5($r_ptr), @acc[5] mov 8*6($r_ptr), @acc[6] mov 8*7($r_ptr), @acc[7] mov 8*8($r_ptr), @acc[8] mov 8*9($r_ptr), @acc[9] mov 8*10($r_ptr), @acc[10] add @acc[0], @acc[0] mov 8*11($r_ptr), @acc[11] adc @acc[1], @acc[1] mov @acc[0], 8*0($r_ptr) adc @acc[2], @acc[2] mov @acc[1], 8*1($r_ptr) adc @acc[3], @acc[3] mov @acc[2], 8*2($r_ptr) adc @acc[4], @acc[4] mov @acc[3], 8*3($r_ptr) adc @acc[5], @acc[5] mov @acc[4], 8*4($r_ptr) adc @acc[6], @acc[6] mov @acc[5], 8*5($r_ptr) adc @acc[7], @acc[7] mov @acc[6], 8*6($r_ptr) adc @acc[8], @acc[8] mov @acc[7], 8*7($r_ptr) adc 
@acc[9], @acc[9] mov @acc[8], 8*8($r_ptr) adc @acc[10], @acc[10] mov @acc[9], 8*9($r_ptr) adc @acc[11], @acc[11] mov @acc[10], 8*10($r_ptr) mov @acc[11], 8*11($r_ptr) mov 8*1(%rsp),%r15 .cfi_restore %r15 mov 8*2(%rsp),%r14 .cfi_restore %r14 mov 8*3(%rsp),%r13 .cfi_restore %r13 mov 8*4(%rsp),%r12 .cfi_restore %r12 mov 8*5(%rsp),%rbx .cfi_restore %rbx mov 8*6(%rsp),%rbp .cfi_restore %rbp lea 8*7(%rsp),%rsp .cfi_adjust_cfa_offset -8*7 .cfi_epilogue ret .cfi_endproc .size sqrx_382x,.-sqrx_382x ___ } { ########################################################## 384-bit mulx my ($a0, $a1) = @acc[6..7]; my @acc = @acc[0..5]; my ($lo, $hi, $zr) = ("%rax", "%rcx", "%rbp"); $code.=<<___; .globl mulx_384 .hidden mulx_384 .type mulx_384,\@function,3,"unwind" .align 32 mulx_384: .cfi_startproc mul_384\$1: push %rbp .cfi_push %rbp push %rbx .cfi_push %rbx push %r12 .cfi_push %r12 push %r13 .cfi_push %r13 push %r14 .cfi_push %r14 push %r15 .cfi_push %r15 .cfi_end_prologue mov $b_org, $b_ptr # evacuate from %rdx #ifdef __SGX_LVI_HARDENING__ lfence #endif call __mulx_384 mov 0(%rsp),%r15 .cfi_restore %r15 mov 8(%rsp),%r14 .cfi_restore %r14 mov 16(%rsp),%r13 .cfi_restore %r13 mov 24(%rsp),%r12 .cfi_restore %r12 mov 32(%rsp),%rbx .cfi_restore %rbx mov 40(%rsp),%rbp .cfi_restore %rbp lea 48(%rsp),%rsp .cfi_adjust_cfa_offset -48 .cfi_epilogue ret .cfi_endproc .size mulx_384,.-mulx_384 .type __mulx_384,\@abi-omnipotent .align 32 __mulx_384: mov 8*0($b_ptr), %rdx mov 8*0($a_ptr), $a0 mov 8*1($a_ptr), $a1 mov 8*2($a_ptr), @acc[2] mov 8*3($a_ptr), @acc[3] mov 8*4($a_ptr), @acc[4] mov 8*5($a_ptr), @acc[5] lea -128($a_ptr), $a_ptr mulx $a0, @acc[1], $hi xor $zr, $zr mulx $a1, @acc[0], $lo adcx $hi, @acc[0] mov @acc[1], 8*0($r_ptr) mulx @acc[2], @acc[1], $hi adcx $lo, @acc[1] mulx @acc[3], @acc[2], $lo adcx $hi, @acc[2] mulx @acc[4], @acc[3], $hi adcx $lo, @acc[3] mulx @acc[5], @acc[4], @acc[5] mov 8*1($b_ptr), %rdx adcx $hi, @acc[4] adcx $zr, @acc[5] ___ for(my $i=1; $i<6; $i++) { my $b_next = $i<5 ? 
8*($i+1)."($b_ptr)" : "%rax"; $code.=<<___; mulx $a0, $lo, $hi adcx @acc[0], $lo adox $hi, @acc[1] mov $lo, 8*$i($r_ptr) mulx $a1, @acc[0], $hi adcx @acc[1], $acc[0] adox $hi, @acc[2] mulx 128+8*2($a_ptr), @acc[1], $lo adcx @acc[2], @acc[1] adox $lo, @acc[3] mulx 128+8*3($a_ptr), @acc[2], $hi adcx @acc[3], @acc[2] adox $hi, @acc[4] mulx 128+8*4($a_ptr), @acc[3], $lo adcx @acc[4], @acc[3] adox @acc[5], $lo mulx 128+8*5($a_ptr), @acc[4], @acc[5] mov $b_next, %rdx adcx $lo, @acc[4] adox $zr, @acc[5] adcx $zr, @acc[5] ___ } $code.=<<___; mov @acc[0], 8*6($r_ptr) mov @acc[1], 8*7($r_ptr) mov @acc[2], 8*8($r_ptr) mov @acc[3], 8*9($r_ptr) mov @acc[4], 8*10($r_ptr) mov @acc[5], 8*11($r_ptr) ret .size __mulx_384,.-__mulx_384 ___ } { ########################################################## 384-bit sqrx $code.=<<___; .globl sqrx_384 .hidden sqrx_384 .type sqrx_384,\@function,2,"unwind" .align 32 sqrx_384: .cfi_startproc sqr_384\$1: push %rbp .cfi_push %rbp push %rbx .cfi_push %rbx push %r12 .cfi_push %r12 push %r13 .cfi_push %r13 push %r14 .cfi_push %r14 push %r15 .cfi_push %r15 push $r_ptr .cfi_adjust_cfa_offset 8 .cfi_end_prologue #ifdef __SGX_LVI_HARDENING__ lfence #endif call __sqrx_384 mov 8(%rsp),%r15 .cfi_restore %r15 mov 16(%rsp),%r14 .cfi_restore %r14 mov 24(%rsp),%r13 .cfi_restore %r13 mov 32(%rsp),%r12 .cfi_restore %r12 mov 40(%rsp),%rbx .cfi_restore %rbx mov 48(%rsp),%rbp .cfi_restore %rbp lea 56(%rsp),%rsp .cfi_adjust_cfa_offset -56 .cfi_epilogue ret .cfi_endproc .size sqrx_384,.-sqrx_384 ___ if (0) { # up to 5% slower than below variant my @acc=map("%r$_",("no",8..15,"cx","bx")); push(@acc, $a_ptr); my ($lo, $hi, $carry)=("%rax", "%rbp", "%rno"); $code.=<<___; .type __sqrx_384,\@abi-omnipotent .align 32 __sqrx_384: mov 8*0($a_ptr), %rdx mov 8*1($a_ptr), @acc[7] mov 8*2($a_ptr), @acc[8] mov 8*3($a_ptr), @acc[9] mov 8*4($a_ptr), @acc[10] ######################################### mulx @acc[7], @acc[1], $lo # a[1]*a[0] mov 8*5($a_ptr), @acc[11] mulx @acc[8], @acc[2], $hi # a[2]*a[0] add $lo, @acc[2] mulx @acc[9], @acc[3], $lo # a[3]*a[0] adc $hi, @acc[3] mulx @acc[10], @acc[4], $hi # a[4]*a[0] adc $lo, @acc[4] mulx @acc[11], @acc[5], @acc[6] # a[5]*a[0] adc $hi, @acc[5] adc \$0, @acc[6] mulx %rdx, $lo, $hi # a[0]*a[0] mov @acc[7], %rdx xor @acc[7], @acc[7] add @acc[1], @acc[1] # double acc[1] adc \$0, @acc[7] add $hi, @acc[1] adc \$0, @acc[7] mov $lo, 8*0($r_ptr) mov @acc[1], 8*1($r_ptr) ___ ($carry, @acc[7]) = (@acc[7], @acc[1]); $code.=<<___; ######################################### xor @acc[7], @acc[7] mulx @acc[8], $lo, $hi # a[2]*a[1] adcx $lo, @acc[3] adox $hi, @acc[4] mulx @acc[9], $lo, $hi # a[3]*a[1] adcx $lo, @acc[4] adox $hi, @acc[5] mulx @acc[10], $lo, $hi # a[4]*a[1] adcx $lo, @acc[5] adox $hi, @acc[6] mulx @acc[11], $lo, $hi # a[5]*a[1] adcx $lo, @acc[6] adox @acc[7], $hi adcx $hi, @acc[7] mulx %rdx, $lo, $hi # a[1]*a[1] mov @acc[8], %rdx xor @acc[8], @acc[8] adox @acc[2], @acc[2] # double acc[2:3] adcx $carry, $lo # can't carry adox @acc[3], @acc[3] adcx $lo, @acc[2] adox @acc[8], @acc[8] adcx $hi, @acc[3] adc \$0, @acc[8] mov @acc[2], 8*2($r_ptr) mov @acc[3], 8*3($r_ptr) ___ ($carry,@acc[8])=(@acc[8],$carry); $code.=<<___; ######################################### xor @acc[8], @acc[8] mulx @acc[9], $lo, $hi # a[3]*a[2] adcx $lo, @acc[5] adox $hi, @acc[6] mulx @acc[10], $lo, $hi # a[4]*a[2] adcx $lo, @acc[6] adox $hi, @acc[7] mulx @acc[11], $lo, $hi # a[5]*a[2] adcx $lo, @acc[7] adox @acc[8], $hi adcx $hi, @acc[8] mulx %rdx, $lo, $hi # a[2]*a[2] mov @acc[9], %rdx xor 
@acc[9], @acc[9] adox @acc[4], @acc[4] # double acc[4:5] adcx $carry, $lo # can't carry adox @acc[5], @acc[5] adcx $lo, @acc[4] adox @acc[9], @acc[9] adcx $hi, @acc[5] adc \$0, $acc[9] mov @acc[4], 8*4($r_ptr) mov @acc[5], 8*5($r_ptr) ___ ($carry,@acc[9])=(@acc[9],$carry); $code.=<<___; ######################################### xor @acc[9], @acc[9] mulx @acc[10], $lo, $hi # a[4]*a[3] adcx $lo, @acc[7] adox $hi, @acc[8] mulx @acc[11], $lo, $hi # a[5]*a[3] adcx $lo, @acc[8] adox @acc[9], $hi adcx $hi, @acc[9] mulx %rdx, $lo, $hi mov @acc[10], %rdx xor @acc[10], @acc[10] adox @acc[6], @acc[6] # double acc[6:7] adcx $carry, $lo # can't carry adox @acc[7], @acc[7] adcx $lo, @acc[6] adox @acc[10], @acc[10] adcx $hi, @acc[7] adc \$0, $acc[10] mov @acc[6], 8*6($r_ptr) mov @acc[7], 8*7($r_ptr) ___ ($carry,@acc[10])=(@acc[10],$carry); $code.=<<___; ######################################### mulx @acc[11], $lo, @acc[10] # a[5]*a[4] add $lo, @acc[9] adc \$0, @acc[10] mulx %rdx, $lo, $hi # a[4]*a[4] mov @acc[11], %rdx xor @acc[11], @acc[11] adox @acc[8], @acc[8] # double acc[8:10] adcx $carry, $lo # can't carry adox @acc[9], @acc[9] adcx $lo, @acc[8] adox @acc[10], @acc[10] adcx $hi, @acc[9] adox @acc[11], @acc[11] mov @acc[8], 8*8($r_ptr) mov @acc[9], 8*9($r_ptr) ######################################### mulx %rdx, $lo, $hi # a[5]*a[5] adcx $lo, @acc[10] adcx $hi, @acc[11] mov @acc[10], 8*10($r_ptr) mov @acc[11], 8*11($r_ptr) ret .size __sqrx_384,.-__sqrx_384 ___ } else { my @acc=map("%r$_",("no",8..15,"cx","bx","bp")); my ($lo, $hi)=($r_ptr, "%rax"); $code.=<<___; .type __sqrx_384,\@abi-omnipotent .align 32 __sqrx_384: mov 8*0($a_ptr), %rdx mov 8*1($a_ptr), @acc[7] mov 8*2($a_ptr), @acc[8] mov 8*3($a_ptr), @acc[9] mov 8*4($a_ptr), @acc[10] ######################################### mulx @acc[7], @acc[1], $lo # a[1]*a[0] mov 8*5($a_ptr), @acc[11] mulx @acc[8], @acc[2], $hi # a[2]*a[0] add $lo, @acc[2] mulx @acc[9], @acc[3], $lo # a[3]*a[0] adc $hi, @acc[3] mulx @acc[10], @acc[4], $hi # a[4]*a[0] adc $lo, @acc[4] mulx @acc[11], @acc[5], @acc[6] # a[5]*a[0] mov @acc[7], %rdx adc $hi, @acc[5] adc \$0, @acc[6] ######################################### xor @acc[7], @acc[7] mulx @acc[8], $lo, $hi # a[2]*a[1] adcx $lo, @acc[3] adox $hi, @acc[4] mulx @acc[9], $lo, $hi # a[3]*a[1] adcx $lo, @acc[4] adox $hi, @acc[5] mulx @acc[10], $lo, $hi # a[4]*a[1] adcx $lo, @acc[5] adox $hi, @acc[6] mulx @acc[11], $lo, $hi # a[5]*a[1] mov @acc[8], %rdx adcx $lo, @acc[6] adox @acc[7], $hi adcx $hi, @acc[7] ######################################### xor @acc[8], @acc[8] mulx @acc[9], $lo, $hi # a[3]*a[2] adcx $lo, @acc[5] adox $hi, @acc[6] mulx @acc[10], $lo, $hi # a[4]*a[2] adcx $lo, @acc[6] adox $hi, @acc[7] mulx @acc[11], $lo, $hi # a[5]*a[2] mov @acc[9], %rdx adcx $lo, @acc[7] adox @acc[8], $hi adcx $hi, @acc[8] ######################################### xor @acc[9], @acc[9] mulx @acc[10], $lo, $hi # a[4]*a[3] adcx $lo, @acc[7] adox $hi, @acc[8] mulx @acc[11], $lo, $hi # a[5]*a[3] mov @acc[10], %rdx adcx $lo, @acc[8] adox @acc[9], $hi adcx $hi, @acc[9] ######################################### mulx @acc[11], $lo, @acc[10] # a[5]*a[4] mov 8*0($a_ptr), %rdx add $lo, @acc[9] mov 8(%rsp), $r_ptr # restore $r_ptr adc \$0, @acc[10] ######################################### double acc[1:10] xor @acc[11], @acc[11] adcx @acc[1], @acc[1] adcx @acc[2], @acc[2] adcx @acc[3], @acc[3] adcx @acc[4], @acc[4] adcx @acc[5], @acc[5] ######################################### accumulate a[i]*a[i] mulx %rdx, %rdx, $hi # a[0]*a[0] mov %rdx, 
8*0($r_ptr) mov 8*1($a_ptr), %rdx adox $hi, @acc[1] mov @acc[1], 8*1($r_ptr) mulx %rdx, @acc[1], $hi # a[1]*a[1] mov 8*2($a_ptr), %rdx adox @acc[1], @acc[2] adox $hi, @acc[3] mov @acc[2], 8*2($r_ptr) mov @acc[3], 8*3($r_ptr) mulx %rdx, @acc[1], @acc[2] # a[2]*a[2] mov 8*3($a_ptr), %rdx adox @acc[1], @acc[4] adox @acc[2], @acc[5] adcx @acc[6], @acc[6] adcx @acc[7], @acc[7] mov @acc[4], 8*4($r_ptr) mov @acc[5], 8*5($r_ptr) mulx %rdx, @acc[1], @acc[2] # a[3]*a[3] mov 8*4($a_ptr), %rdx adox @acc[1], @acc[6] adox @acc[2], @acc[7] adcx @acc[8], @acc[8] adcx @acc[9], @acc[9] mov @acc[6], 8*6($r_ptr) mov @acc[7], 8*7($r_ptr) mulx %rdx, @acc[1], @acc[2] # a[4]*a[4] mov 8*5($a_ptr), %rdx adox @acc[1], @acc[8] adox @acc[2], @acc[9] adcx @acc[10], @acc[10] adcx @acc[11], @acc[11] mov @acc[8], 8*8($r_ptr) mov @acc[9], 8*9($r_ptr) mulx %rdx, @acc[1], @acc[2] # a[5]*a[5] adox @acc[1], @acc[10] adox @acc[2], @acc[11] mov @acc[10], 8*10($r_ptr) mov @acc[11], 8*11($r_ptr) ret .size __sqrx_384,.-__sqrx_384 ___ } { ########################################################## 384-bit redcx_mont my ($n_ptr, $n0)=($b_ptr, $n_ptr); # arguments are "shifted" my ($lo, $hi) = ("%rax", "%rbp"); $code.=<<___; ######################################################################## # void redcx_mont_384(uint64_t ret[6], const uint64_t a[12], # uint64_t m[6], uint64_t n0); .globl redcx_mont_384 .hidden redcx_mont_384 .type redcx_mont_384,\@function,4,"unwind" .align 32 redcx_mont_384: .cfi_startproc redc_mont_384\$1: push %rbp .cfi_push %rbp push %rbx .cfi_push %rbx push %r12 .cfi_push %r12 push %r13 .cfi_push %r13 push %r14 .cfi_push %r14 push %r15 .cfi_push %r15 sub \$8, %rsp .cfi_adjust_cfa_offset 8 .cfi_end_prologue mov $b_org, $n_ptr #ifdef __SGX_LVI_HARDENING__ lfence #endif call __mulx_by_1_mont_384 call __redx_tail_mont_384 mov 8(%rsp),%r15 .cfi_restore %r15 mov 16(%rsp),%r14 .cfi_restore %r14 mov 24(%rsp),%r13 .cfi_restore %r13 mov 32(%rsp),%r12 .cfi_restore %r12 mov 40(%rsp),%rbx .cfi_restore %rbx mov 48(%rsp),%rbp .cfi_restore %rbp lea 56(%rsp),%rsp .cfi_adjust_cfa_offset -56 .cfi_epilogue ret .cfi_endproc .size redcx_mont_384,.-redcx_mont_384 ######################################################################## # void fromx_mont_384(uint64_t ret[6], const uint64_t a[6], # uint64_t m[6], uint64_t n0); .globl fromx_mont_384 .hidden fromx_mont_384 .type fromx_mont_384,\@function,4,"unwind" .align 32 fromx_mont_384: .cfi_startproc from_mont_384\$1: push %rbp .cfi_push %rbp push %rbx .cfi_push %rbx push %r12 .cfi_push %r12 push %r13 .cfi_push %r13 push %r14 .cfi_push %r14 push %r15 .cfi_push %r15 sub \$8, %rsp .cfi_adjust_cfa_offset 8 .cfi_end_prologue mov $b_org, $n_ptr #ifdef __SGX_LVI_HARDENING__ lfence #endif call __mulx_by_1_mont_384 ################################# # Branch-less conditional acc[0:6] - modulus mov @acc[6], %rax mov @acc[7], %rcx mov @acc[0], %rdx mov @acc[1], %rbp sub 8*0($n_ptr), @acc[6] sbb 8*1($n_ptr), @acc[7] mov @acc[2], @acc[5] sbb 8*2($n_ptr), @acc[0] sbb 8*3($n_ptr), @acc[1] sbb 8*4($n_ptr), @acc[2] mov @acc[3], $a_ptr sbb 8*5($n_ptr), @acc[3] cmovc %rax, @acc[6] cmovc %rcx, @acc[7] cmovc %rdx, @acc[0] mov @acc[6], 8*0($r_ptr) cmovc %rbp, @acc[1] mov @acc[7], 8*1($r_ptr) cmovc @acc[5], @acc[2] mov @acc[0], 8*2($r_ptr) cmovc $a_ptr, @acc[3] mov @acc[1], 8*3($r_ptr) mov @acc[2], 8*4($r_ptr) mov @acc[3], 8*5($r_ptr) mov 8(%rsp),%r15 .cfi_restore %r15 mov 16(%rsp),%r14 .cfi_restore %r14 mov 24(%rsp),%r13 .cfi_restore %r13 mov 32(%rsp),%r12 .cfi_restore %r12 mov 40(%rsp),%rbx 
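___
# Both __mulx_by_1_mont_384 callers above finish with the "branch-less
# conditional acc[0:6] - modulus" pattern: subtract the modulus
# unconditionally, then let the final borrow drive cmovc, so that no
# branch ever depends on secret data. A reference model of the idiom on
# plain integers (BLS12-381 base field p; guarded out of the build like
# the other disabled sketches in these files):
if (0) {
    use Math::BigInt;

    my $p = Math::BigInt->new("0x1a0111ea397fe69a4b1ba7b6434bacd7".
                              "64774b84f38512bf6730d2a0f6b0f624".
                              "1eabfffeb153ffffb9feffffffffaaab");
    for my $x ($p->copy()->bsub(1), $p->copy()->badd(5)) {  # any x < 2*p
        my $t = $x->copy()->bsub($p);
        my $r = $t->is_neg() ? $x : $t;  # the cmovc on the borrow
        die "conditional subtraction mismatch"
            unless $r->bcmp($p) < 0 && !$r->is_neg();
    }
}
$code.=<<___;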
.cfi_restore %rbx mov 48(%rsp),%rbp .cfi_restore %rbp lea 56(%rsp),%rsp .cfi_adjust_cfa_offset -56 .cfi_epilogue ret .cfi_endproc .size fromx_mont_384,.-fromx_mont_384 ___ { my @acc=@acc; # will be rotated locally $code.=<<___; .type __mulx_by_1_mont_384,\@abi-omnipotent .align 32 __mulx_by_1_mont_384: mov 8*0($a_ptr), @acc[0] mov $n0, %rdx mov 8*1($a_ptr), @acc[1] mov 8*2($a_ptr), @acc[2] mov 8*3($a_ptr), @acc[3] mov 8*4($a_ptr), @acc[4] mov 8*5($a_ptr), @acc[5] ___ for (my $i=0; $i<6; $i++) { $code.=<<___; imulq @acc[0], %rdx ################################# reduction $i xor @acc[6], @acc[6] # @acc[6]=0, cf=0, of=0 mulx 8*0($n_ptr), $lo, $hi adcx $lo, @acc[0] # guaranteed to be zero adox $hi, @acc[1] mulx 8*1($n_ptr), $lo, $hi adcx $lo, @acc[1] adox $hi, @acc[2] mulx 8*2($n_ptr), $lo, $hi adcx $lo, @acc[2] adox $hi, @acc[3] mulx 8*3($n_ptr), $lo, $hi adcx $lo, @acc[3] adox $hi, @acc[4] mulx 8*4($n_ptr), $lo, $hi adcx $lo, @acc[4] adox $hi, @acc[5] mulx 8*5($n_ptr), $lo, $hi mov $n0, %rdx adcx $lo, @acc[5] adox @acc[6], $hi adcx $hi, @acc[6] ___ push(@acc,shift(@acc)); } $code.=<<___; ret .size __mulx_by_1_mont_384,.-__mulx_by_1_mont_384 .type __redx_tail_mont_384,\@abi-omnipotent .align 32 __redx_tail_mont_384: add 8*6($a_ptr), @acc[0] # accumulate upper half mov @acc[0], %rax adc 8*7($a_ptr), @acc[1] adc 8*8($a_ptr), @acc[2] adc 8*9($a_ptr), @acc[3] mov @acc[1], %rcx adc 8*10($a_ptr), @acc[4] adc 8*11($a_ptr), @acc[5] sbb @acc[6], @acc[6] ################################# # Branch-less conditional acc[0:6] - modulus mov @acc[2], %rdx mov @acc[3], %rbp sub 8*0($n_ptr), @acc[0] sbb 8*1($n_ptr), @acc[1] mov @acc[4], @acc[7] sbb 8*2($n_ptr), @acc[2] sbb 8*3($n_ptr), @acc[3] sbb 8*4($n_ptr), @acc[4] mov @acc[5], $a_ptr sbb 8*5($n_ptr), @acc[5] sbb \$0, @acc[6] cmovc %rax, @acc[0] cmovc %rcx, @acc[1] cmovc %rdx, @acc[2] mov @acc[0], 8*0($r_ptr) cmovc %rbp, @acc[3] mov @acc[1], 8*1($r_ptr) cmovc @acc[7], @acc[4] mov @acc[2], 8*2($r_ptr) cmovc $a_ptr, @acc[5] mov @acc[3], 8*3($r_ptr) mov @acc[4], 8*4($r_ptr) mov @acc[5], 8*5($r_ptr) ret .size __redx_tail_mont_384,.-__redx_tail_mont_384 .globl sgn0x_pty_mont_384 .hidden sgn0x_pty_mont_384 .type sgn0x_pty_mont_384,\@function,3,"unwind" .align 32 sgn0x_pty_mont_384: .cfi_startproc sgn0_pty_mont_384\$1: push %rbp .cfi_push %rbp push %rbx .cfi_push %rbx push %r12 .cfi_push %r12 push %r13 .cfi_push %r13 push %r14 .cfi_push %r14 push %r15 .cfi_push %r15 sub \$8, %rsp .cfi_adjust_cfa_offset 8 .cfi_end_prologue mov $a_ptr, $n_ptr lea 0($r_ptr), $a_ptr mov $b_org, $n0 #ifdef __SGX_LVI_HARDENING__ lfence #endif call __mulx_by_1_mont_384 xor %rax, %rax mov @acc[0], @acc[7] add @acc[0], @acc[0] adc @acc[1], @acc[1] adc @acc[2], @acc[2] adc @acc[3], @acc[3] adc @acc[4], @acc[4] adc @acc[5], @acc[5] adc \$0, %rax sub 8*0($n_ptr), @acc[0] sbb 8*1($n_ptr), @acc[1] sbb 8*2($n_ptr), @acc[2] sbb 8*3($n_ptr), @acc[3] sbb 8*4($n_ptr), @acc[4] sbb 8*5($n_ptr), @acc[5] sbb \$0, %rax not %rax # 2*x > p, which means "negative" and \$1, @acc[7] and \$2, %rax or @acc[7], %rax # pack sign and parity mov 8(%rsp),%r15 .cfi_restore %r15 mov 16(%rsp),%r14 .cfi_restore %r14 mov 24(%rsp),%r13 .cfi_restore %r13 mov 32(%rsp),%r12 .cfi_restore %r12 mov 40(%rsp),%rbx .cfi_restore %rbx mov 48(%rsp),%rbp .cfi_restore %rbp lea 56(%rsp),%rsp .cfi_adjust_cfa_offset -56 .cfi_epilogue ret .cfi_endproc .size sgn0x_pty_mont_384,.-sgn0x_pty_mont_384 .globl sgn0x_pty_mont_384x .hidden sgn0x_pty_mont_384x .type sgn0x_pty_mont_384x,\@function,3,"unwind" .align 32 sgn0x_pty_mont_384x: 
.cfi_startproc sgn0_pty_mont_384x\$1: push %rbp .cfi_push %rbp push %rbx .cfi_push %rbx push %r12 .cfi_push %r12 push %r13 .cfi_push %r13 push %r14 .cfi_push %r14 push %r15 .cfi_push %r15 sub \$8, %rsp .cfi_adjust_cfa_offset 8 .cfi_end_prologue mov $a_ptr, $n_ptr lea 48($r_ptr), $a_ptr # sgn0(a->im) mov $b_org, $n0 #ifdef __SGX_LVI_HARDENING__ lfence #endif call __mulx_by_1_mont_384 mov @acc[0], @acc[6] or @acc[1], @acc[0] or @acc[2], @acc[0] or @acc[3], @acc[0] or @acc[4], @acc[0] or @acc[5], @acc[0] lea 0($r_ptr), $a_ptr # sgn0(a->re) xor $r_ptr, $r_ptr mov @acc[6], @acc[7] add @acc[6], @acc[6] adc @acc[1], @acc[1] adc @acc[2], @acc[2] adc @acc[3], @acc[3] adc @acc[4], @acc[4] adc @acc[5], @acc[5] adc \$0, $r_ptr sub 8*0($n_ptr), @acc[6] sbb 8*1($n_ptr), @acc[1] sbb 8*2($n_ptr), @acc[2] sbb 8*3($n_ptr), @acc[3] sbb 8*4($n_ptr), @acc[4] sbb 8*5($n_ptr), @acc[5] sbb \$0, $r_ptr mov @acc[0], 0(%rsp) # a->im is zero or not not $r_ptr # 2*x > p, which means "negative" and \$1, @acc[7] and \$2, $r_ptr or @acc[7], $r_ptr # pack sign and parity call __mulx_by_1_mont_384 mov @acc[0], @acc[6] or @acc[1], @acc[0] or @acc[2], @acc[0] or @acc[3], @acc[0] or @acc[4], @acc[0] or @acc[5], @acc[0] xor %rax, %rax mov @acc[6], @acc[7] add @acc[6], @acc[6] adc @acc[1], @acc[1] adc @acc[2], @acc[2] adc @acc[3], @acc[3] adc @acc[4], @acc[4] adc @acc[5], @acc[5] adc \$0, %rax sub 8*0($n_ptr), @acc[6] sbb 8*1($n_ptr), @acc[1] sbb 8*2($n_ptr), @acc[2] sbb 8*3($n_ptr), @acc[3] sbb 8*4($n_ptr), @acc[4] sbb 8*5($n_ptr), @acc[5] sbb \$0, %rax mov 0(%rsp), @acc[6] not %rax # 2*x > p, which means "negative" test @acc[0], @acc[0] cmovz $r_ptr, @acc[7] # a->re==0? prty(a->im) : prty(a->re) test @acc[6], @acc[6] cmovnz $r_ptr, %rax # a->im!=0? sgn0(a->im) : sgn0(a->re) and \$1, @acc[7] and \$2, %rax or @acc[7], %rax # pack sign and parity mov 8(%rsp),%r15 .cfi_restore %r15 mov 16(%rsp),%r14 .cfi_restore %r14 mov 24(%rsp),%r13 .cfi_restore %r13 mov 32(%rsp),%r12 .cfi_restore %r12 mov 40(%rsp),%rbx .cfi_restore %rbx mov 48(%rsp),%rbp .cfi_restore %rbp lea 56(%rsp),%rsp .cfi_adjust_cfa_offset -56 .cfi_epilogue ret .cfi_endproc .size sgn0x_pty_mont_384x,.-sgn0x_pty_mont_384x ___ } } { ########################################################## mulx/sqrx_mont my @acc = (@acc, "%rax"); my ($lo,$hi)=("%rdi","%rbp"); $code.=<<___; .globl mulx_mont_384 .hidden mulx_mont_384 .type mulx_mont_384,\@function,5,"unwind" .align 32 mulx_mont_384: .cfi_startproc mul_mont_384\$1: push %rbp .cfi_push %rbp push %rbx .cfi_push %rbx push %r12 .cfi_push %r12 push %r13 .cfi_push %r13 push %r14 .cfi_push %r14 push %r15 .cfi_push %r15 lea -8*3(%rsp), %rsp .cfi_adjust_cfa_offset 8*3 .cfi_end_prologue mov $b_org, $b_ptr # evacuate from %rdx #ifdef __SGX_LVI_HARDENING__ lfence #endif mov 8*0($b_org), %rdx mov 8*0($a_ptr), @acc[6] mov 8*1($a_ptr), @acc[7] mov 8*2($a_ptr), @acc[8] mov 8*3($a_ptr), @acc[4] mov $r_ptr, 8*2(%rsp) mov 8*4($a_ptr), $lo mov 8*5($a_ptr), $hi lea -128($a_ptr), $a_ptr # control u-op density lea -128($n_ptr), $n_ptr # control u-op density mov $n0, (%rsp) mulx @acc[6],@acc[0],@acc[1] # a[0]*b[0] call __mulx_mont_384 mov 8*3(%rsp),%r15 .cfi_restore %r15 mov 8*4(%rsp),%r14 .cfi_restore %r14 mov 8*5(%rsp),%r13 .cfi_restore %r13 mov 8*6(%rsp),%r12 .cfi_restore %r12 mov 8*7(%rsp),%rbx .cfi_restore %rbx mov 8*8(%rsp),%rbp .cfi_restore %rbp lea 8*9(%rsp),%rsp .cfi_adjust_cfa_offset -8*9 .cfi_epilogue ret .cfi_endproc .size mulx_mont_384,.-mulx_mont_384 ___ { my @acc=@acc; # will be rotated locally $code.=<<___; .type 
__mulx_mont_384,\@abi-omnipotent .align 32 __mulx_mont_384: .cfi_startproc mulx @acc[7], @acc[6], @acc[2] mulx @acc[8], @acc[7], @acc[3] add @acc[6], @acc[1] mulx @acc[4], @acc[8], @acc[4] adc @acc[7], @acc[2] mulx $lo, $lo, @acc[5] adc @acc[8], @acc[3] mulx $hi, $hi, @acc[6] mov 8($b_ptr), %rdx adc $lo, @acc[4] adc $hi, @acc[5] adc \$0, @acc[6] xor @acc[7], @acc[7] ___ for (my $i=1; $i<6; $i++) { my $tt = $i==1 ? @acc[7] : $hi; my $b_next = $i<5 ? 8*($i+1)."($b_ptr)" : @acc[1]; $code.=<<___; mov @acc[0], 16(%rsp) imulq 8(%rsp), @acc[0] ################################# Multiply by b[$i] xor @acc[8], @acc[8] # @acc[8]=0, cf=0, of=0 mulx 8*0+128($a_ptr), $lo, $hi adox $lo, @acc[1] adcx $hi, @acc[2] mulx 8*1+128($a_ptr), $lo, $hi adox $lo, @acc[2] adcx $hi, @acc[3] mulx 8*2+128($a_ptr), $lo, $hi adox $lo, @acc[3] adcx $hi, @acc[4] mulx 8*3+128($a_ptr), $lo, $hi adox $lo, @acc[4] adcx $hi, @acc[5] mulx 8*4+128($a_ptr), $lo, $hi adox $lo, @acc[5] adcx $hi, @acc[6] mulx 8*5+128($a_ptr), $lo, $hi mov @acc[0], %rdx adox $lo, @acc[6] adcx $hi, @acc[7] # cf=0 adox @acc[8], @acc[7] adox @acc[8], @acc[8] ################################# reduction xor @acc[0], @acc[0] # acc[0]=0, cf=0, of=0 mulx 8*0+128($n_ptr), $lo, $hi adcx 16(%rsp), $lo # guaranteed to be zero adox $hi, @acc[1] mulx 8*1+128($n_ptr), $lo, $hi adcx $lo, @acc[1] adox $hi, @acc[2] mulx 8*2+128($n_ptr), $lo, $hi adcx $lo, @acc[2] adox $hi, @acc[3] mulx 8*3+128($n_ptr), $lo, $hi adcx $lo, @acc[3] adox $hi, @acc[4] mulx 8*4+128($n_ptr), $lo, $hi adcx $lo, @acc[4] adox $hi, @acc[5] mulx 8*5+128($n_ptr), $lo, $hi mov $b_next, %rdx adcx $lo, @acc[5] adox $hi, @acc[6] adcx @acc[0], @acc[6] adox @acc[0], @acc[7] adcx @acc[0], @acc[7] adox @acc[0], @acc[8] adcx @acc[0], @acc[8] ___ push(@acc,shift(@acc)); } $code.=<<___; imulq 8(%rsp), %rdx mov 8*3(%rsp), $b_ptr # restore $r_ptr ################################# last reduction xor @acc[8], @acc[8] # @acc[8]=0, cf=0, of=0 mulx 8*0+128($n_ptr), $lo, $hi adcx $lo, @acc[0] # guaranteed to be zero adox $hi, @acc[1] mulx 8*1+128($n_ptr), $lo, $hi adcx $lo, @acc[1] adox $hi, @acc[2] mulx 8*2+128($n_ptr), $lo, $hi adcx $lo, @acc[2] adox $hi, @acc[3] mulx 8*3+128($n_ptr), $lo, $hi adcx $lo, @acc[3] adox $hi, @acc[4] mov @acc[2], @acc[0] mulx 8*4+128($n_ptr), $lo, $hi adcx $lo, @acc[4] adox $hi, @acc[5] mov @acc[3], $a_ptr mulx 8*5+128($n_ptr), $lo, $hi adcx $lo, @acc[5] adox $hi, @acc[6] mov @acc[1], %rdx adcx @acc[8], @acc[6] adox @acc[8], @acc[7] lea 128($n_ptr), $n_ptr mov @acc[4], @acc[8] adc \$0, @acc[7] ################################# # Branch-less conditional acc[1:7] - modulus sub 8*0($n_ptr), @acc[1] sbb 8*1($n_ptr), @acc[2] mov @acc[5], $lo sbb 8*2($n_ptr), @acc[3] sbb 8*3($n_ptr), @acc[4] sbb 8*4($n_ptr), @acc[5] mov @acc[6], $hi sbb 8*5($n_ptr), @acc[6] sbb \$0, @acc[7] cmovnc @acc[1], %rdx cmovc @acc[0], @acc[2] cmovc $a_ptr, @acc[3] cmovnc @acc[4], @acc[8] mov %rdx, 8*0($b_ptr) cmovnc @acc[5], $lo mov @acc[2], 8*1($b_ptr) cmovnc @acc[6], $hi mov @acc[3], 8*2($b_ptr) mov @acc[8], 8*3($b_ptr) mov $lo, 8*4($b_ptr) mov $hi, 8*5($b_ptr) ret # __SGX_LVI_HARDENING_CLOBBER__=%rsi .cfi_endproc .size __mulx_mont_384,.-__mulx_mont_384 ___ } $code.=<<___; .globl sqrx_mont_384 .hidden sqrx_mont_384 .type sqrx_mont_384,\@function,4,"unwind" .align 32 sqrx_mont_384: .cfi_startproc sqr_mont_384\$1: push %rbp .cfi_push %rbp push %rbx .cfi_push %rbx push %r12 .cfi_push %r12 push %r13 .cfi_push %r13 push %r14 .cfi_push %r14 push %r15 .cfi_push %r15 lea -8*3(%rsp), %rsp .cfi_adjust_cfa_offset 8*3 
.cfi_end_prologue mov $n_ptr, $n0 # n0 lea -128($b_org), $n_ptr # control u-op density #ifdef __SGX_LVI_HARDENING__ lfence #endif mov 8*0($a_ptr), %rdx mov 8*1($a_ptr), @acc[7] mov 8*2($a_ptr), @acc[8] mov 8*3($a_ptr), @acc[4] mov $r_ptr, 8*2(%rsp) mov 8*4($a_ptr), $lo mov 8*5($a_ptr), $hi lea ($a_ptr), $b_ptr mov $n0, (%rsp) # n0 lea -128($a_ptr), $a_ptr # control u-op density mulx %rdx, @acc[0], @acc[1] # a[0]*a[0] call __mulx_mont_384 # as fast as dedicated squaring mov 8*3(%rsp),%r15 .cfi_restore %r15 mov 8*4(%rsp),%r14 .cfi_restore %r14 mov 8*5(%rsp),%r13 .cfi_restore %r13 mov 8*6(%rsp),%r12 .cfi_restore %r12 mov 8*7(%rsp),%rbx .cfi_restore %rbx mov 8*8(%rsp),%rbp .cfi_restore %rbp lea 8*9(%rsp),%rsp .cfi_adjust_cfa_offset -8*9 .cfi_epilogue ret .cfi_endproc .size sqrx_mont_384,.-sqrx_mont_384 .globl sqrx_n_mul_mont_384 .hidden sqrx_n_mul_mont_384 .type sqrx_n_mul_mont_384,\@function,6,"unwind" .align 32 sqrx_n_mul_mont_384: .cfi_startproc sqr_n_mul_mont_384\$1: push %rbp .cfi_push %rbp push %rbx .cfi_push %rbx push %r12 .cfi_push %r12 push %r13 .cfi_push %r13 push %r14 .cfi_push %r14 push %r15 .cfi_push %r15 lea -8*5(%rsp), %rsp .cfi_adjust_cfa_offset 8*5 .cfi_end_prologue mov $b_org, @acc[2] # loop counter #ifdef __SGX_LVI_HARDENING__ lfence #endif mov 8*0($a_ptr), %rdx mov 8*1($a_ptr), @acc[7] mov 8*2($a_ptr), @acc[8] mov $a_ptr, $b_ptr mov 8*3($a_ptr), @acc[4] mov $r_ptr, 8*2(%rsp) # to __mulx_mont_384 mov 8*4($a_ptr), $lo mov 8*5($a_ptr), $hi mov $n0, (%rsp) mov %r9, 8*3(%rsp) # 6th, multiplicand argument movq 8*0(%r9), %xmm2 # prefetch b[0] .Loop_sqrx_384: movd @acc[2]d, %xmm1 lea -128($b_ptr), $a_ptr # control u-op density lea -128($n_ptr), $n_ptr # control u-op density mulx %rdx, @acc[0], @acc[1] # a[0]*a[0] call __mulx_mont_384 movd %xmm1, @acc[2]d dec @acc[2]d jnz .Loop_sqrx_384 mov %rdx, @acc[6] movq %xmm2, %rdx # b[0] lea -128($b_ptr), $a_ptr # control u-op density mov 8*3(%rsp), $b_ptr # 6th, multiplicand argument lea -128($n_ptr), $n_ptr # control u-op density mulx @acc[6],@acc[0],@acc[1] # a[0]*b[0] call __mulx_mont_384 mov 8*5(%rsp),%r15 .cfi_restore %r15 mov 8*6(%rsp),%r14 .cfi_restore %r14 mov 8*7(%rsp),%r13 .cfi_restore %r13 mov 8*8(%rsp),%r12 .cfi_restore %r12 mov 8*9(%rsp),%rbx .cfi_restore %rbx mov 8*10(%rsp),%rbp .cfi_restore %rbp lea 8*11(%rsp),%rsp .cfi_adjust_cfa_offset -8*11 .cfi_epilogue ret .cfi_endproc .size sqrx_n_mul_mont_384,.-sqrx_n_mul_mont_384 .globl sqrx_n_mul_mont_383 .hidden sqrx_n_mul_mont_383 .type sqrx_n_mul_mont_383,\@function,6,"unwind" .align 32 sqrx_n_mul_mont_383: .cfi_startproc sqr_n_mul_mont_383\$1: push %rbp .cfi_push %rbp push %rbx .cfi_push %rbx push %r12 .cfi_push %r12 push %r13 .cfi_push %r13 push %r14 .cfi_push %r14 push %r15 .cfi_push %r15 lea -8*5(%rsp), %rsp .cfi_adjust_cfa_offset 8*5 .cfi_end_prologue mov $b_org, @acc[2] # loop counter #ifdef __SGX_LVI_HARDENING__ lfence #endif mov 8*0($a_ptr), %rdx mov 8*1($a_ptr), @acc[7] mov 8*2($a_ptr), @acc[8] mov $a_ptr, $b_ptr mov 8*3($a_ptr), @acc[4] mov $r_ptr, 8*2(%rsp) # to __mulx_mont_383_nonred mov 8*4($a_ptr), $lo mov 8*5($a_ptr), $hi mov $n0, (%rsp) mov %r9, 8*3(%rsp) # 6th, multiplicand argument movq 8*0(%r9), %xmm2 # prefetch b[0] lea -128($n_ptr), $n_ptr # control u-op density .Loop_sqrx_383: movd @acc[2]d, %xmm1 lea -128($b_ptr), $a_ptr # control u-op density mulx %rdx, @acc[0], @acc[1] # a[0]*a[0] call __mulx_mont_383_nonred # omitting full reduction gives ~15% # in addition-chains movd %xmm1, @acc[2]d dec @acc[2]d jnz .Loop_sqrx_383 mov %rdx, @acc[6] movq %xmm2, %rdx # 
b[0] lea -128($b_ptr), $a_ptr # control u-op density mov 8*3(%rsp), $b_ptr # 6th, multiplicand argument mulx @acc[6], @acc[0], @acc[1] # a[0]*b[0] call __mulx_mont_384 mov 8*5(%rsp),%r15 .cfi_restore %r15 mov 8*6(%rsp),%r14 .cfi_restore %r14 mov 8*7(%rsp),%r13 .cfi_restore %r13 mov 8*8(%rsp),%r12 .cfi_restore %r12 mov 8*9(%rsp),%rbx .cfi_restore %rbx mov 8*10(%rsp),%rbp .cfi_restore %rbp lea 8*11(%rsp),%rsp .cfi_adjust_cfa_offset -8*11 .cfi_epilogue ret .cfi_endproc .size sqrx_n_mul_mont_383,.-sqrx_n_mul_mont_383 ___ { my @acc=@acc; # will be rotated locally $code.=<<___; .type __mulx_mont_383_nonred,\@abi-omnipotent .align 32 __mulx_mont_383_nonred: .cfi_startproc mulx @acc[7], @acc[6], @acc[2] mulx @acc[8], @acc[7], @acc[3] add @acc[6], @acc[1] mulx @acc[4], @acc[8], @acc[4] adc @acc[7], @acc[2] mulx $lo, $lo, @acc[5] adc @acc[8], @acc[3] mulx $hi, $hi, @acc[6] mov 8($b_ptr), %rdx adc $lo, @acc[4] adc $hi, @acc[5] adc \$0, @acc[6] ___ for (my $i=1; $i<6; $i++) { my $tt = $i==1 ? @acc[7] : $hi; my $b_next = $i<5 ? 8*($i+1)."($b_ptr)" : @acc[1]; $code.=<<___; mov @acc[0], @acc[8] imulq 8(%rsp), @acc[0] ################################# Multiply by b[$i] xor @acc[7], @acc[7] # @acc[8]=0, cf=0, of=0 mulx 8*0+128($a_ptr), $lo, $hi adox $lo, @acc[1] adcx $hi, @acc[2] mulx 8*1+128($a_ptr), $lo, $hi adox $lo, @acc[2] adcx $hi, @acc[3] mulx 8*2+128($a_ptr), $lo, $hi adox $lo, @acc[3] adcx $hi, @acc[4] mulx 8*3+128($a_ptr), $lo, $hi adox $lo, @acc[4] adcx $hi, @acc[5] mulx 8*4+128($a_ptr), $lo, $hi adox $lo, @acc[5] adcx $hi, @acc[6] mulx 8*5+128($a_ptr), $lo, $hi mov @acc[0], %rdx adox $lo, @acc[6] adcx @acc[7], $hi adox $hi, @acc[7] ################################# reduction xor @acc[0], @acc[0] # acc[0]=0, cf=0, of=0 mulx 8*0+128($n_ptr), $lo, $hi adcx $lo, @acc[8] # guaranteed to be zero adox $hi, @acc[1] mulx 8*1+128($n_ptr), $lo, $hi adcx $lo, @acc[1] adox $hi, @acc[2] mulx 8*2+128($n_ptr), $lo, $hi adcx $lo, @acc[2] adox $hi, @acc[3] mulx 8*3+128($n_ptr), $lo, $hi adcx $lo, @acc[3] adox $hi, @acc[4] mulx 8*4+128($n_ptr), $lo, $hi adcx $lo, @acc[4] adox $hi, @acc[5] mulx 8*5+128($n_ptr), $lo, $hi mov $b_next, %rdx adcx $lo, @acc[5] adox $hi, @acc[6] adcx @acc[8], @acc[6] adox @acc[8], @acc[7] adcx @acc[8], @acc[7] ___ push(@acc,shift(@acc)); } $code.=<<___; imulq 8(%rsp), %rdx mov 8*3(%rsp), $b_ptr # restore $r_ptr ################################# last reduction xor @acc[8], @acc[8] # @acc[8]=0, cf=0, of=0 mulx 8*0+128($n_ptr), $lo, $hi adcx $lo, @acc[0] # guaranteed to be zero adox $hi, @acc[1] mulx 8*1+128($n_ptr), $lo, $hi adcx $lo, @acc[1] adox $hi, @acc[2] mulx 8*2+128($n_ptr), $lo, $hi adcx $lo, @acc[2] adox $hi, @acc[3] mulx 8*3+128($n_ptr), $lo, $hi adcx $lo, @acc[3] adox $hi, @acc[4] mulx 8*4+128($n_ptr), $lo, $hi adcx $lo, @acc[4] adox $hi, @acc[5] mulx 8*5+128($n_ptr), $lo, $hi mov @acc[1], %rdx adcx $lo, @acc[5] adox $hi, @acc[6] adc \$0, @acc[6] mov @acc[4], @acc[8] mov @acc[1], 8*0($b_ptr) mov @acc[2], 8*1($b_ptr) mov @acc[3], 8*2($b_ptr) mov @acc[5], $lo mov @acc[4], 8*3($b_ptr) mov @acc[5], 8*4($b_ptr) mov @acc[6], 8*5($b_ptr) mov @acc[6], $hi ret # __SGX_LVI_HARDENING_CLOBBER__=%rsi .cfi_endproc .size __mulx_mont_383_nonred,.-__mulx_mont_383_nonred ___ } } } { my $frame = 4*8 + # place for argument off-load + 2*384/8 + # place for 2 384-bit temporary vectors 8; # align my @acc = (@acc,"%rax","%rdx","%rbx","%rbp"); # omitting 3 reductions gives ~10% better performance in add-chains $code.=<<___; .globl sqrx_mont_382x .hidden sqrx_mont_382x .type 
sqrx_mont_382x,\@function,4,"unwind" .align 32 sqrx_mont_382x: .cfi_startproc sqr_mont_382x\$1: push %rbp .cfi_push %rbp push %rbx .cfi_push %rbx push %r12 .cfi_push %r12 push %r13 .cfi_push %r13 push %r14 .cfi_push %r14 push %r15 .cfi_push %r15 sub \$$frame, %rsp .cfi_adjust_cfa_offset $frame .cfi_end_prologue mov $n_ptr, 8*0(%rsp) # n0 mov $b_org, $n_ptr # n_ptr mov $r_ptr, 8*2(%rsp) mov $a_ptr, 8*3(%rsp) ################################# #ifdef __SGX_LVI_HARDENING__ lfence #endif mov 8*0($a_ptr), @acc[0] # a->re mov 8*1($a_ptr), @acc[1] mov 8*2($a_ptr), @acc[2] mov 8*3($a_ptr), @acc[3] mov 8*4($a_ptr), @acc[4] mov 8*5($a_ptr), @acc[5] mov @acc[0], @acc[6] add 8*6($a_ptr), @acc[0] # a->re + a->im mov @acc[1], @acc[7] adc 8*7($a_ptr), @acc[1] mov @acc[2], @acc[8] adc 8*8($a_ptr), @acc[2] mov @acc[3], @acc[9] adc 8*9($a_ptr), @acc[3] mov @acc[4], @acc[10] adc 8*10($a_ptr), @acc[4] mov @acc[5], @acc[11] adc 8*11($a_ptr), @acc[5] sub 8*6($a_ptr), @acc[6] # a->re - a->im sbb 8*7($a_ptr), @acc[7] sbb 8*8($a_ptr), @acc[8] sbb 8*9($a_ptr), @acc[9] sbb 8*10($a_ptr), @acc[10] sbb 8*11($a_ptr), @acc[11] sbb $r_ptr, $r_ptr # borrow flag as mask mov @acc[0], 32+8*0(%rsp) # t0 mov @acc[1], 32+8*1(%rsp) mov @acc[2], 32+8*2(%rsp) mov @acc[3], 32+8*3(%rsp) mov @acc[4], 32+8*4(%rsp) mov @acc[5], 32+8*5(%rsp) mov @acc[6], 32+8*6(%rsp) # t1 mov @acc[7], 32+8*7(%rsp) mov @acc[8], 32+8*8(%rsp) mov @acc[9], 32+8*9(%rsp) mov @acc[10], 32+8*10(%rsp) mov @acc[11], 32+8*11(%rsp) mov $r_ptr, 32+8*12(%rsp) ################################# mul_mont_384(ret->im, a->re, a->im, mod, n0); #mov 8*3(%rsp), $a_ptr # a->re lea 48($a_ptr), $b_ptr # a->im mov 48($a_ptr), %rdx mov 8*0($a_ptr), %r14 # @acc[6] mov 8*1($a_ptr), %r15 # @acc[7] mov 8*2($a_ptr), %rax # @acc[8] mov 8*3($a_ptr), %r12 # @acc[4] mov 8*4($a_ptr), %rdi # $lo mov 8*5($a_ptr), %rbp # $hi lea -128($a_ptr), $a_ptr # control u-op density lea -128($n_ptr), $n_ptr # control u-op density mulx %r14, %r8, %r9 call __mulx_mont_383_nonred ___ { my @acc = map("%r$_","dx",15,"ax",12,"di","bp", # output from __mulx_mont_384 8..11,13,14); $code.=<<___; add @acc[0], @acc[0] # add with itself adc @acc[1], @acc[1] adc @acc[2], @acc[2] adc @acc[3], @acc[3] adc @acc[4], @acc[4] adc @acc[5], @acc[5] mov @acc[0], 8*6($b_ptr) # ret->im mov @acc[1], 8*7($b_ptr) mov @acc[2], 8*8($b_ptr) mov @acc[3], 8*9($b_ptr) mov @acc[4], 8*10($b_ptr) mov @acc[5], 8*11($b_ptr) ___ } $code.=<<___; ################################# mul_mont_384(ret->re, t0, t1, mod, n0); lea 32-128(%rsp), $a_ptr # t0 [+u-op density] lea 32+8*6(%rsp), $b_ptr # t1 mov 32+8*6(%rsp), %rdx # t1[0] mov 32+8*0(%rsp), %r14 # @acc[6] mov 32+8*1(%rsp), %r15 # @acc[7] mov 32+8*2(%rsp), %rax # @acc[8] mov 32+8*3(%rsp), %r12 # @acc[4] mov 32+8*4(%rsp), %rdi # $lo mov 32+8*5(%rsp), %rbp # $hi #lea -128($a_ptr), $a_ptr # control u-op density #lea -128($n_ptr), $n_ptr # control u-op density mulx %r14, %r8, %r9 call __mulx_mont_383_nonred ___ { my @acc = map("%r$_","dx",15,"ax",12,"di","bp", # output from __mulx_mont_384 8..11,13,14); $code.=<<___; mov 32+8*12(%rsp), @acc[11] # account for sign from a->re - a->im lea 128($n_ptr), $n_ptr mov 32+8*0(%rsp), @acc[6] and @acc[11], @acc[6] mov 32+8*1(%rsp), @acc[7] and @acc[11], @acc[7] mov 32+8*2(%rsp), @acc[8] and @acc[11], @acc[8] mov 32+8*3(%rsp), @acc[9] and @acc[11], @acc[9] mov 32+8*4(%rsp), @acc[10] and @acc[11], @acc[10] and 32+8*5(%rsp), @acc[11] sub @acc[6], @acc[0] mov 8*0($n_ptr), @acc[6] sbb @acc[7], @acc[1] mov 8*1($n_ptr), @acc[7] sbb @acc[8], @acc[2] mov 8*2($n_ptr), 
@acc[8] sbb @acc[9], @acc[3] mov 8*3($n_ptr), @acc[9] sbb @acc[10], @acc[4] mov 8*4($n_ptr), @acc[10] sbb @acc[11], @acc[5] sbb @acc[11], @acc[11] and @acc[11], @acc[6] and @acc[11], @acc[7] and @acc[11], @acc[8] and @acc[11], @acc[9] and @acc[11], @acc[10] and 8*5($n_ptr), @acc[11] add @acc[6], @acc[0] adc @acc[7], @acc[1] adc @acc[8], @acc[2] adc @acc[9], @acc[3] adc @acc[10], @acc[4] adc @acc[11], @acc[5] mov @acc[0], 8*0($b_ptr) # ret->re mov @acc[1], 8*1($b_ptr) mov @acc[2], 8*2($b_ptr) mov @acc[3], 8*3($b_ptr) mov @acc[4], 8*4($b_ptr) mov @acc[5], 8*5($b_ptr) ___ } $code.=<<___; lea $frame(%rsp), %r8 # size optimization mov 8*0(%r8),%r15 .cfi_restore %r15 mov 8*1(%r8),%r14 .cfi_restore %r14 mov 8*2(%r8),%r13 .cfi_restore %r13 mov 8*3(%r8),%r12 .cfi_restore %r12 mov 8*4(%r8),%rbx .cfi_restore %rbx mov 8*5(%r8),%rbp .cfi_restore %rbp lea 8*6(%r8),%rsp .cfi_adjust_cfa_offset -$frame-8*6 .cfi_epilogue ret .cfi_endproc .size sqrx_mont_382x,.-sqrx_mont_382x ___ } print $code; close STDOUT; ================================================ FILE: src/asm/sha256-armv8.pl ================================================ #!/usr/bin/env perl # # Copyright Supranational LLC # Licensed under the Apache License, Version 2.0, see LICENSE for details. # SPDX-License-Identifier: Apache-2.0 # # ==================================================================== # Written by Andy Polyakov, @dot-asm, initially for the OpenSSL # project. # ==================================================================== # # sha256_block procedure for ARMv8. # # This module is stripped of scalar code paths, with rationale that all # known processors are NEON-capable. # # See original module at CRYPTOGAMS for further details. $flavour = shift; $output = shift; if ($flavour && $flavour ne "void") { $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or die "can't locate arm-xlate.pl"; open STDOUT,"| \"$^X\" $xlate $flavour $output"; } else { open STDOUT,">$output"; } $BITS=256; $SZ=4; @Sigma0=( 2,13,22); @Sigma1=( 6,11,25); @sigma0=( 7,18, 3); @sigma1=(17,19,10); $rounds=64; $reg_t="w"; $pre="blst_"; ($ctx,$inp,$num,$Ktbl)=map("x$_",(0..2,30)); $code.=<<___; .comm __blst_platform_cap,4 .text .align 6 .type .LK$BITS,%object .LK$BITS: .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 .long 0 //terminator .size .LK$BITS,.-.LK$BITS .asciz "SHA$BITS block transform for ARMv8, CRYPTOGAMS by \@dot-asm" .align 2 ___ if ($SZ==4) { my $Ktbl="x3"; my ($ABCD,$EFGH,$abcd)=map("v$_.16b",(0..2)); my @MSG=map("v$_.16b",(4..7)); my ($W0,$W1)=("v16.4s","v17.4s"); my ($ABCD_SAVE,$EFGH_SAVE)=("v18.16b","v19.16b"); $code.=<<___; .globl ${pre}sha256_block_armv8 .hidden ${pre}sha256_block_armv8 .type 
${pre}sha256_block_armv8,%function .align 6 ${pre}sha256_block_armv8: hint #34 .Lv8_entry: stp c29,c30,[csp,#-2*__SIZEOF_POINTER__]! add c29,csp,#0 ld1.32 {$ABCD,$EFGH},[$ctx] adr $Ktbl,.LK256 .Loop_hw: ld1 {@MSG[0]-@MSG[3]},[$inp],#64 sub $num,$num,#1 ld1.32 {$W0},[$Ktbl],#16 rev32 @MSG[0],@MSG[0] rev32 @MSG[1],@MSG[1] rev32 @MSG[2],@MSG[2] rev32 @MSG[3],@MSG[3] orr $ABCD_SAVE,$ABCD,$ABCD // offload orr $EFGH_SAVE,$EFGH,$EFGH ___ for($i=0;$i<12;$i++) { $code.=<<___; ld1.32 {$W1},[$Ktbl],#16 add.i32 $W0,$W0,@MSG[0] sha256su0 @MSG[0],@MSG[1] orr $abcd,$ABCD,$ABCD sha256h $ABCD,$EFGH,$W0 sha256h2 $EFGH,$abcd,$W0 sha256su1 @MSG[0],@MSG[2],@MSG[3] ___ ($W0,$W1)=($W1,$W0); push(@MSG,shift(@MSG)); } $code.=<<___; ld1.32 {$W1},[$Ktbl],#16 add.i32 $W0,$W0,@MSG[0] orr $abcd,$ABCD,$ABCD sha256h $ABCD,$EFGH,$W0 sha256h2 $EFGH,$abcd,$W0 ld1.32 {$W0},[$Ktbl],#16 add.i32 $W1,$W1,@MSG[1] orr $abcd,$ABCD,$ABCD sha256h $ABCD,$EFGH,$W1 sha256h2 $EFGH,$abcd,$W1 ld1.32 {$W1},[$Ktbl] add.i32 $W0,$W0,@MSG[2] sub $Ktbl,$Ktbl,#$rounds*$SZ-16 // rewind orr $abcd,$ABCD,$ABCD sha256h $ABCD,$EFGH,$W0 sha256h2 $EFGH,$abcd,$W0 add.i32 $W1,$W1,@MSG[3] orr $abcd,$ABCD,$ABCD sha256h $ABCD,$EFGH,$W1 sha256h2 $EFGH,$abcd,$W1 add.i32 $ABCD,$ABCD,$ABCD_SAVE add.i32 $EFGH,$EFGH,$EFGH_SAVE cbnz $num,.Loop_hw st1.32 {$ABCD,$EFGH},[$ctx] ldr c29,[csp],#2*__SIZEOF_POINTER__ ret .size ${pre}sha256_block_armv8,.-${pre}sha256_block_armv8 ___ } if ($SZ==4) { ######################################### NEON stuff # # You'll surely note a lot of similarities with sha256-armv4 module, # and of course it's not a coincidence. sha256-armv4 was used as # initial template, but was adapted for ARMv8 instruction set and # extensively re-tuned for all-round performance. my @V = ($A,$B,$C,$D,$E,$F,$G,$H) = map("w$_",(3..10)); my ($t0,$t1,$t2,$t3,$t4) = map("w$_",(11..15)); my $Ktbl="x16"; my $Xfer="x17"; my @X = map("q$_",(0..3)); my ($T0,$T1,$T2,$T3,$T4,$T5,$T6,$T7) = map("q$_",(4..7,16..19)); my $j=0; sub AUTOLOAD() # thunk [simplified] x86-style perlasm { my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./; my $arg = pop; $arg = "#$arg" if ($arg*1 eq $arg); $code .= "\t$opcode\t".join(',',@_,$arg)."\n"; } sub Dscalar { shift =~ m|[qv]([0-9]+)|?"d$1":""; } sub Dlo { shift =~ m|[qv]([0-9]+)|?"v$1.d[0]":""; } sub Dhi { shift =~ m|[qv]([0-9]+)|?"v$1.d[1]":""; } sub Xupdate() { use integer; my $body = shift; my @insns = (&$body,&$body,&$body,&$body); my ($a,$b,$c,$d,$e,$f,$g,$h); &ext_8 ($T0,@X[0],@X[1],4); # X[1..4] eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &ext_8 ($T3,@X[2],@X[3],4); # X[9..12] eval(shift(@insns)); eval(shift(@insns)); &mov (&Dscalar($T7),&Dhi(@X[3])); # X[14..15] eval(shift(@insns)); eval(shift(@insns)); &ushr_32 ($T2,$T0,$sigma0[0]); eval(shift(@insns)); &ushr_32 ($T1,$T0,$sigma0[2]); eval(shift(@insns)); &add_32 (@X[0],@X[0],$T3); # X[0..3] += X[9..12] eval(shift(@insns)); &sli_32 ($T2,$T0,32-$sigma0[0]); eval(shift(@insns)); eval(shift(@insns)); &ushr_32 ($T3,$T0,$sigma0[1]); eval(shift(@insns)); eval(shift(@insns)); &eor_8 ($T1,$T1,$T2); eval(shift(@insns)); eval(shift(@insns)); &sli_32 ($T3,$T0,32-$sigma0[1]); eval(shift(@insns)); eval(shift(@insns)); &ushr_32 ($T4,$T7,$sigma1[0]); eval(shift(@insns)); eval(shift(@insns)); &eor_8 ($T1,$T1,$T3); # sigma0(X[1..4]) eval(shift(@insns)); eval(shift(@insns)); &sli_32 ($T4,$T7,32-$sigma1[0]); eval(shift(@insns)); eval(shift(@insns)); &ushr_32 ($T5,$T7,$sigma1[2]); eval(shift(@insns)); eval(shift(@insns)); &ushr_32 ($T3,$T7,$sigma1[1]); 
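	# NEON has no 32-bit rotate, so each ror() in the small sigmas is
	# synthesized as an ushr/sli pair: ushr supplies x>>n and sli
	# inserts x<<(32-n), per @sigma0=(7,18,3) and @sigma1=(17,19,10)
	# defined above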
eval(shift(@insns)); eval(shift(@insns)); &add_32 (@X[0],@X[0],$T1); # X[0..3] += sigma0(X[1..4]) eval(shift(@insns)); eval(shift(@insns)); &sli_u32 ($T3,$T7,32-$sigma1[1]); eval(shift(@insns)); eval(shift(@insns)); &eor_8 ($T5,$T5,$T4); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &eor_8 ($T5,$T5,$T3); # sigma1(X[14..15]) eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &add_32 (@X[0],@X[0],$T5); # X[0..1] += sigma1(X[14..15]) eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &ushr_32 ($T6,@X[0],$sigma1[0]); eval(shift(@insns)); &ushr_32 ($T7,@X[0],$sigma1[2]); eval(shift(@insns)); eval(shift(@insns)); &sli_32 ($T6,@X[0],32-$sigma1[0]); eval(shift(@insns)); &ushr_32 ($T5,@X[0],$sigma1[1]); eval(shift(@insns)); eval(shift(@insns)); &eor_8 ($T7,$T7,$T6); eval(shift(@insns)); eval(shift(@insns)); &sli_32 ($T5,@X[0],32-$sigma1[1]); eval(shift(@insns)); eval(shift(@insns)); &ld1_32 ("{$T0}","[$Ktbl], #16"); eval(shift(@insns)); &eor_8 ($T7,$T7,$T5); # sigma1(X[16..17]) eval(shift(@insns)); eval(shift(@insns)); &eor_8 ($T5,$T5,$T5); eval(shift(@insns)); eval(shift(@insns)); &mov (&Dhi($T5), &Dlo($T7)); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &add_32 (@X[0],@X[0],$T5); # X[2..3] += sigma1(X[16..17]) eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &add_32 ($T0,$T0,@X[0]); while($#insns>=1) { eval(shift(@insns)); } &st1_32 ("{$T0}","[$Xfer], #16"); eval(shift(@insns)); push(@X,shift(@X)); # "rotate" X[] } sub Xpreload() { use integer; my $body = shift; my @insns = (&$body,&$body,&$body,&$body); my ($a,$b,$c,$d,$e,$f,$g,$h); eval(shift(@insns)); eval(shift(@insns)); &ld1_8 ("{@X[0]}","[$inp],#16"); eval(shift(@insns)); eval(shift(@insns)); &ld1_32 ("{$T0}","[$Ktbl],#16"); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &rev32 (@X[0],@X[0]); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &add_32 ($T0,$T0,@X[0]); foreach (@insns) { eval; } # remaining instructions &st1_32 ("{$T0}","[$Xfer], #16"); push(@X,shift(@X)); # "rotate" X[] } sub body_00_15 () { ( '($a,$b,$c,$d,$e,$f,$g,$h)=@V;'. '&add ($h,$h,$t1)', # h+=X[i]+K[i] '&add ($a,$a,$t4);'. # h+=Sigma0(a) from the past '&and ($t1,$f,$e)', '&bic ($t4,$g,$e)', '&eor ($t0,$e,$e,"ror#".($Sigma1[1]-$Sigma1[0]))', '&add ($a,$a,$t2)', # h+=Maj(a,b,c) from the past '&orr ($t1,$t1,$t4)', # Ch(e,f,g) '&eor ($t0,$t0,$e,"ror#".($Sigma1[2]-$Sigma1[0]))', # Sigma1(e) '&eor ($t4,$a,$a,"ror#".($Sigma0[1]-$Sigma0[0]))', '&add ($h,$h,$t1)', # h+=Ch(e,f,g) '&ror ($t0,$t0,"#$Sigma1[0]")', '&eor ($t2,$a,$b)', # a^b, b^c in next round '&eor ($t4,$t4,$a,"ror#".($Sigma0[2]-$Sigma0[0]))', # Sigma0(a) '&add ($h,$h,$t0)', # h+=Sigma1(e) '&ldr ($t1,sprintf "[sp,#%d]",4*(($j+1)&15)) if (($j&15)!=15);'. '&ldr ($t1,"[$Ktbl]") if ($j==15);'. '&and ($t3,$t3,$t2)', # (b^c)&=(a^b) '&ror ($t4,$t4,"#$Sigma0[0]")', '&add ($d,$d,$h)', # d+=h '&eor ($t3,$t3,$b)', # Maj(a,b,c) '$j++; unshift(@V,pop(@V)); ($t2,$t3)=($t3,$t2);' ) } $code.=<<___; .globl ${pre}sha256_block_data_order .hidden ${pre}sha256_block_data_order .type ${pre}sha256_block_data_order,%function .align 4 ${pre}sha256_block_data_order: hint #34 adrp c16,__blst_platform_cap ldr w16,[c16,#:lo12:__blst_platform_cap] tst w16,#1 b.ne .Lv8_entry stp c29, c30, [csp, #-2*__SIZEOF_POINTER__]! 
mov c29, csp sub csp,csp,#16*4 adr $Ktbl,.LK256 add $num,$inp,$num,lsl#6 // len to point at the end of inp ld1.8 {@X[0]},[$inp], #16 ld1.8 {@X[1]},[$inp], #16 ld1.8 {@X[2]},[$inp], #16 ld1.8 {@X[3]},[$inp], #16 ld1.32 {$T0},[$Ktbl], #16 ld1.32 {$T1},[$Ktbl], #16 ld1.32 {$T2},[$Ktbl], #16 ld1.32 {$T3},[$Ktbl], #16 rev32 @X[0],@X[0] // yes, even on rev32 @X[1],@X[1] // big-endian rev32 @X[2],@X[2] rev32 @X[3],@X[3] cmov $Xfer,sp add.32 $T0,$T0,@X[0] add.32 $T1,$T1,@X[1] add.32 $T2,$T2,@X[2] st1.32 {$T0-$T1},[$Xfer], #32 add.32 $T3,$T3,@X[3] st1.32 {$T2-$T3},[$Xfer] csub $Xfer,$Xfer,#32 ldp $A,$B,[$ctx] ldp $C,$D,[$ctx,#8] ldp $E,$F,[$ctx,#16] ldp $G,$H,[$ctx,#24] ldr $t1,[sp,#0] mov $t2,wzr eor $t3,$B,$C mov $t4,wzr b .L_00_48 .align 4 .L_00_48: ___ &Xupdate(\&body_00_15); &Xupdate(\&body_00_15); &Xupdate(\&body_00_15); &Xupdate(\&body_00_15); $code.=<<___; cmp $t1,#0 // check for K256 terminator ldr $t1,[sp,#0] csub $Xfer,$Xfer,#64 bne .L_00_48 csub $Ktbl,$Ktbl,#256 // rewind $Ktbl cmp $inp,$num mov $Xfer, #-64 csel $Xfer, $Xfer, xzr, eq cadd $inp,$inp,$Xfer // avoid SEGV cmov $Xfer,sp ___ &Xpreload(\&body_00_15); &Xpreload(\&body_00_15); &Xpreload(\&body_00_15); &Xpreload(\&body_00_15); $code.=<<___; add $A,$A,$t4 // h+=Sigma0(a) from the past ldp $t0,$t1,[$ctx,#0] add $A,$A,$t2 // h+=Maj(a,b,c) from the past ldp $t2,$t3,[$ctx,#8] add $A,$A,$t0 // accumulate add $B,$B,$t1 ldp $t0,$t1,[$ctx,#16] add $C,$C,$t2 add $D,$D,$t3 ldp $t2,$t3,[$ctx,#24] add $E,$E,$t0 add $F,$F,$t1 ldr $t1,[sp,#0] stp $A,$B,[$ctx,#0] add $G,$G,$t2 mov $t2,wzr stp $C,$D,[$ctx,#8] add $H,$H,$t3 stp $E,$F,[$ctx,#16] eor $t3,$B,$C stp $G,$H,[$ctx,#24] mov $t4,wzr cmov $Xfer,sp b.ne .L_00_48 ldr c29,[c29] add csp,csp,#16*4+2*__SIZEOF_POINTER__ ret .size ${pre}sha256_block_data_order,.-${pre}sha256_block_data_order ___ } { my ($out,$inp,$len) = map("x$_",(0..2)); $code.=<<___; .globl ${pre}sha256_emit .hidden ${pre}sha256_emit .type ${pre}sha256_emit,%function .align 4 ${pre}sha256_emit: hint #34 ldp x4,x5,[$inp] ldp x6,x7,[$inp,#16] #ifndef __AARCH64EB__ rev x4,x4 rev x5,x5 rev x6,x6 rev x7,x7 #endif str w4,[$out,#4] lsr x4,x4,#32 str w5,[$out,#12] lsr x5,x5,#32 str w6,[$out,#20] lsr x6,x6,#32 str w7,[$out,#28] lsr x7,x7,#32 str w4,[$out,#0] str w5,[$out,#8] str w6,[$out,#16] str w7,[$out,#24] ret .size ${pre}sha256_emit,.-${pre}sha256_emit .globl ${pre}sha256_bcopy .hidden ${pre}sha256_bcopy .type ${pre}sha256_bcopy,%function .align 4 ${pre}sha256_bcopy: hint #34 .Loop_bcopy: ldrb w3,[$inp],#1 sub $len,$len,#1 strb w3,[$out],#1 cbnz $len,.Loop_bcopy ret .size ${pre}sha256_bcopy,.-${pre}sha256_bcopy .globl ${pre}sha256_hcopy .hidden ${pre}sha256_hcopy .type ${pre}sha256_hcopy,%function .align 4 ${pre}sha256_hcopy: hint #34 ldp x4,x5,[$inp] ldp x6,x7,[$inp,#16] stp x4,x5,[$out] stp x6,x7,[$out,#16] ret .size ${pre}sha256_hcopy,.-${pre}sha256_hcopy ___ } { my %opcode = ( "sha256h" => 0x5e004000, "sha256h2" => 0x5e005000, "sha256su0" => 0x5e282800, "sha256su1" => 0x5e006000 ); sub unsha256 { my ($mnemonic,$arg)=@_; $arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)[^,]*(?:,\s*[qv]([0-9]+))?/o && sprintf ".inst\t0x%08x\t//%s %s", $opcode{$mnemonic}|$1|($2<<5)|($3<<16), $mnemonic,$arg; } } open SELF,$0; while (defined($_ = readline(SELF))) { next if (/^#!/); last if (!s/^#/\/\// and !/^$/); print; } close SELF; foreach(split("\n",$code)) { s/\`([^\`]*)\`/eval($1)/ge; s/\b(sha512\w+)\s+([qv].*)/unsha512($1,$2)/ge or s/\b(sha256\w+)\s+([qv].*)/unsha256($1,$2)/ge; s/\bq([0-9]+)\b/v$1.16b/g; # old->new registers s/\.[ui]?8(\s)/$1/; s/\.\w?64\b// and s/\.16b/\.2d/g
or s/\.\w?32\b// and s/\.16b/\.4s/g; m/\bext\b/ and s/\.2d/\.16b/g or m/(ld|st)1[^\[]+\[0\]/ and s/\.4s/\.s/g; print $_,"\n"; } close STDOUT; ================================================ FILE: src/asm/sha256-portable-x86_64.pl ================================================ #!/usr/bin/env perl # # Copyright Supranational LLC # Licensed under the Apache License, Version 2.0, see LICENSE for details. # SPDX-License-Identifier: Apache-2.0 # # ==================================================================== # Written by Andy Polyakov, @dot-asm, initially for the OpenSSL # project. # ==================================================================== # # sha256_block procedure for x86_64. # # Scalar-only version with minor twist minimizing 'lea' instructions. $flavour = shift; $output = pop; if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or die "can't locate x86_64-xlate.pl"; open STDOUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"" or die "can't call $xlate: $!"; $pre="blst_"; $func="${pre}sha256_block_data_order"; $TABLE="K256"; $SZ=4; @ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%eax","%ebx","%ecx","%edx", "%r8d","%r9d","%r10d","%r11d"); ($T1,$a0,$a1,$a2,$a3)=("%r12d","%r13d","%r14d","%r15d","%edi"); @Sigma0=( 2,13,22); @Sigma1=( 6,11,25); @sigma0=( 7,18, 3); @sigma1=(17,19,10); $rounds=64; $ctx="%rdi"; # 1st arg, zapped by $a3 $inp="%rsi"; # 2nd arg $Tbl="%rbp"; $_ctx="16*$SZ+0*8(%rsp)"; $_inp="16*$SZ+1*8(%rsp)"; $_end="16*$SZ+2*8(%rsp)"; $framesz="16*$SZ+3*8"; sub ROUND_00_15() { my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_; my $STRIDE=$SZ; # $STRIDE += 16 if ($i%(16/$SZ)==(16/$SZ-1)); $code.=<<___; ror \$`$Sigma1[2]-$Sigma1[1]`,$a0 mov $f,$a2 xor $e,$a0 ror \$`$Sigma0[2]-$Sigma0[1]`,$a1 xor $g,$a2 # f^g mov $T1,`$SZ*($i&0xf)`(%rsp) xor $a,$a1 and $e,$a2 # (f^g)&e ror \$`$Sigma1[1]-$Sigma1[0]`,$a0 add $h,$T1 # T1+=h xor $g,$a2 # Ch(e,f,g)=((f^g)&e)^g ror \$`$Sigma0[1]-$Sigma0[0]`,$a1 xor $e,$a0 add $a2,$T1 # T1+=Ch(e,f,g) mov $a,$a2 add `$SZ*$i`($Tbl),$T1 # T1+=K[round] xor $a,$a1 xor $b,$a2 # a^b, b^c in next round ror \$$Sigma1[0],$a0 # Sigma1(e) mov $b,$h and $a2,$a3 ror \$$Sigma0[0],$a1 # Sigma0(a) add $a0,$T1 # T1+=Sigma1(e) xor $a3,$h # h=Maj(a,b,c)=Ch(a^b,c,b) add $T1,$d # d+=T1 add $T1,$h # h+=T1 ___ $code.=<<___ if ($i==31); lea `16*$SZ`($Tbl),$Tbl # round+=16 ___ $code.=<<___ if ($i<15); add $a1,$h # h+=Sigma0(a) ___ ($a2,$a3) = ($a3,$a2); } sub ROUND_16_XX() { my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_; $code.=<<___; mov `$SZ*(($i+1)&0xf)`(%rsp),$a0 mov `$SZ*(($i+14)&0xf)`(%rsp),$a2 mov $a0,$T1 ror \$`$sigma0[1]-$sigma0[0]`,$a0 add $a1,$a # modulo-scheduled h+=Sigma0(a) mov $a2,$a1 ror \$`$sigma1[1]-$sigma1[0]`,$a2 xor $T1,$a0 shr \$$sigma0[2],$T1 ror \$$sigma0[0],$a0 xor $a1,$a2 shr \$$sigma1[2],$a1 ror \$$sigma1[0],$a2 xor $a0,$T1 # sigma0(X[(i+1)&0xf]) xor $a1,$a2 # sigma1(X[(i+14)&0xf]) add `$SZ*(($i+9)&0xf)`(%rsp),$T1 add `$SZ*($i&0xf)`(%rsp),$T1 mov $e,$a0 add $a2,$T1 mov $a,$a1 ___ &ROUND_00_15(@_); } $code=<<___; .comm __blst_platform_cap,4 .text .globl $func .type $func,\@function,3,"unwind" .align 16 $func: .cfi_startproc push %rbp .cfi_push %rbp mov %rsp,%rbp .cfi_def_cfa_register %rbp #ifdef __BLST_PORTABLE__ testl \$2,__blst_platform_cap(%rip) jnz .L${func}\$2 #endif push %rbx .cfi_push %rbx push %r12 .cfi_push %r12 push %r13 .cfi_push %r13 
push %r14 .cfi_push %r14 push %r15 .cfi_push %r15 shl \$4,%rdx # num*16 sub \$$framesz,%rsp .cfi_alloca $framesz .cfi_def_cfa %rsp .cfi_end_prologue lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ mov $ctx,$_ctx # save ctx, 1st arg mov $inp,$_inp # save inp, 2nd arg mov %rdx,$_end # save end pointer, "3rd" arg mov $SZ*0($ctx),$A mov $SZ*1($ctx),$B mov $SZ*2($ctx),$C mov $SZ*3($ctx),$D mov $SZ*4($ctx),$E mov $SZ*5($ctx),$F mov $SZ*6($ctx),$G mov $SZ*7($ctx),$H jmp .Lloop .align 16 .Lloop: mov $B,$a3 lea $TABLE(%rip),$Tbl xor $C,$a3 # magic ___ for($i=0;$i<16;$i++) { $code.=" mov $SZ*$i($inp),$T1\n"; $code.=" mov @ROT[4],$a0\n"; $code.=" mov @ROT[0],$a1\n"; $code.=" bswap $T1\n"; &ROUND_00_15($i,@ROT); unshift(@ROT,pop(@ROT)); } $code.=<<___; jmp .Lrounds_16_xx .align 16 .Lrounds_16_xx: ___ for(;$i<32;$i++) { &ROUND_16_XX($i,@ROT); unshift(@ROT,pop(@ROT)); } $code.=<<___; cmpb \$0x19,`$SZ-1`($Tbl) jnz .Lrounds_16_xx mov $_ctx,$ctx add $a1,$A # modulo-scheduled h+=Sigma0(a) lea 16*$SZ($inp),$inp add $SZ*0($ctx),$A add $SZ*1($ctx),$B add $SZ*2($ctx),$C add $SZ*3($ctx),$D add $SZ*4($ctx),$E add $SZ*5($ctx),$F add $SZ*6($ctx),$G add $SZ*7($ctx),$H cmp $_end,$inp mov $A,$SZ*0($ctx) mov $B,$SZ*1($ctx) mov $C,$SZ*2($ctx) mov $D,$SZ*3($ctx) mov $E,$SZ*4($ctx) mov $F,$SZ*5($ctx) mov $G,$SZ*6($ctx) mov $H,$SZ*7($ctx) jb .Lloop lea $framesz+6*8(%rsp),%r11 .cfi_def_cfa %r11,8 mov $framesz(%rsp),%r15 mov -40(%r11),%r14 mov -32(%r11),%r13 mov -24(%r11),%r12 mov -16(%r11),%rbx mov -8(%r11),%rbp .cfi_epilogue lea (%r11),%rsp ret .cfi_endproc .size $func,.-$func #ifndef __BLST_PORTABLE__ .section .rodata .align 64 .type $TABLE,\@object $TABLE: .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 .asciz "SHA256 block transform for x86_64, CRYPTOGAMS by \@dot-asm" ___ { my ($out,$inp,$len) = $win64 ?
("%rcx","%rdx","%r8") : # Win64 order ("%rdi","%rsi","%rdx"); # Unix order $code.=<<___; .globl ${pre}sha256_emit .hidden ${pre}sha256_emit .type ${pre}sha256_emit,\@abi-omnipotent .align 16 ${pre}sha256_emit: mov 0($inp), %r8 mov 8($inp), %r9 mov 16($inp), %r10 bswap %r8 mov 24($inp), %r11 bswap %r9 mov %r8d, 4($out) bswap %r10 mov %r9d, 12($out) bswap %r11 mov %r10d, 20($out) shr \$32, %r8 mov %r11d, 28($out) shr \$32, %r9 mov %r8d, 0($out) shr \$32, %r10 mov %r9d, 8($out) shr \$32, %r11 mov %r10d, 16($out) mov %r11d, 24($out) ret .size ${pre}sha256_emit,.-${pre}sha256_emit .globl ${pre}sha256_bcopy .hidden ${pre}sha256_bcopy .type ${pre}sha256_bcopy,\@abi-omnipotent .align 16 ${pre}sha256_bcopy: sub $inp, $out .Loop_bcopy: movzb ($inp), %eax lea 1($inp), $inp mov %al, -1($out,$inp) dec $len jnz .Loop_bcopy ret .size ${pre}sha256_bcopy,.-${pre}sha256_bcopy .globl ${pre}sha256_hcopy .hidden ${pre}sha256_hcopy .type ${pre}sha256_hcopy,\@abi-omnipotent .align 16 ${pre}sha256_hcopy: mov 0($inp), %r8 mov 8($inp), %r9 mov 16($inp), %r10 mov 24($inp), %r11 mov %r8, 0($out) mov %r9, 8($out) mov %r10, 16($out) mov %r11, 24($out) ret .size ${pre}sha256_hcopy,.-${pre}sha256_hcopy #endif ___ } foreach (split("\n",$code)) { s/\`([^\`]*)\`/eval $1/geo; print $_,"\n"; } close STDOUT; ================================================ FILE: src/asm/sha256-x86_64.pl ================================================ #!/usr/bin/env perl # # Copyright Supranational LLC # Licensed under the Apache License, Version 2.0, see LICENSE for details. # SPDX-License-Identifier: Apache-2.0 # # ==================================================================== # Written by Andy Polyakov, @dot-asm, initially for the OpenSSL # project. # ==================================================================== # # sha256_block procedure for x86_64. # # This module is stripped of AVX and even scalar code paths, with # rationale that # # a) AVX1 is [justifiably] faster than SSSE3 code path only on *one* # processor, venerable Sandy Bridge; # b) AVX2 incurs costly power transitions, which would be justifiable # if AVX2 code was executing most of the time, which is not the # case in the context; # c) all contemporary processors support SSSE3, so that nobody would # actually use scalar code path anyway; # # See original module at CRYPTOGAMS for further details. 
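#
# The dispatch at the top of blst_sha256_block_data_order below tests
# the __blst_platform_cap word and, unless compiled with
# __SGX_LVI_HARDENING__, branches to the SHA-extension body when bit 1
# is set. A minimal C sketch of that dispatch idea; the two leaf-routine
# names are hypothetical stand-ins for the generated code paths:
#
#	#include <stddef.h>
#	#include <stdint.h>
#
#	extern uint32_t __blst_platform_cap;	/* probed at startup */
#
#	void sha256_blocks_shaext(uint32_t h[8], const void *inp, size_t num);
#	void sha256_blocks_ssse3(uint32_t h[8], const void *inp, size_t num);
#
#	static void sha256_blocks(uint32_t h[8], const void *inp, size_t num)
#	{
#	    if (__blst_platform_cap & 2)	/* bit 1: SHA extensions */
#	        sha256_blocks_shaext(h, inp, num);
#	    else
#	        sha256_blocks_ssse3(h, inp, num);
#	}
#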
$flavour = shift; $output = pop; if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or die "can't locate x86_64-xlate.pl"; open STDOUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"" or die "can't call $xlate: $!"; $pre="blst_"; $func="${pre}sha256_block_data_order"; $TABLE="K256"; $SZ=4; @ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%eax","%ebx","%ecx","%edx", "%r8d","%r9d","%r10d","%r11d"); ($T1,$a0,$a1,$a2,$a3)=("%r12d","%r13d","%r14d","%r15d","%edi"); @Sigma0=( 2,13,22); @Sigma1=( 6,11,25); @sigma0=( 7,18, 3); @sigma1=(17,19,10); $rounds=64; $ctx="%rdi"; # 1st arg, zapped by $a3 $inp="%rsi"; # 2nd arg $Tbl="%rbp"; $_ctx="16*$SZ+0*8(%rsp)"; $_inp="16*$SZ+1*8(%rsp)"; $_end="16*$SZ+2*8(%rsp)"; $framesz="16*$SZ+3*8"; $code=<<___; .comm __blst_platform_cap,4 .section .rodata .align 64 .type $TABLE,\@object $TABLE: .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f .long 0x03020100,0x0b0a0908,0xffffffff,0xffffffff .long 0xffffffff,0xffffffff,0x03020100,0x0b0a0908 .asciz "SHA256 block transform for x86_64, CRYPTOGAMS by \@dot-asm" .text ___ ###################################################################### # SIMD code paths # {{{ ###################################################################### # Intel SHA Extensions implementation of SHA256 update function. 
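#
# Each sha256rnds2 retires two rounds, updating one half of the state
# (ABEF or CDGH) with two pre-added W+K words taken from the low 64 bits
# of the implicit %xmm0 operand ($Wi below). A minimal intrinsics sketch
# of one 4-round step; rounds4 is a hypothetical helper and wk is
# assumed to hold W[i..i+3]+K[i..i+3]:
#
#	#include <immintrin.h>	/* compile with -msha */
#
#	static inline void rounds4(__m128i *abef, __m128i *cdgh, __m128i wk)
#	{
#	    /* rounds i, i+1 consume the low qword of wk */
#	    *cdgh = _mm_sha256rnds2_epu32(*cdgh, *abef, wk);
#	    /* move W+K for rounds i+2, i+3 into the low qword */
#	    wk = _mm_shuffle_epi32(wk, 0x0e);
#	    *abef = _mm_sha256rnds2_epu32(*abef, *cdgh, wk);
#	}
#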
# my ($ctx,$inp,$num,$Tbl)=("%rdi","%rsi","%rdx","%rcx"); my ($Wi,$ABEF,$CDGH,$TMP,$BSWAP,$ABEF_SAVE,$CDGH_SAVE)=map("%xmm$_",(0..2,7..10)); my @MSG=map("%xmm$_",(3..6)); $code.=<<___; .globl ${pre}sha256_block_data_order_shaext .hidden ${pre}sha256_block_data_order_shaext .type ${pre}sha256_block_data_order_shaext,\@function,3,"unwind" .align 64 ${pre}sha256_block_data_order_shaext: .cfi_startproc push %rbp .cfi_push %rbp mov %rsp,%rbp .cfi_def_cfa_register %rbp .L${func}\$2: ___ $code.=<<___ if ($win64); sub \$0x50,%rsp .cfi_alloca 0x50 movaps %xmm6,-0x50(%rbp) movaps %xmm7,-0x40(%rbp) movaps %xmm8,-0x30(%rbp) movaps %xmm9,-0x20(%rbp) movaps %xmm10,-0x10(%rbp) .cfi_offset %xmm6-%xmm10,-0x60 ___ $code.=<<___; .cfi_end_prologue #ifdef __SGX_LVI_HARDENING__ lfence #endif lea K256+0x80(%rip),$Tbl movdqu ($ctx),$ABEF # DCBA movdqu 16($ctx),$CDGH # HGFE movdqa 0x100-0x80($Tbl),$TMP # byte swap mask pshufd \$0x1b,$ABEF,$Wi # ABCD pshufd \$0xb1,$ABEF,$ABEF # CDAB pshufd \$0x1b,$CDGH,$CDGH # EFGH movdqa $TMP,$BSWAP # offload palignr \$8,$CDGH,$ABEF # ABEF punpcklqdq $Wi,$CDGH # CDGH jmp .Loop_shaext .align 16 .Loop_shaext: movdqu ($inp),@MSG[0] movdqu 0x10($inp),@MSG[1] movdqu 0x20($inp),@MSG[2] pshufb $TMP,@MSG[0] movdqu 0x30($inp),@MSG[3] movdqa 0*16-0x80($Tbl),$Wi paddd @MSG[0],$Wi pshufb $TMP,@MSG[1] movdqa $CDGH,$CDGH_SAVE # offload sha256rnds2 $ABEF,$CDGH # 0-3 pshufd \$0x0e,$Wi,$Wi nop movdqa $ABEF,$ABEF_SAVE # offload sha256rnds2 $CDGH,$ABEF movdqa 1*16-0x80($Tbl),$Wi paddd @MSG[1],$Wi pshufb $TMP,@MSG[2] sha256rnds2 $ABEF,$CDGH # 4-7 pshufd \$0x0e,$Wi,$Wi lea 0x40($inp),$inp sha256msg1 @MSG[1],@MSG[0] sha256rnds2 $CDGH,$ABEF movdqa 2*16-0x80($Tbl),$Wi paddd @MSG[2],$Wi pshufb $TMP,@MSG[3] sha256rnds2 $ABEF,$CDGH # 8-11 pshufd \$0x0e,$Wi,$Wi movdqa @MSG[3],$TMP palignr \$4,@MSG[2],$TMP nop paddd $TMP,@MSG[0] sha256msg1 @MSG[2],@MSG[1] sha256rnds2 $CDGH,$ABEF movdqa 3*16-0x80($Tbl),$Wi paddd @MSG[3],$Wi sha256msg2 @MSG[3],@MSG[0] sha256rnds2 $ABEF,$CDGH # 12-15 pshufd \$0x0e,$Wi,$Wi movdqa @MSG[0],$TMP palignr \$4,@MSG[3],$TMP nop paddd $TMP,@MSG[1] sha256msg1 @MSG[3],@MSG[2] sha256rnds2 $CDGH,$ABEF ___ for($i=4;$i<16-3;$i++) { $code.=<<___; movdqa $i*16-0x80($Tbl),$Wi paddd @MSG[0],$Wi sha256msg2 @MSG[0],@MSG[1] sha256rnds2 $ABEF,$CDGH # 16-19... 
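	# sha256rnds2 consumes the low 64 bits of $Wi; the pshufd \$0x0e
	# below moves the upper W+K pair down for the next two rounds,
	# while sha256msg1/sha256msg2 and the palignr/paddd glue compute
	# the next four message-schedule words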
pshufd \$0x0e,$Wi,$Wi movdqa @MSG[1],$TMP palignr \$4,@MSG[0],$TMP nop paddd $TMP,@MSG[2] sha256msg1 @MSG[0],@MSG[3] sha256rnds2 $CDGH,$ABEF ___ push(@MSG,shift(@MSG)); } $code.=<<___; movdqa 13*16-0x80($Tbl),$Wi paddd @MSG[0],$Wi sha256msg2 @MSG[0],@MSG[1] sha256rnds2 $ABEF,$CDGH # 52-55 pshufd \$0x0e,$Wi,$Wi movdqa @MSG[1],$TMP palignr \$4,@MSG[0],$TMP sha256rnds2 $CDGH,$ABEF paddd $TMP,@MSG[2] movdqa 14*16-0x80($Tbl),$Wi paddd @MSG[1],$Wi sha256rnds2 $ABEF,$CDGH # 56-59 pshufd \$0x0e,$Wi,$Wi sha256msg2 @MSG[1],@MSG[2] movdqa $BSWAP,$TMP sha256rnds2 $CDGH,$ABEF movdqa 15*16-0x80($Tbl),$Wi paddd @MSG[2],$Wi nop sha256rnds2 $ABEF,$CDGH # 60-63 pshufd \$0x0e,$Wi,$Wi dec $num nop sha256rnds2 $CDGH,$ABEF paddd $CDGH_SAVE,$CDGH paddd $ABEF_SAVE,$ABEF jnz .Loop_shaext pshufd \$0xb1,$CDGH,$CDGH # DCHG pshufd \$0x1b,$ABEF,$TMP # FEBA pshufd \$0xb1,$ABEF,$ABEF # BAFE punpckhqdq $CDGH,$ABEF # DCBA palignr \$8,$TMP,$CDGH # HGFE movdqu $ABEF,($ctx) movdqu $CDGH,16($ctx) ___ $code.=<<___ if ($win64); movaps -0x50(%rbp),%xmm6 movaps -0x40(%rbp),%xmm7 movaps -0x30(%rbp),%xmm8 movaps -0x20(%rbp),%xmm9 movaps -0x10(%rbp),%xmm10 mov %rbp,%rsp ___ $code.=<<___; .cfi_def_cfa_register %rsp pop %rbp .cfi_pop %rbp .cfi_epilogue ret .cfi_endproc .size ${pre}sha256_block_data_order_shaext,.-${pre}sha256_block_data_order_shaext ___ }}} {{{ my $a4=$T1; my ($a,$b,$c,$d,$e,$f,$g,$h); sub AUTOLOAD() # thunk [simplified] 32-bit style perlasm { my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; my $arg = pop; $arg = "\$$arg" if ($arg*1 eq $arg); $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n"; } sub body_00_15 () { ( '($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'. '&ror ($a0,$Sigma1[2]-$Sigma1[1])', '&mov ($a,$a1)', '&mov ($a4,$f)', '&ror ($a1,$Sigma0[2]-$Sigma0[1])', '&xor ($a0,$e)', '&xor ($a4,$g)', # f^g '&ror ($a0,$Sigma1[1]-$Sigma1[0])', '&xor ($a1,$a)', '&and ($a4,$e)', # (f^g)&e '&xor ($a0,$e)', '&add ($h,$SZ*($i&15)."(%rsp)")', # h+=X[i]+K[i] '&mov ($a2,$a)', '&xor ($a4,$g)', # Ch(e,f,g)=((f^g)&e)^g '&ror ($a1,$Sigma0[1]-$Sigma0[0])', '&xor ($a2,$b)', # a^b, b^c in next round '&add ($h,$a4)', # h+=Ch(e,f,g) '&ror ($a0,$Sigma1[0])', # Sigma1(e) '&and ($a3,$a2)', # (b^c)&(a^b) '&xor ($a1,$a)', '&add ($h,$a0)', # h+=Sigma1(e) '&xor ($a3,$b)', # Maj(a,b,c)=Ch(a^b,c,b) '&ror ($a1,$Sigma0[0])', # Sigma0(a) '&add ($d,$h)', # d+=h '&add ($h,$a3)', # h+=Maj(a,b,c) '&mov ($a0,$d)', '&add ($a1,$h);'. 
# h+=Sigma0(a) '($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;' ); } ###################################################################### # SSSE3 code path # { my $Tbl = $inp; my $_ctx="-64(%rbp)"; my $_inp="-56(%rbp)"; my $_end="-48(%rbp)"; my $framesz=3*8+$win64*16*4; my @X = map("%xmm$_",(0..3)); my ($t0,$t1,$t2,$t3, $t4,$t5) = map("%xmm$_",(4..9)); $code.=<<___; .globl ${func} .hidden ${func} .type ${func},\@function,3,"unwind" .align 64 ${func}: .cfi_startproc push %rbp .cfi_push %rbp mov %rsp,%rbp .cfi_def_cfa_register %rbp #ifndef __SGX_LVI_HARDENING__ testl \$2,__blst_platform_cap(%rip) jnz .L${func}\$2 #endif push %rbx .cfi_push %rbx push %r12 .cfi_push %r12 push %r13 .cfi_push %r13 push %r14 .cfi_push %r14 push %r15 .cfi_push %r15 shl \$4,%rdx # num*16 sub \$$framesz,%rsp .cfi_alloca $framesz lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ mov $ctx,$_ctx # save ctx, 1st arg #mov $inp,$_inp # save inp, 2nd arg mov %rdx,$_end # save end pointer, "3rd" arg ___ $code.=<<___ if ($win64); movaps %xmm6,-0x80(%rbp) movaps %xmm7,-0x70(%rbp) movaps %xmm8,-0x60(%rbp) movaps %xmm9,-0x50(%rbp) .cfi_offset %xmm6-%xmm9,-0x90 ___ $code.=<<___; .cfi_end_prologue lea -16*$SZ(%rsp),%rsp #ifdef __SGX_LVI_HARDENING__ lfence #endif mov $SZ*0($ctx),$A and \$-64,%rsp # align stack mov $SZ*1($ctx),$B mov $SZ*2($ctx),$C mov $SZ*3($ctx),$D mov $SZ*4($ctx),$E mov $SZ*5($ctx),$F mov $SZ*6($ctx),$G mov $SZ*7($ctx),$H ___ $code.=<<___; #movdqa $TABLE+`$SZ*$rounds`+32(%rip),$t4 #movdqa $TABLE+`$SZ*$rounds`+64(%rip),$t5 jmp .Lloop_ssse3 .align 16 .Lloop_ssse3: movdqa $TABLE+`$SZ*$rounds`(%rip),$t3 mov $inp,$_inp # offload $inp movdqu 0x00($inp),@X[0] movdqu 0x10($inp),@X[1] movdqu 0x20($inp),@X[2] pshufb $t3,@X[0] movdqu 0x30($inp),@X[3] lea $TABLE(%rip),$Tbl pshufb $t3,@X[1] movdqa 0x00($Tbl),$t0 movdqa 0x10($Tbl),$t1 pshufb $t3,@X[2] paddd @X[0],$t0 movdqa 0x20($Tbl),$t2 pshufb $t3,@X[3] movdqa 0x30($Tbl),$t3 paddd @X[1],$t1 paddd @X[2],$t2 paddd @X[3],$t3 movdqa $t0,0x00(%rsp) mov $A,$a1 movdqa $t1,0x10(%rsp) mov $B,$a3 movdqa $t2,0x20(%rsp) xor $C,$a3 # magic movdqa $t3,0x30(%rsp) mov $E,$a0 jmp .Lssse3_00_47 .align 16 .Lssse3_00_47: sub \$`-16*$SZ`,$Tbl # size optimization ___ sub Xupdate_256_SSSE3 () { ( '&movdqa ($t0,@X[1]);', '&movdqa ($t3,@X[3])', '&palignr ($t0,@X[0],$SZ)', # X[1..4] '&palignr ($t3,@X[2],$SZ);', # X[9..12] '&movdqa ($t1,$t0)', '&movdqa ($t2,$t0);', '&psrld ($t0,$sigma0[2])', '&paddd (@X[0],$t3);', # X[0..3] += X[9..12] '&psrld ($t2,$sigma0[0])', '&pshufd ($t3,@X[3],0b11111010)',# X[14..15] '&pslld ($t1,8*$SZ-$sigma0[1]);'. '&pxor ($t0,$t2)', '&psrld ($t2,$sigma0[1]-$sigma0[0]);'. '&pxor ($t0,$t1)', '&pslld ($t1,$sigma0[1]-$sigma0[0]);'. 
'&pxor ($t0,$t2);', '&movdqa ($t2,$t3)', '&pxor ($t0,$t1);', # sigma0(X[1..4]) '&psrld ($t3,$sigma1[2])', '&paddd (@X[0],$t0);', # X[0..3] += sigma0(X[1..4]) '&psrlq ($t2,$sigma1[0])', '&pxor ($t3,$t2);', '&psrlq ($t2,$sigma1[1]-$sigma1[0])', '&pxor ($t3,$t2)', '&pshufb ($t3,$t4)', # sigma1(X[14..15]) '&paddd (@X[0],$t3)', # X[0..1] += sigma1(X[14..15]) '&pshufd ($t3,@X[0],0b01010000)',# X[16..17] '&movdqa ($t2,$t3);', '&psrld ($t3,$sigma1[2])', '&psrlq ($t2,$sigma1[0])', '&pxor ($t3,$t2);', '&psrlq ($t2,$sigma1[1]-$sigma1[0])', '&pxor ($t3,$t2);', '&movdqa ($t2,16*$j."($Tbl)")', '&pshufb ($t3,$t5)', '&paddd (@X[0],$t3)' # X[2..3] += sigma1(X[16..17]) ); } sub SSSE3_256_00_47 () { my $j = shift; my $body = shift; my @X = @_; my @insns = (&$body,&$body,&$body,&$body); # 104 instructions if (0) { foreach (Xupdate_256_SSSE3()) { # 36 instructions eval; eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); } } else { # squeeze extra 4% on Westmere and 19% on Atom eval(shift(@insns)); #@ &movdqa ($t0,@X[1]); eval(shift(@insns)); eval(shift(@insns)); &movdqa ($t3,@X[3]); eval(shift(@insns)); #@ eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); #@ eval(shift(@insns)); &palignr ($t0,@X[0],$SZ); # X[1..4] eval(shift(@insns)); eval(shift(@insns)); &palignr ($t3,@X[2],$SZ); # X[9..12] eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); #@ &movdqa ($t1,$t0); eval(shift(@insns)); eval(shift(@insns)); &movdqa ($t2,$t0); eval(shift(@insns)); #@ eval(shift(@insns)); &psrld ($t0,$sigma0[2]); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &paddd (@X[0],$t3); # X[0..3] += X[9..12] eval(shift(@insns)); #@ eval(shift(@insns)); &psrld ($t2,$sigma0[0]); eval(shift(@insns)); eval(shift(@insns)); &pshufd ($t3,@X[3],0b11111010); # X[4..15] eval(shift(@insns)); eval(shift(@insns)); #@ &pslld ($t1,8*$SZ-$sigma0[1]); eval(shift(@insns)); eval(shift(@insns)); &pxor ($t0,$t2); eval(shift(@insns)); #@ eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); #@ &psrld ($t2,$sigma0[1]-$sigma0[0]); eval(shift(@insns)); &pxor ($t0,$t1); eval(shift(@insns)); eval(shift(@insns)); &pslld ($t1,$sigma0[1]-$sigma0[0]); eval(shift(@insns)); eval(shift(@insns)); &pxor ($t0,$t2); eval(shift(@insns)); eval(shift(@insns)); #@ &movdqa ($t2,$t3); eval(shift(@insns)); eval(shift(@insns)); &pxor ($t0,$t1); # sigma0(X[1..4]) eval(shift(@insns)); #@ eval(shift(@insns)); eval(shift(@insns)); &psrld ($t3,$sigma1[2]); eval(shift(@insns)); eval(shift(@insns)); &paddd (@X[0],$t0); # X[0..3] += sigma0(X[1..4]) eval(shift(@insns)); #@ eval(shift(@insns)); &psrlq ($t2,$sigma1[0]); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &pxor ($t3,$t2); eval(shift(@insns)); #@ eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); #@ &psrlq ($t2,$sigma1[1]-$sigma1[0]); eval(shift(@insns)); eval(shift(@insns)); &pxor ($t3,$t2); eval(shift(@insns)); #@ eval(shift(@insns)); eval(shift(@insns)); #&pshufb ($t3,$t4); # sigma1(X[14..15]) &pshufd ($t3,$t3,0b10000000); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &psrldq ($t3,8); eval(shift(@insns)); eval(shift(@insns)); #@ eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); #@ &paddd (@X[0],$t3); # X[0..1] += sigma1(X[14..15]) eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &pshufd ($t3,@X[0],0b01010000); # X[16..17] eval(shift(@insns)); eval(shift(@insns)); #@ eval(shift(@insns)); &movdqa ($t2,$t3); eval(shift(@insns)); eval(shift(@insns)); &psrld ($t3,$sigma1[2]); 
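	# as with X[14..15] above, sigma1() is needed on just two lanes,
	# so its 32-bit rotates are emulated with 64-bit psrlq shifts on
	# qword-duplicated lanes; the pshufd/pslldq fix-ups stand in for
	# the commented-out pshufb variant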
eval(shift(@insns)); eval(shift(@insns)); #@ &psrlq ($t2,$sigma1[0]); eval(shift(@insns)); eval(shift(@insns)); &pxor ($t3,$t2); eval(shift(@insns)); #@ eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); #@ eval(shift(@insns)); &psrlq ($t2,$sigma1[1]-$sigma1[0]); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &pxor ($t3,$t2); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); #@ #&pshufb ($t3,$t5); &pshufd ($t3,$t3,0b00001000); eval(shift(@insns)); eval(shift(@insns)); &movdqa ($t2,16*$j."($Tbl)"); eval(shift(@insns)); #@ eval(shift(@insns)); &pslldq ($t3,8); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &paddd (@X[0],$t3); # X[2..3] += sigma1(X[16..17]) eval(shift(@insns)); #@ eval(shift(@insns)); eval(shift(@insns)); } &paddd ($t2,@X[0]); foreach (@insns) { eval; } # remaining instructions &movdqa (16*$j."(%rsp)",$t2); } for ($i=0,$j=0; $j<4; $j++) { &SSSE3_256_00_47($j,\&body_00_15,@X); push(@X,shift(@X)); # rotate(@X) } &cmpb ($SZ-1+16*$SZ."($Tbl)",0); &jne (".Lssse3_00_47"); for ($i=0; $i<16; ) { foreach(body_00_15()) { eval; } } $code.=<<___; mov $_ctx,$ctx mov $a1,$A mov $_inp,$inp #ifdef __SGX_LVI_HARDENING__ lfence #endif add $SZ*0($ctx),$A add $SZ*1($ctx),$B add $SZ*2($ctx),$C add $SZ*3($ctx),$D add $SZ*4($ctx),$E add $SZ*5($ctx),$F add $SZ*6($ctx),$G add $SZ*7($ctx),$H lea 16*$SZ($inp),$inp cmp $_end,$inp mov $A,$SZ*0($ctx) mov $B,$SZ*1($ctx) mov $C,$SZ*2($ctx) mov $D,$SZ*3($ctx) mov $E,$SZ*4($ctx) mov $F,$SZ*5($ctx) mov $G,$SZ*6($ctx) mov $H,$SZ*7($ctx) jb .Lloop_ssse3 xorps %xmm0, %xmm0 movaps %xmm0, 0x00(%rsp) # scrub the stack movaps %xmm0, 0x10(%rsp) movaps %xmm0, 0x20(%rsp) movaps %xmm0, 0x30(%rsp) ___ $code.=<<___ if ($win64); movaps -0x80(%rbp),%xmm6 movaps -0x70(%rbp),%xmm7 movaps -0x60(%rbp),%xmm8 movaps -0x50(%rbp),%xmm9 ___ $code.=<<___; mov -40(%rbp),%r15 mov -32(%rbp),%r14 mov -24(%rbp),%r13 mov -16(%rbp),%r12 mov -8(%rbp),%rbx mov %rbp,%rsp .cfi_def_cfa_register %rsp pop %rbp .cfi_pop %rbp .cfi_epilogue ret .cfi_endproc .size ${func},.-${func} ___ } }}} { my ($out,$inp,$len) = $win64 ? 
("%rcx","%rdx","%r8") : # Win64 order ("%rdi","%rsi","%rdx"); # Unix order $code.=<<___; .globl ${pre}sha256_emit .hidden ${pre}sha256_emit .type ${pre}sha256_emit,\@abi-omnipotent .align 16 ${pre}sha256_emit: #ifdef __SGX_LVI_HARDENING__ lfence #endif mov 0($inp), %r8 mov 8($inp), %r9 mov 16($inp), %r10 bswap %r8 mov 24($inp), %r11 bswap %r9 mov %r8d, 4($out) bswap %r10 mov %r9d, 12($out) bswap %r11 mov %r10d, 20($out) shr \$32, %r8 mov %r11d, 28($out) shr \$32, %r9 mov %r8d, 0($out) shr \$32, %r10 mov %r9d, 8($out) shr \$32, %r11 mov %r10d, 16($out) mov %r11d, 24($out) ret .size ${pre}sha256_emit,.-${pre}sha256_emit .globl ${pre}sha256_bcopy .hidden ${pre}sha256_bcopy .type ${pre}sha256_bcopy,\@abi-omnipotent .align 16 ${pre}sha256_bcopy: #ifdef __SGX_LVI_HARDENING__ lfence #endif sub $inp, $out .Loop_bcopy: movzb ($inp), %eax lea 1($inp), $inp mov %al, -1($out,$inp) dec $len jnz .Loop_bcopy ret .size ${pre}sha256_bcopy,.-${pre}sha256_bcopy .globl ${pre}sha256_hcopy .hidden ${pre}sha256_hcopy .type ${pre}sha256_hcopy,\@abi-omnipotent .align 16 ${pre}sha256_hcopy: #ifdef __SGX_LVI_HARDENING__ lfence #endif mov 0($inp), %r8 mov 8($inp), %r9 mov 16($inp), %r10 mov 24($inp), %r11 mov %r8, 0($out) mov %r9, 8($out) mov %r10, 16($out) mov %r11, 24($out) ret .size ${pre}sha256_hcopy,.-${pre}sha256_hcopy ___ } sub sha256op38 { my $instr = shift; my %opcodelet = ( "sha256rnds2" => 0xcb, "sha256msg1" => 0xcc, "sha256msg2" => 0xcd ); if (defined($opcodelet{$instr}) && @_[0] =~ /%xmm([0-7]),\s*%xmm([0-7])/) { my @opcode=(0x0f,0x38); push @opcode,$opcodelet{$instr}; push @opcode,0xc0|($1&7)|(($2&7)<<3); # ModR/M return ".byte\t".join(',',@opcode); } else { return $instr."\t".@_[0]; } } foreach (split("\n",$code)) { s/\`([^\`]*)\`/eval $1/geo; s/\b(sha256[^\s]*)\s+(.*)/sha256op38($1,$2)/geo; print $_,"\n"; } close STDOUT; ================================================ FILE: src/asm/x86_64-xlate.pl ================================================ #!/usr/bin/env perl # # Copyright Supranational LLC # Licensed under the Apache License, Version 2.0, see LICENSE for details. # SPDX-License-Identifier: Apache-2.0 # # Ascetic x86_64 AT&T to MASM/NASM assembler translator by @dot-asm. # # Why AT&T to MASM and not vice versa? Several reasons. Because AT&T # format is way easier to parse. Because it's simpler to "gear" from # Unix ABI to Windows one [see cross-reference "card" at the end of # file]. Because Linux targets were available first... # # In addition the script also "distills" code suitable for GNU # assembler, so that it can be compiled with more rigid assemblers, # such as Solaris /usr/ccs/bin/as. # # This translator is not designed to convert *arbitrary* assembler # code from AT&T format to MASM one. It's designed to convert just # enough to provide for dual-ABI OpenSSL modules development... # There *are* limitations and you might have to modify your assembler # code or this script to achieve the desired result... # # Currently recognized limitations: # # - can't use multiple ops per line; # # Dual-ABI styling rules. # # 1. Adhere to Unix register and stack layout [see cross-reference # ABI "card" at the end for explanation]. # 2. Forget about "red zone," stick to more traditional blended # stack frame allocation. If volatile storage is actually required # that is. If not, just leave the stack as is. # 3. Functions tagged with ".type name,@function" get crafted with # unified Win64 prologue and epilogue automatically. 
If you want # to take care of ABI differences yourself, tag functions as # ".type name,@abi-omnipotent" instead. # 4. To optimize the Win64 prologue you can specify number of input # arguments as ".type name,@function,N." Keep in mind that if N is # larger than 6, then you *have to* write "abi-omnipotent" code, # because >6 cases can't be addressed with unified prologue. # 5. Name local labels as .L*, do *not* use dynamic labels such as 1: # (sorry about latter). # 6. Don't use [or hand-code with .byte] "rep ret." "ret" mnemonic is # required to identify the spots, where to inject Win64 epilogue! # But on the pros, it's then prefixed with rep automatically:-) # 7. Stick to explicit ip-relative addressing. If you have to use # GOTPCREL addressing, stick to mov symbol@GOTPCREL(%rip),%r??. # Both are recognized and translated to proper Win64 addressing # modes. # # 8. In order to provide for structured exception handling unified # Win64 prologue copies %rsp value to %rax. [Unless function is # tagged with additional .type tag.] For further details see SEH # paragraph at the end. # 9. .init segment is allowed to contain calls to functions only. # a. If function accepts more than 4 arguments *and* >4th argument # is declared as non 64-bit value, do clear its upper part. use strict; my $flavour = shift; my $output = shift; if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } open STDOUT,">$output" || die "can't open $output: $!" if (defined($output)); my $gas=1; $gas=0 if ($output =~ /\.asm$/); my $elf=1; $elf=0 if (!$gas); my $dwarf=$elf; my $win64=0; my $prefix=""; my $decor=".L"; my $masmref=8 + 50727*2**-32; # 8.00.50727 shipped with VS2005 my $masm=0; my $PTR=" PTR"; my $nasmref=2.03; my $nasm=0; if ($flavour eq "mingw64") { $gas=1; $elf=0; $win64=1; $prefix=`echo __USER_LABEL_PREFIX__ | \${CC:-false} -E -P -`; $prefix =~ s|\R$||; # Better chomp } elsif ($flavour eq "macosx") { $gas=1; $elf=0; $prefix="_"; $decor="L\$"; } elsif ($flavour eq "masm") { $gas=0; $elf=0; $masm=$masmref; $win64=1; $decor="\$L\$"; } elsif ($flavour eq "nasm") { $gas=0; $elf=0; $nasm=$nasmref; $win64=1; $decor="\$L\$"; $PTR=""; } elsif (!$gas) { if ($ENV{ASM} =~ m/nasm/ && `nasm -v` =~ m/version ([0-9]+)\.([0-9]+)/i) { $nasm = $1 + $2*0.01; $PTR=""; } elsif (`ml64 2>&1` =~ m/Version ([0-9]+)\.([0-9]+)(\.([0-9]+))?/) { $masm = $1 + $2*2**-16 + $4*2**-32; } die "no assembler found on %PATH%" if (!($nasm || $masm)); $win64=1; $elf=0; $decor="\$L\$"; } my $colon= $masm ? "::" : ":"; $dwarf=0 if($win64); my $current_segment; my $current_function; my %globals; my $ret_clobber; { package opcode; # pick up opcodes sub re { my ($class, $line) = @_; my $self = {}; my $ret; if ($$line =~ /^([a-z][a-z0-9]*)/i) { bless $self,$class; $self->{op} = $1; $ret = $self; $$line = substr($$line,@+[0]); $$line =~ s/^\s+//; undef $self->{sz}; if ($self->{op} =~ /^(movz)x?([bw]).*/) { # movz is pain... 
$self->{op} = $1; $self->{sz} = $2; } elsif ($self->{op} =~ /cmov[n]?[lb]$/) { # pass through } elsif ($self->{op} =~ /call|jmp/) { $self->{sz} = ""; } elsif ($self->{op} =~ /^p/ && $' !~ /^(ush|op|insrw)/) { # SSEn $self->{sz} = ""; } elsif ($self->{op} =~ /^[vk]/) { # VEX or k* such as kmov $self->{sz} = ""; } elsif ($self->{op} =~ /mov[dq]/ && $$line =~ /%xmm/) { $self->{sz} = ""; } elsif ($self->{op} =~ /([a-z]{3,})([qlwb])$/) { $self->{op} = $1; $self->{sz} = $2; } } $ret; } sub size { my ($self, $sz) = @_; $self->{sz} = $sz if (defined($sz) && !defined($self->{sz})); $self->{sz}; } sub out { my $self = shift; if ($gas) { if ($self->{op} eq "movz") { # movz is pain... sprintf "%s%s%s",$self->{op},$self->{sz},shift; } elsif ($self->{op} =~ /^set/) { "$self->{op}"; } elsif ($self->{op} eq "ret") { my $epilogue = ""; my $reg = $ret_clobber || "rdx"; $ret_clobber = undef; if ($win64 && $current_function->{abi} eq "svr4" && !$current_function->{unwind}) { $epilogue = "movq 8(%rsp),%rdi\n\t" . "movq 16(%rsp),%rsi\n\t"; } $epilogue . "\n#ifdef __SGX_LVI_HARDENING__\n". " popq %$reg\n" . " lfence\n" . " jmpq *%$reg\n" . " ud2\n" . "#else\n" . " .byte 0xf3,0xc3\n" . "#endif"; } elsif ($self->{op} eq "call" && !$elf && $current_segment eq ".init") { ".p2align\t3\n\t.quad"; } else { "$self->{op}$self->{sz}"; } } else { $self->{op} =~ s/^movz/movzx/; if ($self->{op} eq "ret") { $self->{op} = ""; my $reg = $ret_clobber || "rdx"; $ret_clobber = undef; if ($win64 && $current_function->{abi} eq "svr4" && !$current_function->{unwind}) { $self->{op} = "mov rdi,QWORD$PTR\[8+rsp\]\t;WIN64 epilogue\n\t". "mov rsi,QWORD$PTR\[16+rsp\]\n\t"; } $self->{op} .= "\nifdef __SGX_LVI_HARDENING__\n". " pop $reg\n" . " lfence\n" . " jmp $reg\n" . " ud2\n" . "else\n" . " DB\t0F3h,0C3h\n" . "endif"; } elsif ($self->{op} =~ /^(pop|push)f/) { $self->{op} .= $self->{sz}; } elsif ($self->{op} eq "call" && $current_segment eq ".CRT\$XCU") { $self->{op} = "\tDQ"; } $self->{op}; } } sub mnemonic { my ($self, $op) = @_; $self->{op}=$op if (defined($op)); $self->{op}; } } { package const; # pick up constants, which start with $ sub re { my ($class, $line) = @_; my $self = {}; my $ret; if ($$line =~ /^\$([^,]+)/) { bless $self, $class; $self->{value} = $1; $ret = $self; $$line = substr($$line,@+[0]); $$line =~ s/^\s+//; } $ret; } sub out { my $self = shift; $self->{value} =~ s/\b(0b[0-1]+)/oct($1)/eig; if ($gas) { # Solaris /usr/ccs/bin/as can't handle multiplications # in $self->{value} my $value = $self->{value}; no warnings; # oct might complain about overflow, ignore here... 
$value =~ s/(?<![\w\$\.])(0x?[0-9a-f]+)/oct($1)/egi; if ($value =~ s/([0-9]+\s*[\*\/\%]\s*[0-9]+)/eval($1)/eg) { $self->{value} = $value; } sprintf "\$%s",$self->{value}; } else { my $value = $self->{value}; $value =~ s/0x([0-9a-f]+)/0$1h/ig if ($masm); sprintf "%s",$value; } } } { package ea; # pick up effective addresses: expr(%reg,%reg,scale) my %szmap = ( b=>"BYTE$PTR", w=>"WORD$PTR", l=>"DWORD$PTR", d=>"DWORD$PTR", q=>"QWORD$PTR", o=>"OWORD$PTR", x=>"XMMWORD$PTR", y=>"YMMWORD$PTR", z=>"ZMMWORD$PTR" ) if (!$gas); my %sifmap = ( ss=>"d", sd=>"q", # broadcast only i32x2=>"q", f32x2=>"q", i32x4=>"x", i64x2=>"x", i128=>"x", f32x4=>"x", f64x2=>"x", f128=>"x", i32x8=>"y", i64x4=>"y", f32x8=>"y", f64x4=>"y" ) if (!$gas); sub re { my ($class, $line, $opcode) = @_; my $self = {}; my $ret; # optional * ----vvv--- appears in indirect jmp/call if ($$line =~ /^(\*?)([^\(,]*)\(([%\w,\s]+)\)((?:{[^}]+})*)/) { bless $self, $class; $self->{asterisk} = $1; $self->{label} = $2; ($self->{base},$self->{index},$self->{scale})=split(/(?:,\s*)/,$3); $self->{scale} = 1 if (!defined($self->{scale})); $self->{opmask} = $4; $ret = $self; $$line = substr($$line,@+[0]); $$line =~ s/^\s+//; if ($win64 && $self->{label} =~ s/\@GOTPCREL//) { die if ($opcode->mnemonic() ne "mov"); $opcode->mnemonic("lea"); } $self->{base} =~ s/^%//; $self->{index} =~ s/^%// if (defined($self->{index})); $self->{opcode} = $opcode; } $ret; } sub size {} sub out { my ($self, $sz) = @_; $self->{label} =~ s/([_a-z][_a-z0-9\$]*)/$globals{$1} or $1/gei; $self->{label} =~ s/\.L/$decor/g; # Silently convert all EAs to 64-bit. This is required for # elder GNU assembler and results in more compact code, # *but* most importantly AES module depends on this feature! $self->{index} =~ s/^[er](.?[0-9xpi])[d]?$/r\1/; $self->{base} =~ s/^[er](.?[0-9xpi])[d]?$/r\1/; # Solaris /usr/ccs/bin/as can't handle multiplications # in $self->{label}... use integer; $self->{label} =~ s/(?<![\w\$\.])(0x?[0-9a-f]+)/oct($1)/egi; $self->{label} =~ s/\b([0-9]+\s*[\*\/\%]\s*[0-9]+)\b/eval($1)/eg; # Some assemblers insist on signed presentation of 32-bit # offsets, but sign extension is a tricky business in perl...
$self->{label} =~ s/\b([0-9]+)\b/unpack("l",pack("L",$1))/eg; # if base register is %rbp or %r13, see if it's possible to # flip base and index registers [for better performance] if (!$self->{label} && $self->{index} && $self->{scale}==1 && $self->{base} =~ /(rbp|r13)/) { $self->{base} = $self->{index}; $self->{index} = $1; } if ($gas) { $self->{label} =~ s/^___imp_/__imp__/ if ($flavour eq "mingw64"); if (defined($self->{index})) { sprintf "%s%s(%s,%%%s,%d)%s", $self->{asterisk},$self->{label}, $self->{base}?"%$self->{base}":"", $self->{index},$self->{scale}, $self->{opmask}; } else { sprintf "%s%s(%%%s)%s", $self->{asterisk},$self->{label}, $self->{base},$self->{opmask}; } } else { $self->{label} =~ s/\./\$/g; $self->{label} =~ s/(?<![\w\$\.])0x([0-9a-f]+)/0$1h/ig if ($masm); $self->{label} = "($self->{label})" if ($self->{label} =~ /[\*\+\-\/]/); my $mnemonic = $self->{opcode}->mnemonic(); ($self->{asterisk}) && ($sz="q") || ($mnemonic =~ /^v?mov([qd])$/) && ($sz=$1) || ($mnemonic =~ /^v?pinsr([qdwb])$/) && ($sz=$1) || ($mnemonic =~ /^vpbroadcast([qdwb])$/) && ($sz=$1) || ($mnemonic =~ /^v(?:broadcast|extract|insert)([sif]\w+)$/) && ($sz=$sifmap{$1}); $self->{opmask} =~ s/%(k[0-7])/$1/; if (defined($self->{index})) { sprintf "%s[%s%s*%d%s]%s",$szmap{$sz}, $self->{label}?"$self->{label}+":"", $self->{index},$self->{scale}, $self->{base}?"+$self->{base}":"", $self->{opmask}; } elsif ($self->{base} eq "rip") { sprintf "%s[%s]",$szmap{$sz},$self->{label}; } else { sprintf "%s[%s%s]%s", $szmap{$sz}, $self->{label}?"$self->{label}+":"", $self->{base},$self->{opmask}; } } } } { package register; # pick up registers, which start with %. sub re { my ($class, $line, $opcode) = @_; my $self = {}; my $ret; # optional * ----vvv--- appears in indirect jmp/call if ($$line =~ /^(\*?)%(\w+)((?:{[^}]+})*)/) { bless $self,$class; $self->{asterisk} = $1; $self->{value} = $2; $self->{opmask} = $3; $opcode->size($self->size()); $ret = $self; $$line = substr($$line,@+[0]); $$line =~ s/^\s+//; } $ret; } sub size { my $self = shift; my $ret; if ($self->{value} =~ /^r[\d]+b$/i) { $ret="b"; } elsif ($self->{value} =~ /^r[\d]+w$/i) { $ret="w"; } elsif ($self->{value} =~ /^r[\d]+d$/i) { $ret="l"; } elsif ($self->{value} =~ /^r[\w]+$/i) { $ret="q"; } elsif ($self->{value} =~ /^[a-d][hl]$/i){ $ret="b"; } elsif ($self->{value} =~ /^[\w]{2}l$/i) { $ret="b"; } elsif ($self->{value} =~ /^[\w]{2}$/i) { $ret="w"; } elsif ($self->{value} =~ /^e[a-z]{2}$/i){ $ret="l"; } $ret; } sub out { my $self = shift; if ($gas) { sprintf "%s%%%s%s", $self->{asterisk}, $self->{value}, $self->{opmask}; } else { $self->{opmask} =~ s/%(k[0-7])/$1/; $self->{value}.$self->{opmask}; } } } { package label; # pick up labels, which end with : sub re { my ($class, $line) = @_; my $self = {}; my $ret; if ($$line =~ /(^[\.\w\$]+)\:/) { bless $self,$class; $self->{value} = $1; $ret = $self; $$line = substr($$line,@+[0]); $$line =~ s/^\s+//; $self->{value} =~ s/^(\w+\$\w*)/$decor\1/ if ($flavour eq "macosx"); $self->{value} =~ s/^\.L/$decor/; } $ret; } sub win64_args { my $narg = $current_function->{narg} // 6; return undef if ($narg < 0); my $arg5 = 4*8 - cfi_directive::cfa_rsp(); my $arg6 = $arg5 + 8; my $args; if ($gas) { $args .= " movq %rcx,%rdi\n" if ($narg>0); $args .= " movq %rdx,%rsi\n" if ($narg>1); $args .= " movq %r8,%rdx\n" if ($narg>2); $args .= " movq %r9,%rcx\n" if ($narg>3); $args .= " movq $arg5(%rsp),%r8\n" if ($narg>4); $args .= " movq $arg6(%rsp),%r9\n" if ($narg>5); } else { $args .= " mov rdi,rcx\n" if ($narg>0); $args .= " mov rsi,rdx\n" if ($narg>1); $args .= " mov rdx,r8\n" if
($narg>2); $args .= " mov rcx,r9\n" if ($narg>3); $args .= " mov r8,QWORD$PTR\[$arg5+rsp\]\n" if ($narg>4); $args .= " mov r9,QWORD$PTR\[$arg6+rsp\]\n" if ($narg>5); } $current_function->{narg} = -1; $args; } sub out { my $self = shift; if ($gas) { my $func = ($globals{$self->{value}} or $self->{value}) . ":"; if ($current_function->{name} eq $self->{value}) { $current_function->{pc} = 0; $func .= "\n.cfi_".cfi_directive::startproc() if ($dwarf); $func .= "\n .byte 0xf3,0x0f,0x1e,0xfa\n"; # endbranch if ($win64) { if ($current_function->{abi} eq "svr4") { my $fp = $current_function->{unwind} ? "%r11" : "%rax"; $func .= " movq %rdi,8(%rsp)\n"; $func .= " movq %rsi,16(%rsp)\n"; $func .= " movq %rsp,$fp\n"; $func .= "${decor}SEH_begin_$current_function->{name}:\n"; } elsif ($current_function->{unwind}) { $func .= " movq %rsp,%r11\n"; $func .= "${decor}SEH_begin_$current_function->{name}:\n"; } } } elsif ($win64 && $current_function->{abi} eq "svr4" && $current_function->{pc} >= 0) { $func = win64_args().$func; } $func; } elsif ($self->{value} ne "$current_function->{name}") { my $func; if ($win64 && $current_function->{abi} eq "svr4" && $current_function->{pc} >= 0) { $func = win64_args(); } $func .= $self->{value} . $colon; $func; } else { $current_function->{pc} = 0; my $func = "$current_function->{name}" . ($nasm ? ":" : "\tPROC $current_function->{scope}") . "\n"; $func .= " DB 243,15,30,250\n"; # endbranch if ($current_function->{abi} eq "svr4") { my $fp = $current_function->{unwind} ? "r11" : "rax"; $func .= " mov QWORD$PTR\[8+rsp\],rdi\t;WIN64 prologue\n"; $func .= " mov QWORD$PTR\[16+rsp\],rsi\n"; $func .= " mov $fp,rsp\n"; $func .= "${decor}SEH_begin_$current_function->{name}${colon}\n"; } elsif ($current_function->{unwind}) { $func .= " mov r11,rsp\n"; $func .= "${decor}SEH_begin_$current_function->{name}${colon}\n"; } $func; } } } { package expr; # pick up expressions sub re { my ($class, $line, $opcode) = @_; my $self = {}; my $ret; if ($$line =~ /(^[^,]+)/) { bless $self,$class; $self->{value} = $1; $ret = $self; $$line = substr($$line,@+[0]); $$line =~ s/^\s+//; $self->{value} =~ s/\@PLT// if (!$elf); $self->{value} =~ s/([_a-z][_a-z0-9\$]*)/$globals{$1} or $1/gei; if ($flavour eq "macosx" and $self->{value} !~ /\.L/) { $self->{value} =~ s/(\w+\$\w*)/$decor\1/g; } $self->{value} =~ s/\.L/$decor/g; $self->{opcode} = $opcode; } $ret; } sub out { my $self = shift; $self->{value}; } } my @xdata_seg = (".section .xdata", ".align 8"); my @pdata_seg = (".section .pdata", ".align 4"); { package cfi_directive; # CFI directives annotate instructions that are significant for # stack unwinding procedure compliant with DWARF specification, # see http://dwarfstd.org/. Besides naturally expected for this # script platform-specific filtering function, this module adds # four auxiliary synthetic directives not recognized by [GNU] # assembler: # # - .cfi_push to annotate push instructions in prologue, which # translates to .cfi_adjust_cfa_offset (if needed) and # .cfi_offset; # - .cfi_pop to annotate pop instructions in epilogue, which # translates to .cfi_adjust_cfa_offset (if needed) and # .cfi_restore; # - .cfi_alloca to annotate stack pointer adjustments, which # translates to .cfi_adjust_cfa_offset as needed; # - [and most notably] .cfi_cfa_expression which encodes # DW_CFA_def_cfa_expression and passes it to .cfi_escape as # byte vector; # # CFA expressions were introduced in DWARF specification version # 3 and describe how to deduce CFA, Canonical Frame Address. 
This # becomes handy if your stack frame is variable and you can't # spare register for [previous] frame pointer. Suggested directive # syntax is made-up mix of DWARF operator suffixes [subset of] # and references to registers with optional bias. Following example # describes offloaded *original* stack pointer at specific offset # from *current* stack pointer: # # .cfi_cfa_expression %rsp+40,deref,+8 # # Final +8 has everything to do with the fact that CFA is defined # as reference to top of caller's stack, and on x86_64 call to # subroutine pushes 8-byte return address. In other words original # stack pointer upon entry to a subroutine is 8 bytes off from CFA. # # In addition the .cfi directives are re-purposed even for Win64 # stack unwinding. Two more synthetic directives were added: # # - .cfi_end_prologue to denote point when all non-volatile # registers are saved and stack or [chosen] frame pointer is # stable; # - .cfi_epilogue to denote point when all non-volatile registers # are restored [and it even adds missing .cfi_restore-s]; # # Though it's not universal "miracle cure," it has its limitations. # Most notably .cfi_cfa_expression won't start working... For more # information see the end of this file. # Below constants are taken from "DWARF Expressions" section of the # DWARF specification, section is numbered 7.7 in versions 3 and 4. my %DW_OP_simple = ( # no-arg operators, mapped directly deref => 0x06, dup => 0x12, drop => 0x13, over => 0x14, pick => 0x15, swap => 0x16, rot => 0x17, xderef => 0x18, abs => 0x19, and => 0x1a, div => 0x1b, minus => 0x1c, mod => 0x1d, mul => 0x1e, neg => 0x1f, not => 0x20, or => 0x21, plus => 0x22, shl => 0x24, shr => 0x25, shra => 0x26, xor => 0x27, ); my %DW_OP_complex = ( # used in specific subroutines constu => 0x10, # uleb128 consts => 0x11, # sleb128 plus_uconst => 0x23, # uleb128 lit0 => 0x30, # add 0-31 to opcode reg0 => 0x50, # add 0-31 to opcode breg0 => 0x70, # add 0-31 to opcode, sleb128 regx => 0x90, # uleb128 fbreg => 0x91, # sleb128 bregx => 0x92, # uleb128, sleb128 piece => 0x93, # uleb128 ); # Following constants are defined in x86_64 ABI supplement, for # example available at https://www.uclibc.org/docs/psABI-x86_64.pdf, # see section 3.7 "Stack Unwind Algorithm". my %DW_reg_idx = ( "%rax"=>0, "%rdx"=>1, "%rcx"=>2, "%rbx"=>3, "%rsi"=>4, "%rdi"=>5, "%rbp"=>6, "%rsp"=>7, "%r8" =>8, "%r9" =>9, "%r10"=>10, "%r11"=>11, "%r12"=>12, "%r13"=>13, "%r14"=>14, "%r15"=>15 ); my ($cfa_reg, $cfa_off, $cfa_rsp, %saved_regs); my @cfa_stack; sub cfa_rsp { return $cfa_rsp // -8; } # [us]leb128 format is variable-length integer representation base # 128, with most significant bit of each byte being 0 denoting # *last* most significant digit. See "Variable Length Data" in the # DWARF specification, numbered 7.6 at least in versions 3 and 4. sub sleb128 { use integer; # get right shift extend sign my $val = shift; my $sign = ($val < 0) ? -1 : 0; my @ret = (); while(1) { push @ret, $val&0x7f; # see if remaining bits are same and equal to most # significant bit of the current digit, if so, it's # last digit... last if (($val>>6) == $sign); @ret[-1] |= 0x80; $val >>= 7; } return @ret; } sub uleb128 { my $val = shift; my @ret = (); while(1) { push @ret, $val&0x7f; # see if it's last significant digit...
last if (($val >>= 7) == 0); @ret[-1] |= 0x80; } return @ret; } sub const { my $val = shift; if ($val >= 0 && $val < 32) { return ($DW_OP_complex{lit0}+$val); } return ($DW_OP_complex{consts}, sleb128($val)); } sub reg { my $val = shift; return if ($val !~ m/^(%r\w+)(?:([\+\-])((?:0x)?[0-9a-f]+))?/); my $reg = $DW_reg_idx{$1}; my $off = eval ("0 $2 $3"); return (($DW_OP_complex{breg0} + $reg), sleb128($off)); # Yes, we use DW_OP_bregX+0 to push register value and not # DW_OP_regX, because latter would require even DW_OP_piece, # which would be a waste under the circumstances. If you have # to use DW_OP_regX, use "regx:N"... } sub cfa_expression { my $line = shift; my @ret; foreach my $token (split(/,\s*/,$line)) { if ($token =~ /^%r/) { push @ret,reg($token); } elsif ($token =~ /((?:0x)?[0-9a-f]+)\((%r\w+)\)/) { push @ret,reg("$2+$1"); } elsif ($token =~ /(\w+):(\-?(?:0x)?[0-9a-f]+)(U?)/i) { my $i = 1*eval($2); push @ret,$DW_OP_complex{$1}, ($3 ? uleb128($i) : sleb128($i)); } elsif (my $i = 1*eval($token) or $token eq "0") { if ($token =~ /^\+/) { push @ret,$DW_OP_complex{plus_uconst},uleb128($i); } else { push @ret,const($i); } } else { push @ret,$DW_OP_simple{$token}; } } # Finally we return DW_CFA_def_cfa_expression, 15, followed by # length of the expression and of course the expression itself. return (15,scalar(@ret),@ret); } # Following constants are defined in "x64 exception handling" at # https://docs.microsoft.com/ and match the register sequence in # CONTEXT structure defined in winnt.h. my %WIN64_reg_idx = ( "%rax"=>0, "%rcx"=>1, "%rdx"=>2, "%rbx"=>3, "%rsp"=>4, "%rbp"=>5, "%rsi"=>6, "%rdi"=>7, "%r8" =>8, "%r9" =>9, "%r10"=>10, "%r11"=>11, "%r12"=>12, "%r13"=>13, "%r14"=>14, "%r15"=>15 ); sub xdata { our @dat = (); our $len = 0; sub savereg { my ($key, $offset) = @_; if ($key =~ /%xmm([0-9]+)/) { if ($offset < 0x100000) { push @dat, [0,($1<<4)|8,unpack("C2",pack("v",$offset>>4))]; } else { push @dat, [0,($1<<4)|9,unpack("C4",pack("V",$offset))]; } } else { if ($offset < 0x80000) { push @dat, [0,(($WIN64_reg_idx{$key})<<4)|4, unpack("C2",pack("v",$offset>>3))]; } else { push @dat, [0,(($WIN64_reg_idx{$key})<<4)|5, unpack("C4",pack("V",$offset))]; } } $len += $#{@dat[-1]}+1; } my $fp_info = 0; # allocate stack frame if ($cfa_rsp < -8) { my $offset = -8 - $cfa_rsp; if ($cfa_reg ne "%rsp" && $saved_regs{$cfa_reg} == -16) { $fp_info = $WIN64_reg_idx{$cfa_reg}; push @dat, [0,$fp_info<<4]; # UWOP_PUSH_NONVOL $len += $#{@dat[-1]}+1; $offset -= 8; } if ($offset <= 128) { my $alloc = ($offset - 8) >> 3; push @dat, [0,$alloc<<4|2]; # UWOP_ALLOC_SMALL } elsif ($offset < 0x80000) { push @dat, [0,0x01,unpack("C2",pack("v",$offset>>3))]; } else { push @dat, [0,0x11,unpack("C4",pack("V",$offset))]; } $len += $#{@dat[-1]}+1; } # save frame pointer [if not pushed already] if ($cfa_reg ne "%rsp" && $fp_info == 0) { $fp_info = $WIN64_reg_idx{$cfa_reg}; if (defined(my $offset = $saved_regs{$cfa_reg})) { $offset -= $cfa_rsp; savereg($cfa_reg, $offset); } } # set up frame pointer if ($fp_info) { push @dat, [0,($fp_info<<4)|3]; # UWOP_SET_FPREG $len += $#{@dat[-1]}+1; my $fp_off = $cfa_off - $cfa_rsp; ($fp_off > 240 or $fp_off&0xf) and die "invalid FP offset $fp_off"; $fp_info |= $fp_off&-16; } # save registers foreach my $key (sort { $saved_regs{$b} <=> $saved_regs{$a} } keys(%saved_regs)) { next if ($cfa_reg ne "%rsp" && $cfa_reg eq $key); my $offset = $saved_regs{$key} - $cfa_rsp; savereg($key, $offset); } my @ret; # generate 4-byte descriptor push @ret, ".byte 1,0,".($len/2).",$fp_info"; $len += 4;
# keep objdump happy, pad to 4*n and add a 32-bit zero unshift @dat, [(0)x(((-$len)&3)+4)]; $len += $#{@dat[0]}+1; # pad to 8*n unshift @dat, [(0)x((-$len)&7)] if ($len&7); # emit data while(defined(my $row = pop @dat)) { push @ret, ".byte ". join(",", map { sprintf "0x%02x",$_ } @{$row}); } return @ret; } sub startproc { return if ($cfa_rsp == -8); ($cfa_reg, $cfa_off, $cfa_rsp) = ("%rsp", -8, -8); %saved_regs = (); return "startproc"; } sub endproc { return if ($cfa_rsp == 0); ($cfa_reg, $cfa_off, $cfa_rsp) = ("%rsp", 0, 0); %saved_regs = (); return "endproc"; } sub re { my ($class, $line) = @_; my $self = {}; my $ret; if ($$line =~ s/^\s*\.cfi_(\w+)\s*//) { bless $self,$class; $ret = $self; undef $self->{value}; my $dir = $1; SWITCH: for ($dir) { # What is $cfa_rsp? Effectively it's difference between %rsp # value and current CFA, Canonical Frame Address, which is # why it starts with -8. Recall that CFA is top of caller's # stack... /startproc/ && do { $dir = startproc(); last; }; /endproc/ && do { $dir = endproc(); # .cfi_remember_state directives that are not # matched with .cfi_restore_state are # unnecessary. die "unpaired .cfi_remember_state" if (@cfa_stack); last; }; /def_cfa_register/ && do { $cfa_off = $cfa_rsp if ($cfa_reg eq "%rsp"); $cfa_reg = $$line; $cfa_rsp = $cfa_off if ($cfa_reg eq "%rsp"); last; }; /def_cfa_offset/ && do { $cfa_off = -1*eval($$line); $cfa_rsp = $cfa_off if ($cfa_reg eq "%rsp"); last; }; /adjust_cfa_offset/ && do { my $val = 1*eval($$line); $cfa_off -= $val; if ($cfa_reg eq "%rsp") { $cfa_rsp -= $val; } last; }; /alloca/ && do { $dir = undef; my $val = 1*eval($$line); $cfa_rsp -= $val; if ($cfa_reg eq "%rsp") { $cfa_off -= $val; $dir = "adjust_cfa_offset"; } last; }; /def_cfa/ && do { if ($$line =~ /(%r\w+)\s*(?:,\s*(.+))?/) { $cfa_reg = $1; if ($cfa_reg eq "%rsp" && !defined($2)) { $cfa_off = $cfa_rsp; $$line .= ",".(-$cfa_rsp); } else { $cfa_off = -1*eval($2); $cfa_rsp = $cfa_off if ($cfa_reg eq "%rsp"); } } last; }; /push/ && do { $dir = undef; $cfa_rsp -= 8; if ($cfa_reg eq "%rsp") { $cfa_off = $cfa_rsp; $self->{value} = ".cfi_adjust_cfa_offset\t8\n"; } $saved_regs{$$line} = $cfa_rsp; $self->{value} .= ".cfi_offset\t$$line,$cfa_rsp"; last; }; /pop/ && do { $dir = undef; $cfa_rsp += 8; if ($cfa_reg eq "%rsp") { $cfa_off = $cfa_rsp; $self->{value} = ".cfi_adjust_cfa_offset\t-8\n"; } $self->{value} .= ".cfi_restore\t$$line"; delete $saved_regs{$$line}; last; }; /cfa_expression/ && do { $dir = undef; $self->{value} = ".cfi_escape\t" . 
join(",", map(sprintf("0x%02x", $_), cfa_expression($$line))); last; }; /remember_state/ && do { push @cfa_stack, [$cfa_reg,$cfa_off,$cfa_rsp,%saved_regs]; last; }; /restore_state/ && do { ($cfa_reg,$cfa_off,$cfa_rsp,%saved_regs) = @{pop @cfa_stack}; last; }; /offset/ && do { if ($$line =~ /(%\w+)(?:-%xmm(\d+))?\s*,\s*(.+)/) { my ($reg, $off, $xmmlast) = ($1, 1*eval($3), $2); if ($reg !~ /%xmm(\d+)/) { $saved_regs{$reg} = $off; } else { $dir = undef; $xmmlast //= $1; for (my $i=$1; $i<=$xmmlast; $i++) { $saved_regs{"%xmm$i"} = $off; $off += 16; } } } last; }; /restore/ && do { delete $saved_regs{$$line}; last; }; /end_prologue/ && do { $dir = undef; $self->{win64} = ".endprolog"; last; }; /epilogue/ && do { $dir = undef; $self->{win64} = ".epilogue"; $self->{value} = join("\n", map { ".cfi_restore\t$_" } sort keys(%saved_regs)); %saved_regs = (); last; }; } $self->{value} = ".cfi_$dir\t$$line" if ($dir); $$line = ""; } return $ret; } sub out { my $self = shift; return $self->{value} if ($dwarf); if ($win64 and $current_function->{unwind} and my $ret = $self->{win64}) { my ($reg, $off) = ($cfa_reg =~ /%(?!rsp)/) ? ($', $cfa_off) : ("rsp", $cfa_rsp); my $fname = $current_function->{name}; if ($ret eq ".endprolog") { $ret = ""; if ($current_function->{abi} eq "svr4") { $ret .= label::win64_args(); $saved_regs{"%rdi"} = 0; # relative to CFA, remember? $saved_regs{"%rsi"} = 8; } push @pdata_seg, ".rva .LSEH_begin_${fname}", ".rva .LSEH_body_${fname}", ".rva .LSEH_info_${fname}_prologue",""; push @xdata_seg, ".LSEH_info_${fname}_prologue:"; if ($current_function->{unwind} eq "%rbp") { if ($current_function->{abi} eq "svr4") { push @xdata_seg, ".byte 1,4,6,0x05", # 6 unwind codes, %rbp is FP ".byte 4,0x74,2,0", # %rdi at 16(%rsp) ".byte 4,0x64,3,0", # %rsi at 24(%rsp) ".byte 4,0x53", # mov %rsp, %rbp ".byte 1,0x50", # push %rbp ".long 0,0" # pad to keep objdump happy ; } else { push @xdata_seg, ".byte 1,4,2,0x05", # 2 unwind codes, %rbp is FP ".byte 4,0x53", # mov %rsp, %rbp ".byte 1,0x50", # push %rbp ".long 0,0" # pad to keep objdump happy ; } } else { if ($current_function->{abi} eq "svr4") { push @xdata_seg, ".byte 1,0,5,0x0b", # 5 unwind codes, %r11 is FP ".byte 0,0x74,1,0", # %rdi at 8(%rsp) ".byte 0,0x64,2,0", # %rsi at 16(%rsp) ".byte 0,0xb3", # set frame pointer ".byte 0,0", # padding ".long 0,0" # pad to keep objdump happy ; } else { push @xdata_seg, ".byte 1,0,1,0x0b", # 1 unwind code, %r11 is FP ".byte 0,0xb3", # set frame pointer ".byte 0,0", # padding ".long 0,0" # pad to keep objdump happy ; } } push @pdata_seg, ".rva .LSEH_body_${fname}", ".rva .LSEH_epilogue_${fname}", ".rva .LSEH_info_${fname}_body",""; push @xdata_seg,".LSEH_info_${fname}_body:", xdata(); $ret .= "${decor}SEH_body_${fname}${colon}\n"; } elsif ($ret eq ".epilogue") { %saved_regs = (); $cfa_rsp = $cfa_off; $ret = "${decor}SEH_epilogue_${fname}${colon}\n"; if ($current_function->{abi} eq "svr4") { $saved_regs{"%rdi"} = 0; # relative to CFA, remember? $saved_regs{"%rsi"} = 8; push @pdata_seg, ".rva .LSEH_epilogue_${fname}", ".rva .LSEH_end_${fname}", ".rva .LSEH_info_${fname}_epilogue",""; push @xdata_seg,".LSEH_info_${fname}_epilogue:", xdata(), ""; if ($gas) { $ret .= " mov ".(0-$off)."(%$reg),%rdi\n"; $ret .= " mov ".(8-$off)."(%$reg),%rsi\n"; } else { $ret .= " mov rdi,QWORD$PTR\[".(0-$off)."+$reg\]"; $ret .= " ;WIN64 epilogue\n"; $ret .= " mov rsi,QWORD$PTR\[".(8-$off)."+$reg\]\n"; } } } return $ret; } return; } } { package directive; # pick up directives, which start with . 
sub re { my ($class, $line) = @_; my $self = {}; my $ret; my $dir; # chain-call to cfi_directive $ret = cfi_directive->re($line) and return $ret; if ($$line =~ /^\s*(\.\w+)/) { bless $self,$class; $dir = $1; $ret = $self; undef $self->{value}; $$line = substr($$line,@+[0]); $$line =~ s/^\s+//; SWITCH: for ($dir) { /\.global|\.globl|\.extern|\.comm/ && do { $$line =~ s/([_a-z][_a-z0-9\$]*)/$prefix\1/gi; $globals{$1} = $prefix.$1 if ($1); last; }; /\.type/ && do { my ($sym,$type,$narg,$unwind) = split(',',$$line); if ($type eq "\@function") { undef $current_function; $current_function->{name} = $sym; $current_function->{abi} = "svr4"; $current_function->{narg} = $narg; $current_function->{scope} = defined($globals{$sym})?"PUBLIC":"PRIVATE"; $current_function->{unwind} = $unwind; $current_function->{pc} = -1; } elsif ($type eq "\@abi-omnipotent") { undef $current_function; $current_function->{name} = $sym; $current_function->{scope} = defined($globals{$sym})?"PUBLIC":"PRIVATE"; $current_function->{unwind} = $unwind; $current_function->{pc} = -1; } $$line =~ s/\@abi\-omnipotent/\@function/; $$line =~ s/\@function.*/\@function/; last; }; /\.asciz/ && do { if ($$line =~ /^"(.*)"$/) { $dir = ".byte"; $$line = join(",",unpack("C*",$1),0); } last; }; /\.rva|\.long|\.quad/ && do { $$line =~ s/([_a-z][_a-z0-9\$]*)/$globals{$1} or $1/gei; $$line =~ s/\.L/$decor/g; last; }; } if ($gas) { $self->{value} = $dir . "\t" . $$line; if ($dir =~ /\.extern/) { $self->{value} = ""; # swallow extern } elsif (!$elf && $dir =~ /\.type/) { $self->{value} = ""; $self->{value} = ".def\t" . ($globals{$1} or $1) . ";\t" . (defined($globals{$1})?".scl 2;":".scl 3;") . "\t.type 32;\t.endef" if ($win64 && $$line =~ /([^,]+),\@function/); } elsif ($dir =~ /\.size/) { $self->{value} = "" if (!$elf); if ($dwarf and my $endproc = cfi_directive::endproc()) { $self->{value} = ".cfi_$endproc\n$self->{value}"; } elsif (!$elf && defined($current_function)) { $self->{value} .= "${decor}SEH_end_$current_function->{name}:" if ($win64 && $current_function->{abi} eq "svr4"); undef $current_function; } } elsif (!$elf && $dir =~ /\.align/) { $self->{value} = ".p2align\t" . (log($$line)/log(2)); } elsif ($dir eq ".section") { $current_segment=$$line; if (!$elf && $current_segment eq ".init") { if ($flavour eq "macosx") { $self->{value} = ".mod_init_func"; } elsif ($flavour eq "mingw64") { $self->{value} = ".section\t.ctors"; } } if (!$elf && $current_segment eq ".rodata") { if ($flavour eq "macosx") { $self->{value} = ".section\t__TEXT,__const"; } elsif ($flavour eq "mingw64") { $self->{value} = ".section\t.rdata"; } } } elsif ($dir =~ /\.(text|data)/) { $current_segment=".$1"; } elsif ($dir =~ /\.hidden/) { if ($flavour eq "macosx") { $self->{value} = ".private_extern\t$prefix$$line"; } elsif ($flavour eq "mingw64") { $self->{value} = ""; } } elsif ($dir =~ /\.comm/) { $self->{value} = "$dir\t$$line"; $self->{value} =~ s|,([0-9]+),([0-9]+)$|",$1,".log($2)/log(2)|e if ($flavour eq "macosx"); } $$line = ""; return $self; } # non-gas case or nasm/masm SWITCH: for ($dir) { /\.text/ && do { my $v=undef; if ($nasm) { $v="section .text code align=64\n"; } else { $v="$current_segment\tENDS\n" if ($current_segment); $current_segment = ".text\$"; $v.="$current_segment\tSEGMENT "; $v.=$masm>=$masmref ? 
"ALIGN(256)" : "PAGE"; $v.=" 'CODE'"; } $self->{value} = $v; last; }; /\.data/ && do { my $v=undef; if ($nasm) { $v="section .data data align=8\n"; } else { $v="$current_segment\tENDS\n" if ($current_segment); $current_segment = "_DATA"; $v.="$current_segment\tSEGMENT"; } $self->{value} = $v; last; }; /\.section/ && do { my $v=undef; $$line =~ s/([^,]*).*/$1/; $$line = ".CRT\$XCU" if ($$line eq ".init"); $$line = ".rdata" if ($$line eq ".rodata"); my %align = ( p=>4, x=>8, r=>256); if ($nasm) { $v="section $$line"; if ($$line=~/\.([pxr])data/) { $v.=" rdata align=$align{$1}"; } elsif ($$line=~/\.CRT\$/i) { $v.=" rdata align=8"; } } else { $v="$current_segment\tENDS\n" if ($current_segment); $v.="$$line\tSEGMENT"; if ($$line=~/\.([pxr])data/) { $v.=" READONLY"; $v.=" ALIGN($align{$1})" if ($masm>=$masmref); } elsif ($$line=~/\.CRT\$/i) { $v.=" READONLY "; $v.=$masm>=$masmref ? "ALIGN(8)" : "DWORD"; } } $current_segment = $$line; $self->{value} = $v; last; }; /\.extern/ && do { $self->{value} = "EXTERN\t".$$line; $self->{value} .= ":NEAR" if ($masm); last; }; /\.globl|.global/ && do { $self->{value} = $masm?"PUBLIC":"global"; $self->{value} .= "\t".$$line; last; }; /\.size/ && do { if (defined($current_function)) { undef $self->{value}; if ($current_function->{abi} eq "svr4") { $self->{value}="${decor}SEH_end_$current_function->{name}${colon}\n"; } $self->{value}.="$current_function->{name}\tENDP" if($masm && $current_function->{name}); undef $current_function; } last; }; /\.align/ && do { my $max = ($masm && $masm>=$masmref) ? 256 : 4096; $self->{value} = "ALIGN\t".($$line>$max?$max:$$line); last; }; /\.(value|long|rva|quad)/ && do { my $sz = substr($1,0,1); my @arr = split(/,\s*/,$$line); my $last = pop(@arr); my $conv = sub { my $var=shift; $var=~s/^(0b[0-1]+)/oct($1)/eig; $var=~s/^0x([0-9a-f]+)/0$1h/ig if ($masm); if ($sz eq "D" && ($current_segment=~/.[px]data/ || $dir eq ".rva")) { $var=~s/^([_a-z\$\@][_a-z0-9\$\@]*)/$nasm?"$1 wrt ..imagebase":"imagerel $1"/egi; } $var; }; $sz =~ tr/bvlrq/BWDDQ/; $self->{value} = "\tD$sz\t"; for (@arr) { $self->{value} .= &$conv($_).","; } $self->{value} .= &$conv($last); last; }; /\.byte/ && do { my @str=split(/,\s*/,$$line); map(s/(0b[0-1]+)/oct($1)/eig,@str); map(s/0x([0-9a-f]+)/0$1h/ig,@str) if ($masm); while ($#str>15) { $self->{value}.="DB\t" .join(",",@str[0..15])."\n"; foreach (0..15) { shift @str; } } $self->{value}.="DB\t" .join(",",@str) if (@str); last; }; /\.comm/ && do { my @str=split(/,\s*/,$$line); my $v=undef; if ($nasm) { $v.="common $prefix@str[0] @str[1]"; } else { $v="$current_segment\tENDS\n" if ($current_segment); $current_segment = "_DATA"; $v.="$current_segment\tSEGMENT\n"; $v.="COMM @str[0]:DWORD:".@str[1]/4; } $self->{value} = $v; last; }; } $$line = ""; } $ret; } sub out { my $self = shift; $self->{value}; } } # Upon initial x86_64 introduction SSE>2 extensions were not introduced # yet. In order not to be bothered by tracing exact assembler versions, # but at the same time to provide a bare security minimum of AES-NI, we # hard-code some instructions. Extensions past AES-NI on the other hand # are traced by examining assembler version in individual perlasm # modules... 
my %regrm = ( "%eax"=>0, "%ecx"=>1, "%edx"=>2, "%ebx"=>3, "%esp"=>4, "%ebp"=>5, "%esi"=>6, "%edi"=>7 ); sub rex { my $opcode=shift; my ($dst,$src,$rex)=@_; $rex|=0x04 if($dst>=8); $rex|=0x01 if($src>=8); push @$opcode,($rex|0x40) if ($rex); } my $movq = sub { # elderly gas can't handle inter-register movq my $arg = shift; my @opcode=(0x66); if ($arg =~ /%xmm([0-9]+),\s*%r(\w+)/) { my ($src,$dst)=($1,$2); if ($dst !~ /[0-9]+/) { $dst = $regrm{"%e$dst"}; } rex(\@opcode,$src,$dst,0x8); push @opcode,0x0f,0x7e; push @opcode,0xc0|(($src&7)<<3)|($dst&7); # ModR/M @opcode; } elsif ($arg =~ /%r(\w+),\s*%xmm([0-9]+)/) { my ($src,$dst)=($2,$1); if ($dst !~ /[0-9]+/) { $dst = $regrm{"%e$dst"}; } rex(\@opcode,$src,$dst,0x8); push @opcode,0x0f,0x6e; push @opcode,0xc0|(($src&7)<<3)|($dst&7); # ModR/M @opcode; } else { (); } }; my $pextrd = sub { if (shift =~ /\$([0-9]+),\s*%xmm([0-9]+),\s*(%\w+)/) { my @opcode=(0x66); my $imm=$1; my $src=$2; my $dst=$3; if ($dst =~ /%r([0-9]+)d/) { $dst = $1; } elsif ($dst =~ /%e/) { $dst = $regrm{$dst}; } rex(\@opcode,$src,$dst); push @opcode,0x0f,0x3a,0x16; push @opcode,0xc0|(($src&7)<<3)|($dst&7); # ModR/M push @opcode,$imm; @opcode; } else { (); } }; my $pinsrd = sub { if (shift =~ /\$([0-9]+),\s*(%\w+),\s*%xmm([0-9]+)/) { my @opcode=(0x66); my $imm=$1; my $src=$2; my $dst=$3; if ($src =~ /%r([0-9]+)/) { $src = $1; } elsif ($src =~ /%e/) { $src = $regrm{$src}; } rex(\@opcode,$dst,$src); push @opcode,0x0f,0x3a,0x22; push @opcode,0xc0|(($dst&7)<<3)|($src&7); # ModR/M push @opcode,$imm; @opcode; } else { (); } }; my $pshufb = sub { if (shift =~ /%xmm([0-9]+),\s*%xmm([0-9]+)/) { my @opcode=(0x66); rex(\@opcode,$2,$1); push @opcode,0x0f,0x38,0x00; push @opcode,0xc0|($1&7)|(($2&7)<<3); # ModR/M @opcode; } else { (); } }; my $palignr = sub { if (shift =~ /\$([0-9]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/) { my @opcode=(0x66); rex(\@opcode,$3,$2); push @opcode,0x0f,0x3a,0x0f; push @opcode,0xc0|($2&7)|(($3&7)<<3); # ModR/M push @opcode,$1; @opcode; } else { (); } }; my $pclmulqdq = sub { if (shift =~ /\$([x0-9a-f]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/) { my @opcode=(0x66); rex(\@opcode,$3,$2); push @opcode,0x0f,0x3a,0x44; push @opcode,0xc0|($2&7)|(($3&7)<<3); # ModR/M my $c=$1; push @opcode,$c=~/^0/?oct($c):$c; @opcode; } else { (); } }; my $rdrand = sub { if (shift =~ /%[er](\w+)/) { my @opcode=(); my $dst=$1; if ($dst !~ /[0-9]+/) { $dst = $regrm{"%e$dst"}; } rex(\@opcode,0,$dst,8); push @opcode,0x0f,0xc7,0xf0|($dst&7); @opcode; } else { (); } }; my $rdseed = sub { if (shift =~ /%[er](\w+)/) { my @opcode=(); my $dst=$1; if ($dst !~ /[0-9]+/) { $dst = $regrm{"%e$dst"}; } rex(\@opcode,0,$dst,8); push @opcode,0x0f,0xc7,0xf8|($dst&7); @opcode; } else { (); } }; # Not all AVX-capable assemblers recognize AMD XOP extension. Since we # are using only two instructions hand-code them in order to be excused # from chasing assembler versions... 
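# Hand-worked illustration for orientation (not authoritative output):
# "vprotd $29,%xmm0,%xmm1" encodes via rxb() below as
#
#   .byte 0x8f,0xe8,0x78,0xc2,0xc8,29
#
# XOP escape 0x8f, then 0xe8 carrying the inverted RXB bits plus map
# select 01000, 0x78 for W/vvvv/L/pp, opcode 0xc2 (vprotd), ModR/M 0xc8
# (mod=11, reg=xmm1, rm=xmm0) and the rotate count as immediate.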
sub rxb { my $opcode=shift; my ($dst,$src1,$src2,$rxb)=@_; $rxb|=0x7<<5; $rxb&=~(0x04<<5) if($dst>=8); $rxb&=~(0x01<<5) if($src1>=8); $rxb&=~(0x02<<5) if($src2>=8); push @$opcode,$rxb; } my $vprotd = sub { if (shift =~ /\$([x0-9a-f]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/) { my @opcode=(0x8f); rxb(\@opcode,$3,$2,-1,0x08); push @opcode,0x78,0xc2; push @opcode,0xc0|($2&7)|(($3&7)<<3); # ModR/M my $c=$1; push @opcode,$c=~/^0/?oct($c):$c; @opcode; } else { (); } }; my $vprotq = sub { if (shift =~ /\$([x0-9a-f]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/) { my @opcode=(0x8f); rxb(\@opcode,$3,$2,-1,0x08); push @opcode,0x78,0xc3; push @opcode,0xc0|($2&7)|(($3&7)<<3); # ModR/M my $c=$1; push @opcode,$c=~/^0/?oct($c):$c; @opcode; } else { (); } }; # Intel Control-flow Enforcement Technology extension. All functions and # indirect branch targets will have to start with this instruction... # However, it should not be used in functions' prologues explicitly, as # it's added automatically [and in the right spot]. Which leaves only # non-function indirect branch targets, such as in a case-like dispatch # table, as application area. my $endbr64 = sub { (0xf3,0x0f,0x1e,0xfa); }; ######################################################################## my $preproc_prefix = "#"; if ($nasm) { $preproc_prefix = "%"; print <<___; default rel %define XMMWORD %define YMMWORD %define ZMMWORD ___ } elsif ($masm) { $preproc_prefix = ""; print <<___; OPTION DOTNAME ___ } sub process { my $line = shift; $line =~ s|\R$||; # Better chomp if ($line =~ m/^#\s*(if|elif|else|endif)(.*)/) { # pass through preproc if ($win64 && $current_function->{abi} eq "svr4" && $current_function->{narg} >= 0) { print label::win64_args(); } print $preproc_prefix,$1,$2,"\n"; next; } if ($line =~ m|#\s*__SGX_LVI_HARDENING_CLOBBER__=(?:%?(r\w+))|) { $ret_clobber = $1; } $line =~ s|[#!].*$||; # get rid of asm-style comments... $line =~ s|/\*.*\*/||; # ... and C-style comments... $line =~ s|^\s+||; # ... and skip white spaces in beginning $line =~ s|\s+$||; # ... and at the end if (my $label=label->re(\$line)) { print $label->out(); } if (my $directive=directive->re(\$line)) { printf "%s",$directive->out(); } elsif (my $opcode=opcode->re(\$line)) { my $asm = eval("\$".$opcode->mnemonic()); if ((ref($asm) eq 'CODE') && scalar(my @bytes=&$asm($line))) { print $gas?".byte\t":"DB\t",join(',',@bytes),"\n"; next; } my @args; ARGUMENT: while (1) { my $arg; ($arg=register->re(\$line, $opcode))|| ($arg=const->re(\$line)) || ($arg=ea->re(\$line, $opcode)) || ($arg=expr->re(\$line, $opcode)) || last ARGUMENT; push @args,$arg; last ARGUMENT if ($line !~ /^,/); $line =~ s/^,\s*//; } # ARGUMENT: if ($win64 && $current_function->{abi} eq "svr4" && $current_function->{narg} >= 0) { my $pc = $current_function->{pc}; my $op = $opcode->{op}; my $a0 = @args[0]->{value} if ($#args>=0); if (!$current_function->{unwind} || $pc == 0 && !($op eq "push" && $a0 eq "rbp") || $pc == 1 && !($op eq "mov" && $a0 eq "rsp" && @args[1]->{value} eq "rbp" && ($current_function->{unwind} = "%rbp")) || $pc > 1) { print label::win64_args(); } } if ($#args>=0) { my $insn; my $sz=$opcode->size(); if ($gas) { $insn = $opcode->out($#args>=1?$args[$#args]->size():$sz); @args = map($_->out($sz),@args); printf "\t%s\t%s",$insn,join(",",@args); } else { $insn = $opcode->out(); foreach (@args) { my $arg = $_->out(); # $insn.=$sz compensates for movq, pinsrw, ... 
if ($arg =~ /^xmm[0-9]+$/) { $insn.=$sz; $sz="x" if(!$sz); last; } if ($arg =~ /^ymm[0-9]+$/) { $insn.=$sz; $sz="y" if(!$sz); last; } if ($arg =~ /^zmm[0-9]+$/) { $insn.=$sz; $sz="z" if(!$sz); last; } if ($arg =~ /^mm[0-9]+$/) { $insn.=$sz; $sz="q" if(!$sz); last; } } @args = reverse(@args); undef $sz if ($nasm && $opcode->mnemonic() eq "lea"); printf "\t%s\t%s",$insn,join(",",map($_->out($sz),@args)); } } else { printf "\t%s",$opcode->out(); } ++$current_function->{pc} if (defined($current_function)); } print $line,"\n"; } while(<>) { process($_); } map { process($_) } @pdata_seg if ($win64 && $#pdata_seg>1); map { process($_) } @xdata_seg if ($win64 && $#xdata_seg>1); # platform-specific epilogue if ($masm) { print "\n$current_segment\tENDS\n" if ($current_segment); print "END\n"; } elsif ($elf) { # -fcf-protection segment, snatched from compiler -S output my $align = ($flavour =~ /elf32/) ? 4 : 8; print <<___; .section .note.GNU-stack,"",\@progbits #ifndef __SGX_LVI_HARDENING__ .section .note.gnu.property,"a",\@note .long 4,2f-1f,5 .byte 0x47,0x4E,0x55,0 1: .long 0xc0000002,4,3 .align $align 2: #endif ___ } close STDOUT; ################################################# # Cross-reference x86_64 ABI "card" # # Unix Win64 # %rax * * # %rbx - - # %rcx #4 #1 # %rdx #3 #2 # %rsi #2 - # %rdi #1 - # %rbp - - # %rsp - - # %r8 #5 #3 # %r9 #6 #4 # %r10 * * # %r11 * * # %r12 - - # %r13 - - # %r14 - - # %r15 - - # # (*) volatile register # (-) preserved by callee # (#) Nth argument, volatile # # In Unix terms top of stack is argument transfer area for arguments # which could not be accommodated in registers. Or in other words 7th # [integer] argument resides at 8(%rsp) upon function entry point. # 128 bytes above %rsp constitute a "red zone" which is not touched # by signal handlers and can be used as temporal storage without # allocating a frame. # # In Win64 terms N*8 bytes on top of stack is argument transfer area, # which belongs to/can be overwritten by callee. N is the number of # arguments passed to callee, *but* not less than 4! This means that # upon function entry point 5th argument resides at 40(%rsp), as well # as that 32 bytes from 8(%rsp) can always be used as temporal # storage [without allocating a frame]. One can actually argue that # one can assume a "red zone" above stack pointer under Win64 as well. # Point is that at apparently no occasion Windows kernel would alter # the area above user stack pointer in true asynchronous manner... # # All the above means that if assembler programmer adheres to Unix # register and stack layout, but disregards the "red zone" existence, # it's possible to use following prologue and epilogue to "gear" from # Unix to Win64 ABI in leaf functions with not more than 6 arguments. # # omnipotent_function: # ifdef WIN64 # movq %rdi,8(%rsp) # movq %rsi,16(%rsp) # movq %rcx,%rdi ; if 1st argument is actually present # movq %rdx,%rsi ; if 2nd argument is actually ... # movq %r8,%rdx ; if 3rd argument is ... # movq %r9,%rcx ; if 4th argument ... # movq 40(%rsp),%r8 ; if 5th ... # movq 48(%rsp),%r9 ; if 6th ... # endif # ... # ifdef WIN64 # movq 8(%rsp),%rdi # movq 16(%rsp),%rsi # endif # ret # ################################################# # Win64 SEH, Structured Exception Handling. 
# # Unlike on Unix systems(*) lack of Win64 stack unwinding information # has undesired side-effect at run-time: if an exception is raised in # assembler subroutine such as those in question (basically we're # referring to segmentation violations caused by malformed input # parameters), the application is briskly terminated without invoking # any exception handlers, most notably without generating memory dump # or any user notification whatsoever. This poses a problem. It's # possible to address it by registering custom language-specific # handler that would restore processor context to the state at # subroutine entry point and return "exception is not handled, keep # unwinding" code. Writing such handler can be a challenge... But it's # doable, though requires certain coding convention. Consider following # snippet: # # .type function,@function # function: # movq %rsp,%rax # copy rsp to volatile register # pushq %r15 # save non-volatile registers # pushq %rbx # pushq %rbp # movq %rsp,%r11 # subq %rdi,%r11 # prepare [variable] stack frame # andq $-64,%r11 # movq %rax,0(%r11) # check for exceptions # movq %r11,%rsp # allocate [variable] stack frame # movq %rax,0(%rsp) # save original rsp value # magic_point: # ... # movq 0(%rsp),%rcx # pull original rsp value # movq -24(%rcx),%rbp # restore non-volatile registers # movq -16(%rcx),%rbx # movq -8(%rcx),%r15 # movq %rcx,%rsp # restore original rsp # magic_epilogue: # ret # .size function,.-function # # The key is that up to magic_point copy of original rsp value remains # in chosen volatile register and no non-volatile register, except for # rsp, is modified. While past magic_point rsp remains constant till # the very end of the function. In this case custom language-specific # exception handler would look like this: # # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, # CONTEXT *context,DISPATCHER_CONTEXT *disp) # { ULONG64 *rsp = (ULONG64 *)context->Rax; # ULONG64 rip = context->Rip; # # if (rip >= magic_point) # { rsp = (ULONG64 *)context->Rsp; # if (rip < magic_epilogue) # { rsp = (ULONG64 *)rsp[0]; # context->Rbp = rsp[-3]; # context->Rbx = rsp[-2]; # context->R15 = rsp[-1]; # } # } # context->Rsp = (ULONG64)rsp; # context->Rdi = rsp[1]; # context->Rsi = rsp[2]; # # memcpy (disp->ContextRecord,context,sizeof(CONTEXT)); # RtlVirtualUnwind(UNW_FLAG_NHANDLER,disp->ImageBase, # disp->ControlPc,disp->FunctionEntry,disp->ContextRecord, # &disp->HandlerData,&disp->EstablisherFrame,NULL); # return ExceptionContinueSearch; # } # # It's appropriate to implement this handler in assembler, directly in # function's module. In order to do that one has to know members' # offsets in CONTEXT and DISPATCHER_CONTEXT structures and some constant # values.
Here they are: # # CONTEXT.Rax 120 # CONTEXT.Rcx 128 # CONTEXT.Rdx 136 # CONTEXT.Rbx 144 # CONTEXT.Rsp 152 # CONTEXT.Rbp 160 # CONTEXT.Rsi 168 # CONTEXT.Rdi 176 # CONTEXT.R8 184 # CONTEXT.R9 192 # CONTEXT.R10 200 # CONTEXT.R11 208 # CONTEXT.R12 216 # CONTEXT.R13 224 # CONTEXT.R14 232 # CONTEXT.R15 240 # CONTEXT.Rip 248 # CONTEXT.Xmm6 512 # sizeof(CONTEXT) 1232 # DISPATCHER_CONTEXT.ControlPc 0 # DISPATCHER_CONTEXT.ImageBase 8 # DISPATCHER_CONTEXT.FunctionEntry 16 # DISPATCHER_CONTEXT.EstablisherFrame 24 # DISPATCHER_CONTEXT.TargetIp 32 # DISPATCHER_CONTEXT.ContextRecord 40 # DISPATCHER_CONTEXT.LanguageHandler 48 # DISPATCHER_CONTEXT.HandlerData 56 # UNW_FLAG_NHANDLER 0 # ExceptionContinueSearch 1 # # In order to tie the handler to the function one has to compose # couple of structures: one for .xdata segment and one for .pdata. # # UNWIND_INFO structure for .xdata segment would be # # function_unwind_info: # .byte 9,0,0,0 # .rva handler # # This structure designates exception handler for a function with # zero-length prologue, no stack frame or frame register. # # To facilitate composing of .pdata structures, auto-generated "gear" # prologue copies rsp value to rax and denotes next instruction with # .LSEH_begin_{function_name} label. This essentially defines the SEH # styling rule mentioned in the beginning. Position of this label is # chosen in such manner that possible exceptions raised in the "gear" # prologue would be accounted to caller and unwound from latter's frame. # End of function is marked with respective .LSEH_end_{function_name} # label. To summarize, .pdata segment would contain # # .rva .LSEH_begin_function # .rva .LSEH_end_function # .rva function_unwind_info # # Reference to function_unwind_info from .xdata segment is the anchor. # In case you wonder why references are 32-bit .rvas and not 64-bit # .quads. References put into these two segments are required to be # *relative* to the base address of the current binary module, a.k.a. # image base. No Win64 module, be it .exe or .dll, can be larger than # 2GB and thus such relative references can be and are accommodated in # 32 bits. # # Having reviewed the example function code, one can argue that "movq # %rsp,%rax" above is redundant. It is not! Keep in mind that on Unix # rax would contain an undefined value. If this "offends" you, use # another register and refrain from modifying rax till magic_point is # reached, i.e. as if it was a non-volatile register. If more registers # are required prior [variable] frame setup is completed, note that # nobody says that you can have only one "magic point." You can # "liberate" non-volatile registers by denoting last stack off-load # instruction and reflecting it in finer grade unwind logic in handler. # After all, isn't it why it's called *language-specific* handler... # # SE handlers are also involved in unwinding stack when executable is # profiled or debugged. Profiling implies additional limitations that # are too subtle to discuss here. For now it's sufficient to say that # in order to simplify handlers one should either a) offload original # %rsp to stack (like discussed above); or b) if you have a register to # spare for frame pointer, choose volatile one. # # (*) Note that we're talking about run-time, not debug-time. Lack of # unwind information makes debugging hard on both Windows and # Unix. "Unlike" refers to the fact that on Unix signal handler # will always be invoked, core dumped and appropriate exit code # returned to parent (for user notification). 
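# To decode the four bytes in the function_unwind_info example earlier
# (per Microsoft's documented UNWIND_INFO layout; the decode is ours,
# not part of the original text): the leading byte 9 packs version 1 in
# bits 0-2 and UNW_FLAG_EHANDLER (1) in bits 3-7, while the remaining
# zeros stand for prologue size, unwind-code count and frame
# register/offset respectively -- hence "zero-length prologue, no stack
# frame or frame register."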
# ######################################################################## # As of May 2020 an alternative approach that works with both exceptions # and debugging/profiling was implemented by re-purposing DWARF .cfi # annotations even for Win64 unwind tables' generation. Unfortunately, # but not really unexpectedly, it imposes additional limitations on # coding style. Probably the most significant limitation is that the # frame pointer has to be at 16*n distance from the stack pointer at the # exit from prologue. But first things first. There are two additional # synthetic .cfi directives, .cfi_end_prologue and .cfi_epilogue, # that need to be added to all functions marked with additional .type # tag (see example below). There are "do's and don'ts" for prologue # and epilogue. It shouldn't come as a surprise that in prologue one may # not modify non-volatile registers, but one may not modify %r11 either. # This is because it's used as a temporary frame pointer(*). There are # two exceptions to this rule. 1) One can set up a non-volatile register # or %r11 as a frame pointer, but it must be last instruction in the # prologue. 2) One can use 'push %rbp' as first instruction immediately # followed by 'mov %rsp,%rbp' to use %rbp as "legacy" frame pointer. # Constraints for epilogue, or rather on its boundary, depend on whether # the frame is fixed- or variable-length. In fixed-frame subroutine # stack pointer has to be restored in the last instruction prior to the # .cfi_epilogue directive. If it's a variable-frame subroutine, and a # non-volatile register was used as a frame pointer, then the last # instruction prior to the directive has to restore its original value. # This means that final stack pointer adjustment would have to be # pushed past the directive. Normally this would render the epilogue # non-unwindable, so special care has to be taken. To resolve the # dilemma, copy the frame pointer to a volatile register in advance. # To give an example: # # .type rbp_as_frame_pointer,\@function,3,"unwind" # mind extra tag! # rbp_as_frame_pointer: # .cfi_startproc # push %rbp # .cfi_push %rbp # push %rbx # .cfi_push %rbx # mov %rsp,%rbp # last instruction in prologue # .cfi_def_cfa_register %rbp # %rsp-%rbp has to be 16*n, e.g. 16*0 # .cfi_end_prologue # sub \$40,%rsp # and \$-64,%rsp # ... # mov %rbp,%r11 # .cfi_def_cfa_register %r11 # copy frame pointer to volatile %r11 # mov 0(%rbp),%rbx # mov 8(%rbp),%rbp # last instruction prior epilogue # .cfi_epilogue # may not change %r11 in epilogue # lea 16(%r11),%rsp # ret # .cfi_endproc # .size rbp_as_frame_pointer,.-rbp_as_frame_pointer # # An example of "legacy" frame pointer: # # .type legacy_frame_pointer,\@function,3,"unwind" # mind extra tag! # legacy_frame_pointer: # .cfi_startproc # push %rbp # .cfi_push %rbp # mov %rsp,%rbp # .cfi_def_cfa_register %rbp # push %rbx # .cfi_push %rbx # sub \$40,%rsp # .cfi_alloca 40 # .cfi_end_prologue # %rsp-%rbp has to be 16*n # and \$-64,%rsp # ... # mov -8(%rbp),%rbx # mov %rbp,%rsp # .cfi_def_cfa_register %rsp # pop %rbp # recognized by Windows # .cfi_pop %rbp # .cfi_epilogue # ret # .cfi_endproc # .size legacy_frame_pointer,.-legacy_frame_pointer # # To give an example of fixed-frame subroutine for reference: # # .type fixed_frame,\@function,3,"unwind" # mind extra tag! # fixed_frame: # .cfi_startproc # push %rbp # .cfi_push %rbp # push %rbx # .cfi_push %rbx # sub \$40,%rsp # .cfi_adjust_cfa_offset 40 # .cfi_end_prologue # ...
#	mov	40(%rsp),%rbx
#	mov	48(%rsp),%rbp
#	lea	56(%rsp),%rsp
#	.cfi_adjust_cfa_offset	-56
#	.cfi_epilogue
#	ret
#	.cfi_endproc
# .size	fixed_frame,.-fixed_frame
#
# As for the epilogue itself, one can only work on non-volatile
# registers. "Non-volatile" in the "Windows" sense, i.e. minus %rdi and
# %rsi.
#
# On a final note, mixing old-style and modernized subroutines in the
# same file takes some trickery. The ones of the new kind have to appear
# after the old-style ones. This has everything to do with the fact that
# entries in the .pdata segment have to appear in strictly the same
# order as the corresponding subroutines, and auto-generated
# RUNTIME_FUNCTION structures get mechanically appended to whatever
# .pdata already exists.
#
# (*) Just in case you wonder why %r11 and not %rax: this has everything
#     to do with the way UNWIND_INFO is encoded, one just can't
#     designate %rax as the frame pointer.


================================================
FILE: src/blst_t.hpp
================================================
// Copyright Supranational LLC
// Licensed under the Apache License, Version 2.0, see LICENSE for details.
// SPDX-License-Identifier: Apache-2.0

#ifndef __BLST_T_HPP__
#define __BLST_T_HPP__

/*
 * These templates, blst_384_t and blst_256_t, allow one to instantiate
 * slim C++ shims to blst assembly with arbitrary moduli. Well, not
 * literally arbitrary, as there is a limitation: specifically, a
 * 256-bit modulus has to be not larger than 2^256-2^192-1.
 */

#ifdef __GNUC__
# pragma GCC diagnostic push
# pragma GCC diagnostic ignored "-Wunused-function"
#endif
extern "C" {
#include "vect.h"
}
#include "bytes.h"
#undef launder  // avoid conflict with C++ >=17
#ifdef __GNUC__
# pragma GCC diagnostic pop
#endif

#include

static inline void vec_left_align(limb_t *out, const limb_t *inp, size_t N)
{
    const unsigned int nbits = sizeof(inp[0])*8;
    const unsigned int align = (0 - N) % nbits;
    size_t n = (N + nbits - 1) / nbits;

    if (align) {
        limb_t top = inp[n-1] << align;

        while (--n) {
            limb_t next = inp[n-1];
            out[n] = top | next >> (nbits-align);
            top = next << align;
        }
        out[0] = top;
    } else {
        for (size_t i = 0; i < n; i++)
            out[i] = inp[i];
    }
}

template<const size_t N, const vec384 MOD, const limb_t M0,
         const vec384 RR, const vec384 ONE>
class blst_384_t {
private:
    vec384 val;

    inline operator const limb_t*() const { return val; }
    inline operator limb_t*()             { return val; }
    inline limb_t& operator[](size_t i)   { return val[i]; }
    inline const limb_t& operator[](size_t i) const { return val[i]; }

    static const size_t n = sizeof(vec384)/sizeof(limb_t);
public:
    static const size_t nbits = N;
    static constexpr size_t bit_length() { return N; }
    static const unsigned int degree = 1;
    typedef byte pow_t[384/8];
    typedef blst_384_t mem_t;

    inline blst_384_t() {}
    inline blst_384_t(const vec384 p, bool align = false)
    {
        if (align)
            vec_left_align(val, p, N);
        else
            vec_copy(val, p, sizeof(val));
    }
    inline blst_384_t(uint64_t a)
    {
        vec_zero(val, sizeof(val));
        val[0] = a;
        if (a) to();
    }
    inline blst_384_t(int a) : blst_384_t((uint64_t)a) {}

#if defined(__CUDACC__) || defined(__HIPCC__)
# if __cplusplus < 201402L && _MSVC_LANG-0 < 201402L
#  error "C++ >= 14 is required to compile /src/blst_t.hpp for CUDA"
# endif
    template<typename... Ts> constexpr blst_384_t(limb_t a0, Ts...
arr) { limb_t temp[11] = {arr...}; if (sizeof...(arr) < 6) { val[0] = a0; val[1] = temp[0]; val[2] = temp[1]; val[3] = temp[2]; val[4] = temp[3]; val[5] = temp[4]; } else { val[0] = a0 | (temp[0] << 32); val[1] = temp[1] | (temp[2] << 32); val[2] = temp[3] | (temp[4] << 32); val[3] = temp[5] | (temp[6] << 32); val[4] = temp[7] | (temp[8] << 32); val[5] = temp[9] | (temp[10] << 32); } } #else template constexpr blst_384_t(limb_t a0, Ts... arr) : val{a0, arr...} {} #endif inline void to_scalar(pow_t& scalar) const { const union { long one; char little; } is_endian = { 1 }; if ((size_t)scalar%sizeof(limb_t) == 0 && is_endian.little) { from_mont_384((limb_t *)scalar, val, MOD, M0); } else { vec384 out; from_mont_384(out, val, MOD, M0); le_bytes_from_limbs(scalar, out, sizeof(pow_t)); vec_zero(out, sizeof(out)); } } static inline const blst_384_t& one() { return *reinterpret_cast(ONE); } static inline blst_384_t one(bool or_zero) { blst_384_t ret; limb_t mask = ~((limb_t)0 - or_zero); for (size_t i = 0; i < n; i++) ret[i] = ONE[i] & mask; return ret; } inline blst_384_t& to() { mul_mont_384(val, RR, val, MOD, M0); return *this; } inline blst_384_t& from() { from_mont_384(val, val, MOD, M0); return *this; } inline void store(limb_t *p) const { vec_copy(p, val, sizeof(val)); } inline blst_384_t& operator+=(const blst_384_t& b) { add_mod_384(val, val, b, MOD); return *this; } friend inline blst_384_t operator+(const blst_384_t& a, const blst_384_t& b) { blst_384_t ret; add_mod_384(ret, a, b, MOD); return ret; } inline blst_384_t& operator<<=(unsigned l) { lshift_mod_384(val, val, l, MOD); return *this; } friend inline blst_384_t operator<<(const blst_384_t& a, unsigned l) { blst_384_t ret; lshift_mod_384(ret, a, l, MOD); return ret; } inline blst_384_t& operator>>=(unsigned r) { rshift_mod_384(val, val, r, MOD); return *this; } friend inline blst_384_t operator>>(const blst_384_t& a, unsigned r) { blst_384_t ret; rshift_mod_384(ret, a, r, MOD); return ret; } inline blst_384_t& operator-=(const blst_384_t& b) { sub_mod_384(val, val, b, MOD); return *this; } friend inline blst_384_t operator-(const blst_384_t& a, const blst_384_t& b) { blst_384_t ret; sub_mod_384(ret, a, b, MOD); return ret; } inline blst_384_t& cneg(bool flag) { cneg_mod_384(val, val, flag, MOD); return *this; } friend inline blst_384_t cneg(const blst_384_t& a, bool flag) { blst_384_t ret; cneg_mod_384(ret, a, flag, MOD); return ret; } friend inline blst_384_t operator-(const blst_384_t& a) { blst_384_t ret; cneg_mod_384(ret, a, true, MOD); return ret; } inline blst_384_t& operator*=(const blst_384_t& a) { if (this == &a) sqr_mont_384(val, val, MOD, M0); else mul_mont_384(val, val, a, MOD, M0); return *this; } friend inline blst_384_t operator*(const blst_384_t& a, const blst_384_t& b) { blst_384_t ret; if (&a == &b) sqr_mont_384(ret, a, MOD, M0); else mul_mont_384(ret, a, b, MOD, M0); return ret; } // simplified exponentiation, but mind the ^ operator's precedence! 
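    // (illustrative note: binary + binds tighter than ^, so a^2 + b
    // parses as a^(2 + b); write (a^2) + b instead)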
friend inline blst_384_t operator^(const blst_384_t& a, unsigned p) { if (p < 2) { abort(); } else if (p == 2) { blst_384_t ret; sqr_mont_384(ret, a, MOD, M0); return ret; } else { blst_384_t ret = a, sqr = a; if ((p&1) == 0) { do { sqr_mont_384(sqr, sqr, MOD, M0); p >>= 1; } while ((p&1) == 0); ret = sqr; } for (p >>= 1; p; p >>= 1) { sqr_mont_384(sqr, sqr, MOD, M0); if (p&1) mul_mont_384(ret, ret, sqr, MOD, M0); } return ret; } } inline blst_384_t& operator^=(unsigned p) { if (p < 2) { abort(); } else if (p == 2) { sqr_mont_384(val, val, MOD, M0); return *this; } return *this = *this^p; } inline blst_384_t operator()(unsigned p) { return *this^p; } friend inline blst_384_t sqr(const blst_384_t& a) { return a^2; } inline bool is_one() const { return vec_is_equal(val, ONE, sizeof(val)); } inline int is_zero() const { return vec_is_zero(val, sizeof(val)); } inline void zero() { vec_zero(val, sizeof(val)); } friend inline blst_384_t czero(const blst_384_t& a, int set_z) { blst_384_t ret; const vec384 zero = { 0 }; vec_select(ret, zero, a, sizeof(ret), set_z); return ret; } static inline blst_384_t csel(const blst_384_t& a, const blst_384_t& b, int sel_a) { blst_384_t ret; vec_select(ret, a, b, sizeof(ret), sel_a); return ret; } blst_384_t reciprocal() const { static const blst_384_t MODx{MOD, true}; union { vec768 x; vec384 r[2]; } temp; ct_inverse_mod_384(temp.x, val, MOD, MODx); redc_mont_384(temp.r[0], temp.x, MOD, M0); mul_mont_384(temp.r[0], temp.r[0], RR, MOD, M0); return *reinterpret_cast(temp.r[0]); } friend inline blst_384_t operator/(unsigned one, const blst_384_t& a) { if (one == 1) return a.reciprocal(); abort(); } friend inline blst_384_t operator/(const blst_384_t& a, const blst_384_t& b) { return a * b.reciprocal(); } inline blst_384_t& operator/=(const blst_384_t& a) { return *this *= a.reciprocal(); } inline blst_384_t(const char *hexascii) { limbs_from_hexascii(val, sizeof(val), hexascii); to(); } friend inline bool operator==(const blst_384_t& a, const blst_384_t& b) { return vec_is_equal(a, b, sizeof(vec384)); } friend inline bool operator!=(const blst_384_t& a, const blst_384_t& b) { return !vec_is_equal(a, b, sizeof(vec384)); } template friend OStream& operator<<(OStream& os, const blst_384_t& obj) { unsigned char be[sizeof(obj)]; char buf[2+2*sizeof(obj)+1], *str = buf; be_bytes_from_limbs(be, blst_384_t{obj}.from(), sizeof(obj)); *str++ = '0', *str++ = 'x'; for (size_t i = 0; i < sizeof(obj); i++) *str++ = hex_from_nibble(be[i]>>4), *str++ = hex_from_nibble(be[i]); *str = '\0'; return os << buf; } }; template class blst_256_t { vec256 val; inline operator const limb_t*() const { return val; } inline operator limb_t*() { return val; } inline limb_t& operator[](size_t i) { return val[i]; } inline const limb_t& operator[](size_t i) const { return val[i]; } static const size_t n = sizeof(vec256)/sizeof(limb_t); public: static const size_t nbits = N; static constexpr size_t bit_length() { return N; } static const unsigned int degree = 1; typedef byte pow_t[256/8]; typedef blst_256_t mem_t; inline blst_256_t() {} inline blst_256_t(const vec256 p, bool align = false) { if (align) vec_left_align(val, p, N); else vec_copy(val, p, sizeof(val)); } inline blst_256_t(uint64_t a) { vec_zero(val, sizeof(val)); val[0] = a; if (a) to(); } inline blst_256_t(int a) : blst_256_t((uint64_t)a) {} #if defined(__CUDACC__) || defined(__HIPCC__) # if __cplusplus < 201402L && _MSVC_LANG-0 < 201402L # error "C++ >= 14 is required to compile /src/blst_t.hpp for CUDA" # endif template constexpr 
blst_256_t(limb_t a0, Ts... arr) { limb_t temp[7] = {arr...}; if (sizeof...(arr) < 4) { val[0] = a0; val[1] = temp[0]; val[2] = temp[1]; val[3] = temp[2]; } else { val[0] = a0 | (temp[0] << 32); val[1] = temp[1] | (temp[2] << 32); val[2] = temp[3] | (temp[4] << 32); val[3] = temp[5] | (temp[6] << 32); } } #else template constexpr blst_256_t(limb_t a0, Ts... arr) : val{a0, arr...} {} #endif inline void to_scalar(pow_t& scalar) const { const union { long one; char little; } is_endian = { 1 }; if ((size_t)scalar%sizeof(limb_t) == 0 && is_endian.little) { from_mont_256((limb_t *)scalar, val, MOD, M0); } else { vec256 out; from_mont_256(out, val, MOD, M0); le_bytes_from_limbs(scalar, out, sizeof(pow_t)); vec_zero(out, sizeof(out)); } } static inline const blst_256_t& one() { return *reinterpret_cast(ONE); } static inline blst_256_t one(bool or_zero) { blst_256_t ret; limb_t mask = ~((limb_t)0 - or_zero); for (size_t i = 0; i < n; i++) ret[i] = ONE[i] & mask; return ret; } inline blst_256_t& to() { mul_mont_sparse_256(val, val, RR, MOD, M0); return *this; } inline blst_256_t& to(const uint64_t a[2*n]) { mul_mont_sparse_256(val, RR, (const limb_t*)(a + n), MOD, M0); add_mod_256(val, val, (const limb_t*)a, MOD); mul_mont_sparse_256(val, RR, val, MOD, M0); return *this; } blst_256_t& to(const unsigned char* bytes, size_t n, bool le = false) { vec_zero(val, sizeof(val)); vec256 digit; size_t rem = (n - 1) % 32 + 1; n -= rem; if (le) { limbs_from_le_bytes(val, bytes += n, rem); mul_mont_sparse_256(val, RR, val, MOD, M0); while (n) { limbs_from_le_bytes(digit, bytes -= 32, 32); add_mod_256(val, val, digit, MOD); mul_mont_sparse_256(val, RR, val, MOD, M0); n -= 32; } } else { limbs_from_be_bytes(val, bytes, rem); mul_mont_sparse_256(val, RR, val, MOD, M0); bytes += rem; while (n) { limbs_from_be_bytes(digit, bytes, 32); add_mod_256(val, val, digit, MOD); mul_mont_sparse_256(val, RR, val, MOD, M0); bytes += 32; n -= 32; } } return *this; } inline blst_256_t& from() { from_mont_256(val, val, MOD, M0); return *this; } inline blst_256_t& from(const uint64_t a[2*n]) { redc_mont_256(val, (const limb_t*)a, MOD, M0); mul_mont_sparse_256(val, RR, val, MOD, M0); return *this; } inline blst_256_t& from(const unsigned char *bytes, size_t n, bool le = false) { if (n > 64) return to(bytes, n, le).from(); if (n > 32) { vec512 temp{0}; if (le) limbs_from_le_bytes(temp, bytes, n); else limbs_from_be_bytes(temp, bytes, n); redc_mont_256(val, temp, MOD, M0); mul_mont_sparse_256(val, RR, val, MOD, M0); } else { vec_zero(val, sizeof(val)); if (le) limbs_from_le_bytes(val, bytes, n); else limbs_from_be_bytes(val, bytes, n); mul_mont_sparse_256(val, ONE, val, MOD, M0); } return *this; } inline void store(limb_t *p) const { vec_copy(p, val, sizeof(val)); } inline blst_256_t& operator+=(const blst_256_t& b) { add_mod_256(val, val, b, MOD); return *this; } friend inline blst_256_t operator+(const blst_256_t& a, const blst_256_t& b) { blst_256_t ret; add_mod_256(ret, a, b, MOD); return ret; } inline blst_256_t& operator<<=(unsigned l) { lshift_mod_256(val, val, l, MOD); return *this; } friend inline blst_256_t operator<<(const blst_256_t& a, unsigned l) { blst_256_t ret; lshift_mod_256(ret, a, l, MOD); return ret; } inline blst_256_t& operator>>=(unsigned r) { rshift_mod_256(val, val, r, MOD); return *this; } friend inline blst_256_t operator>>(const blst_256_t& a, unsigned r) { blst_256_t ret; rshift_mod_256(ret, a, r, MOD); return ret; } inline blst_256_t& operator-=(const blst_256_t& b) { sub_mod_256(val, val, b, MOD); return 
*this; } friend inline blst_256_t operator-(const blst_256_t& a, const blst_256_t& b) { blst_256_t ret; sub_mod_256(ret, a, b, MOD); return ret; } inline blst_256_t& cneg(bool flag) { cneg_mod_256(val, val, flag, MOD); return *this; } friend inline blst_256_t cneg(const blst_256_t& a, bool flag) { blst_256_t ret; cneg_mod_256(ret, a, flag, MOD); return ret; } friend inline blst_256_t operator-(const blst_256_t& a) { blst_256_t ret; cneg_mod_256(ret, a, true, MOD); return ret; } inline blst_256_t& operator*=(const blst_256_t& a) { if (this == &a) sqr_mont_sparse_256(val, val, MOD, M0); else mul_mont_sparse_256(val, val, a, MOD, M0); return *this; } friend inline blst_256_t operator*(const blst_256_t& a, const blst_256_t& b) { blst_256_t ret; if (&a == &b) sqr_mont_sparse_256(ret, a, MOD, M0); else mul_mont_sparse_256(ret, a, b, MOD, M0); return ret; } // simplified exponentiation, but mind the ^ operator's precedence! friend inline blst_256_t operator^(const blst_256_t& a, unsigned p) { if (p < 2) { abort(); } else if (p == 2) { blst_256_t ret; sqr_mont_sparse_256(ret, a, MOD, M0); return ret; } else { blst_256_t ret = a, sqr = a; if ((p&1) == 0) { do { sqr_mont_sparse_256(sqr, sqr, MOD, M0); p >>= 1; } while ((p&1) == 0); ret = sqr; } for (p >>= 1; p; p >>= 1) { sqr_mont_sparse_256(sqr, sqr, MOD, M0); if (p&1) mul_mont_sparse_256(ret, ret, sqr, MOD, M0); } return ret; } } inline blst_256_t& operator^=(unsigned p) { if (p < 2) { abort(); } else if (p == 2) { sqr_mont_sparse_256(val, val, MOD, M0); return *this; } return *this = *this^p; } inline blst_256_t operator()(unsigned p) { return *this^p; } friend inline blst_256_t sqr(const blst_256_t& a) { return a^2; } inline bool is_one() const { return vec_is_equal(val, ONE, sizeof(val)); } inline int is_zero() const { return vec_is_zero(val, sizeof(val)); } inline void zero() { vec_zero(val, sizeof(val)); } friend inline blst_256_t czero(const blst_256_t& a, int set_z) { blst_256_t ret; const vec256 zero = { 0 }; vec_select(ret, zero, a, sizeof(ret), set_z); return ret; } static inline blst_256_t csel(const blst_256_t& a, const blst_256_t& b, int sel_a) { blst_256_t ret; vec_select(ret, a, b, sizeof(ret), sel_a); return ret; } blst_256_t reciprocal() const { static const blst_256_t MODx{MOD, true}; union { vec512 x; vec256 r[2]; } temp; ct_inverse_mod_256(temp.x, val, MOD, MODx); redc_mont_256(temp.r[0], temp.x, MOD, M0); mul_mont_sparse_256(temp.r[0], temp.r[0], RR, MOD, M0); return *reinterpret_cast(temp.r[0]); } friend inline blst_256_t operator/(int one, const blst_256_t& a) { if (one == 1) return a.reciprocal(); abort(); } friend inline blst_256_t operator/(const blst_256_t& a, const blst_256_t& b) { return a * b.reciprocal(); } inline blst_256_t& operator/=(const blst_256_t& a) { return *this *= a.reciprocal(); } inline blst_256_t(const char *hexascii) { limbs_from_hexascii(val, sizeof(val), hexascii); to(); } friend inline bool operator==(const blst_256_t& a, const blst_256_t& b) { return vec_is_equal(a, b, sizeof(vec256)); } friend inline bool operator!=(const blst_256_t& a, const blst_256_t& b) { return !vec_is_equal(a, b, sizeof(vec256)); } template friend OStream& operator<<(OStream& os, const blst_256_t& obj) { unsigned char be[sizeof(obj)]; char buf[2+2*sizeof(obj)+1], *str=buf; be_bytes_from_limbs(be, blst_256_t{obj}.from(), sizeof(obj)); *str++ = '0', *str++ = 'x'; for (size_t i = 0; i < sizeof(obj); i++) *str++ = hex_from_nibble(be[i]>>4), *str++ = hex_from_nibble(be[i]); *str = '\0'; return os << buf; } }; #endif 
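
A minimal usage sketch (illustrative, not part of the repository):
assuming the template parameter lists read (N, MOD, M0, RR, ONE), which
is consistent with how those names are used in the method bodies above,
the BLS12-381 base field could be instantiated and exercised like this:

    // hypothetical instantiation; the parameter order is an assumption
    #include "blst_t.hpp"
    #include "consts.h"     // BLS12_381_P, BLS12_381_RR, ONE_MONT_P

    static const vec384 BLS12_381_ONE = { ONE_MONT_P };
    typedef blst_384_t<381, BLS12_381_P, (limb_t)0x89f3fffcfffcfffd,
                       BLS12_381_RR, BLS12_381_ONE> fp_t;

    void fp_demo()
    {
        fp_t a = 3;                     // 3, converted to Montgomery form
        fp_t b = fp_t(2).reciprocal();  // 1/2 mod P
        fp_t c = a * b + (a^2);         // mind the parentheses around a^2
        c.from();                       // leave Montgomery representation
    }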
================================================
FILE: src/bulk_addition.c
================================================
/*
 * Copyright Supranational LLC
 * Licensed under the Apache License, Version 2.0, see LICENSE for details.
 * SPDX-License-Identifier: Apache-2.0
 */

#include "fields.h"
#include "point.h"

/*
 * This implementation uses explicit addition formula:
 *
 * λ = (Y₂-Y₁)/(X₂-X₁)
 * X₃ = λ²-(X₁+X₂)
 * Y₃ = λ⋅(X₁-X₃)-Y₁
 *
 * But since we don't know if we'll have to add a point to itself, we
 * need to eventually resort to the corresponding doubling formula:
 *
 * λ = 3X₁²/2Y₁
 * X₃ = λ²-2X₁
 * Y₃ = λ⋅(X₁-X₃)-Y₁
 *
 * The formulae use prohibitively expensive inversion, but whenever we
 * have a lot of affine points to accumulate, we can amortize the cost
 * by applying Montgomery's batch inversion approach. As a result,
 * asymptotic[!] per-point cost for addition is as small as 5M+1S. For
 * comparison, ptype##_dadd_affine takes 8M+5S. In practice, all things
 * considered, the improvement coefficient varies from 60% to 85%
 * depending on platform and curve.
 *
 * THIS IMPLEMENTATION IS *NOT* CONSTANT-TIME. [But if there is an
 * application that requires constant time-ness, speak up!]
 */

/*
 * Calculate λ's numerator and denominator.
 *
 * input:  A   x1  y1  -
 *         B   x2  y2  -
 * output:
 * if A!=B:  A   x1     y1     (x2-x1)*mul_acc
 *           B   x2+x1  y2-y1  (x2-x1)
 *
 * if A==B:  A   x      y      2y*mul_acc
 *           B   2x     3*x^2  2y
 *
 * if A==-B: A   0      0      1*mul_acc
 *           B   0      3*x^2  0
 */
#define HEAD(ptype, bits, field, one) \
static void ptype##_head(ptype AB[2], const vec##bits mul_acc) \
{ \
    ptype *A = AB, *B = AB+1; \
    limb_t inf = vec_is_zero(A, sizeof(ptype##_affine)) | \
                 vec_is_zero(B, sizeof(ptype##_affine)); \
    static const vec##bits zero = { 0 }; \
\
    sub_##field(B->Z, B->X, A->X);          /* X2-X1 */ \
    add_##field(B->X, B->X, A->X);          /* X2+X1 */ \
    add_##field(A->Z, B->Y, A->Y);          /* Y2+Y1 */ \
    sub_##field(B->Y, B->Y, A->Y);          /* Y2-Y1 */ \
    if (vec_is_zero(B->Z, sizeof(B->Z))) {  /* X2==X1 */ \
        inf = vec_is_zero(A->Z, sizeof(A->Z)); \
        vec_select(B->X, A->Z, B->X, sizeof(B->X), inf); \
        sqr_##field(B->Y, A->X); \
        mul_by_3_##field(B->Y, B->Y);       /* 3*X1^2 */ \
        vec_copy(B->Z, A->Z, sizeof(B->Z)); /* 2*Y1 */ \
    }                                       /* B->Y is numerator */ \
                                            /* B->Z is denominator */ \
    vec_select(A->X, B->X, A->X, sizeof(A->X), inf); \
    vec_select(A->Y, A->Z, A->Y, sizeof(A->Y), inf); \
    vec_select(A->Z, one, B->Z, sizeof(A->Z), inf); \
    vec_select(B->Z, zero, B->Z, sizeof(B->Z), inf); \
    if (mul_acc != NULL) \
        mul_##field(A->Z, A->Z, mul_acc);   /* chain multiplication */ \
}

/*
 * Calculate λ and resulting coordinates.
 *
 * input:  A  x1     y1         -
 *         B  x2+x1  numerator  -
 *         lambda    1/denominator
 * output: D  x3=(num/den)^2-(x2+x1)  y3=(num/den)(x1-x3)-y1
 */
#define TAIL(ptype, bits, field, one) \
static void ptype##_tail(ptype *D, ptype AB[2], vec##bits lambda) \
{ \
    ptype *A = AB, *B = AB+1; \
    vec##bits llambda; \
    limb_t inf = vec_is_zero(B->Z, sizeof(B->Z)); \
\
    mul_##field(lambda, lambda, B->Y);      /* λ = (Y2-Y1)/(X2-X1) */ \
                                            /* alt. 3*X1^2/2*Y1 */ \
    sqr_##field(llambda, lambda); \
    sub_##field(D->X, llambda, B->X);       /* X3 = λ^2-X1-X2 */ \
\
    sub_##field(D->Y, A->X, D->X); \
    mul_##field(D->Y, D->Y, lambda); \
    sub_##field(D->Y, D->Y, A->Y);          /* Y3 = λ*(X1-X3)-Y1 */ \
\
    vec_select(D->X, A->X, D->X, 2*sizeof(D->X), inf); \
    vec_select(B->Z, one, B->Z, sizeof(B->Z), inf); \
}

/*
 * |points[]| is a volatile buffer with |X|s and |Y|s initially holding
 * input affine coordinates, and with |Z|s being used as additional
 * temporary storage [unrelated to Jacobian coordinates]. |sum| is
 * in-/output, initialize to infinity accordingly.
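 *
 * The amortization works like Montgomery's batch inversion: ptype##_head
 * leaves each pair's denominator zᵢ in B->Z and chains the running
 * product Pᵢ = z₁⋅z₂⋅…⋅zᵢ into A->Z via |mul_acc|. One inversion of the
 * final product Pₙ then yields every 1/zᵢ on the way back down, since
 * 1/zᵢ = Pᵢ₋₁⋅(1/Pᵢ) and 1/Pᵢ₋₁ = zᵢ⋅(1/Pᵢ), i.e. two extra
 * multiplications per pair, which is what the descending loop in
 * ptype##s_accumulate computes.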
*/ #define ADDITION_BTREE(prefix, ptype, bits, field, one) \ HEAD(ptype, bits, field, one) \ TAIL(ptype, bits, field, one) \ static void ptype##s_accumulate(ptype *sum, ptype points[], size_t n) \ { \ ptype *dst; \ void *mul_acc; \ size_t i; \ \ while (n >= 16) { \ if (n & 1) \ ptype##_dadd_affine(sum, sum, (const ptype##_affine *)points++); \ n /= 2; \ for (mul_acc = NULL, i = n; i--; mul_acc = points->Z, points += 2) \ ptype##_head(points, mul_acc); \ \ reciprocal_##field(points[-2].Z, points[-2].Z); /* 1/∏ Zi */ \ \ for (dst = points, i = n; --i;) { \ dst--; points -= 2; \ mul_##field(points[-2].Z, points[0].Z, points[-2].Z); \ ptype##_tail(dst, points, points[-2].Z); \ mul_##field(points[-2].Z, points[0].Z, points[1].Z); \ } \ dst--; points -= 2; \ ptype##_tail(dst, points, points[0].Z); \ points = dst; \ } \ while (n--) \ ptype##_dadd_affine(sum, sum, (const ptype##_affine *)points++); \ } \ \ void prefix##s_add(ptype *sum, const ptype##_affine *const points[], \ size_t npoints) \ { \ const size_t stride = SCRATCH_LIMIT / sizeof(ptype); \ ptype *scratch = alloca((npoints > stride ? stride : npoints) * \ sizeof(ptype)); \ const ptype##_affine *point = NULL; \ \ vec_zero(sum, sizeof(*sum)); \ while (npoints) { \ size_t i, j = npoints > stride ? stride : npoints; \ for (i=0; i> (8 * (n % sizeof(limb_t)))); } } static inline void limbs_from_le_bytes(limb_t *restrict ret, const unsigned char *in, size_t n) { limb_t limb = 0; while(n--) { limb <<= 8; limb |= in[n]; /* * 'if (n % sizeof(limb_t) == 0)' is omitted because it's cheaper * to perform redundant stores than to pay penalty for * mispredicted branch. Besides, some compilers unroll the * loop and remove redundant stores to 'restrict'-ed storage... */ ret[n / sizeof(limb_t)] = limb; } } static inline void le_bytes_from_limbs(unsigned char *out, const limb_t *in, size_t n) { const union { long one; char little; } is_endian = { 1 }; limb_t limb; size_t i, j, r; if ((uptr_t)out == (uptr_t)in && is_endian.little) return; r = n % sizeof(limb_t); n /= sizeof(limb_t); for(i = 0; i < n; i++) { for (limb = in[i], j = 0; j < sizeof(limb_t); j++, limb >>= 8) *out++ = (unsigned char)limb; } if (r) { for (limb = in[i], j = 0; j < r; j++, limb >>= 8) *out++ = (unsigned char)limb; } } static inline char hex_from_nibble(unsigned char nibble) { int mask = (9 - (nibble &= 0xf)) >> 31; return (char)(nibble + ((('a'-10) & mask) | ('0' & ~mask))); } static unsigned char nibble_from_hex(char c) { int mask, ret; mask = (('a'-c-1) & (c-1-'f')) >> 31; ret = (10 + c - 'a') & mask; mask = (('A'-c-1) & (c-1-'F')) >> 31; ret |= (10 + c - 'A') & mask; mask = (('0'-c-1) & (c-1-'9')) >> 31; ret |= (c - '0') & mask; mask = ((ret-1) & ~mask) >> 31; ret |= 16 & mask; return (unsigned char)ret; } static void bytes_from_hexascii(unsigned char *ret, size_t sz, const char *hex) { size_t len; unsigned char b = 0; if (hex[0]=='0' && (hex[1]=='x' || hex[1]=='X')) hex += 2; for (len = 0; len<2*sz && nibble_from_hex(hex[len])<16; len++) ; bytes_zero(ret, sz); while(len--) { b <<= 4; b |= nibble_from_hex(*hex++); if (len % 2 == 0) ret[len / 2] = b; } } static void limbs_from_hexascii(limb_t *ret, size_t sz, const char *hex) { size_t len; limb_t limb = 0; if (hex[0]=='0' && (hex[1]=='x' || hex[1]=='X')) hex += 2; for (len = 0; len<2*sz && nibble_from_hex(hex[len])<16; len++) ; vec_zero(ret, sz); while(len--) { limb <<= 4; limb |= nibble_from_hex(*hex++); if (len % (2*sizeof(limb_t)) == 0) ret[len / (2*sizeof(limb_t))] = limb; } } #endif 
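
The hex helpers above are branchless: range tests are turned into
all-ones/all-zero masks with an arithmetic right shift of the sign bit.
A self-contained C++ illustration of the same trick, mirroring
hex_from_nibble's logic:

    #include <cassert>

    // mask is all-ones iff nibble > 9, i.e. iff a letter digit is needed;
    // relies on >> of a negative int being an arithmetic (sign-filling) shift
    static char hex_digit(unsigned char nibble)
    {
        int mask = (9 - (nibble &= 0xf)) >> 31;
        return (char)(nibble + ((('a' - 10) & mask) | ('0' & ~mask)));
    }

    int main()
    {
        assert(hex_digit(0x3) == '3');
        assert(hex_digit(0xa) == 'a');
        assert(hex_digit(0xf) == 'f');
        return 0;
    }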
================================================ FILE: src/client_min_pk.c ================================================ /* * Copyright Supranational LLC * Licensed under the Apache License, Version 2.0, see LICENSE for details. * SPDX-License-Identifier: Apache-2.0 */ #include "keygen.c" #include "e2.c" #include "hash_to_field.c" #include "map_to_g2.c" #include "e1.c" #include "exp.c" #include "sqrt.c" #include "recip.c" #include "consts.c" #include "vect.c" #include "exports.c" ================================================ FILE: src/client_min_sig.c ================================================ /* * Copyright Supranational LLC * Licensed under the Apache License, Version 2.0, see LICENSE for details. * SPDX-License-Identifier: Apache-2.0 */ #include "keygen.c" #include "e1.c" #include "hash_to_field.c" #include "map_to_g1.c" #include "e2.c" #include "exp.c" #include "sqrt.c" #include "recip.c" #include "consts.c" #include "vect.c" #include "exports.c" ================================================ FILE: src/consts.c ================================================ /* * Copyright Supranational LLC * Licensed under the Apache License, Version 2.0, see LICENSE for details. * SPDX-License-Identifier: Apache-2.0 */ #include "consts.h" /* z = -0xd201000000010000 */ const vec384 BLS12_381_P = { /* (z-1)^2 * (z^4 - z^2 + 1)/3 + z */ TO_LIMB_T(0xb9feffffffffaaab), TO_LIMB_T(0x1eabfffeb153ffff), TO_LIMB_T(0x6730d2a0f6b0f624), TO_LIMB_T(0x64774b84f38512bf), TO_LIMB_T(0x4b1ba7b6434bacd7), TO_LIMB_T(0x1a0111ea397fe69a) }; const limb_t BLS12_381_p0 = (limb_t)0x89f3fffcfffcfffd; /* -1/P */ const radix384 BLS12_381_Rx = { /* (1<<384)%P, "radix", one-in-Montgomery */ { { ONE_MONT_P }, { 0 } } }; const vec384 BLS12_381_RR = { /* (1<<768)%P, "radix"^2, to-Montgomery */ TO_LIMB_T(0xf4df1f341c341746), TO_LIMB_T(0x0a76e6a609d104f1), TO_LIMB_T(0x8de5476c4c95b6d5), TO_LIMB_T(0x67eb88a9939d83c0), TO_LIMB_T(0x9a793e85b519952d), TO_LIMB_T(0x11988fe592cae3aa) }; const vec256 BLS12_381_r = { /* z^4 - z^2 + 1, group order */ TO_LIMB_T(0xffffffff00000001), TO_LIMB_T(0x53bda402fffe5bfe), TO_LIMB_T(0x3339d80809a1d805), TO_LIMB_T(0x73eda753299d7d48) }; const vec256 BLS12_381_rRR = { /* (1<<512)%r, "radix"^2, to-Montgomery */ TO_LIMB_T(0xc999e990f3f29c6d), TO_LIMB_T(0x2b6cedcb87925c23), TO_LIMB_T(0x05d314967254398f), TO_LIMB_T(0x0748d9d99f59ff11) }; ================================================ FILE: src/consts.h ================================================ /* * Copyright Supranational LLC * Licensed under the Apache License, Version 2.0, see LICENSE for details. 
* SPDX-License-Identifier: Apache-2.0 */ #ifndef __BLS12_381_ASM_CONST_H__ #define __BLS12_381_ASM_CONST_H__ #include "vect.h" extern const vec384 BLS12_381_P; extern const limb_t BLS12_381_p0; static const limb_t p0 = (limb_t)0x89f3fffcfffcfffd; /* -1/P */ typedef union { vec384 p12[12]; vec384x p2; vec384 p; } radix384; extern const radix384 BLS12_381_Rx; /* (1<<384)%P, "radix", one-in-Montgomery */ extern const vec384 BLS12_381_RR; /* (1<<768)%P, "radix"^2, to-Montgomery */ #define ONE_MONT_P TO_LIMB_T(0x760900000002fffd), \ TO_LIMB_T(0xebf4000bc40c0002), \ TO_LIMB_T(0x5f48985753c758ba), \ TO_LIMB_T(0x77ce585370525745), \ TO_LIMB_T(0x5c071a97a256ec6d), \ TO_LIMB_T(0x15f65ec3fa80e493) #define ZERO_384 (BLS12_381_Rx.p2[1]) extern const vec256 BLS12_381_r; /* order */ static const limb_t r0 = (limb_t)0xfffffffeffffffff; /* -1/r */ extern const vec256 BLS12_381_rRR; /* (1<<512)%r, "radix"^2, to-Montgomery */ #endif ================================================ FILE: src/cpuid.c ================================================ /* * Copyright Supranational LLC * Licensed under the Apache License, Version 2.0, see LICENSE for details. * SPDX-License-Identifier: Apache-2.0 */ #if (defined(__GNUC__) || defined(__clang__) || defined(__SUNPRO_C)) && !defined(_WIN32) __attribute__((visibility("hidden"))) #endif int __blst_platform_cap = 0; #if defined(__x86_64__) || defined(__x86_64) || (defined(_M_X64) && !defined(_M_ARM64EC)) # if defined(__GNUC__) || defined(__clang__) || defined(__SUNPRO_C) static void __cpuidex(int info[4], int func, int sub) { int eax, ebx, ecx, edx; __asm__("cpuid" : "=a"(eax), "=b"(ebx), "=c"(ecx), "=d"(edx) : "a"(func), "c"(sub)); info[0] = eax; info[1] = ebx; info[2] = ecx; info[3] = edx; } # else # include # endif # if defined(__GNUC__) || defined(__clang__) __attribute__((constructor)) # endif static int __blst_cpuid(void) { int info[4], cap = 0; __cpuidex(info, 0, 0); if (info[0] > 6) { __cpuidex(info, 7, 0); cap |= (info[1]>>19) & 1; /* ADX */ cap |= (info[1]>>28) & 2; /* SHA */ } __blst_platform_cap = cap; return 0; } # if defined(_MSC_VER) && !defined(__clang__) && !defined(__BLST_DLL_MAIN__) # pragma section(".CRT$XCU",read) __declspec(allocate(".CRT$XCU")) static int (*p)(void) = __blst_cpuid; # elif defined(__SUNPRO_C) # pragma init(__blst_cpuid) # endif #elif defined(__aarch64__) || defined(__aarch64) || defined(_M_ARM64) || defined(_M_ARM64EC) # if defined(__linux__) && (defined(__GNUC__) || defined(__clang__)) extern unsigned long getauxval(unsigned long type) __attribute__ ((weak)); __attribute__((constructor)) static int __blst_cpuid(void) { int cap = 0; if (getauxval) { unsigned long hwcap_ce = getauxval(16); cap = (hwcap_ce>>6) & 1; /* SHA256 */ } __blst_platform_cap = cap; return 0; } # elif defined(__APPLE__) && (defined(__GNUC__) || defined(__clang__)) __attribute__((constructor)) static int __blst_cpuid() { __blst_platform_cap = 1; /* SHA256 */ return 0; } # elif defined(__FreeBSD__) && __FreeBSD__ >= 12 # include __attribute__((constructor)) static int __blst_cpuid() { unsigned long cap; if (elf_aux_info(AT_HWCAP, &cap, sizeof(cap)) == 0) __blst_platform_cap = (cap & HWCAP_SHA2) != 0; return 0; } # elif defined(_WIN64) int IsProcessorFeaturePresent(int); # if defined(__GNUC__) || defined(__clang__) __attribute__((constructor)) # endif static int __blst_cpuid(void) { __blst_platform_cap = IsProcessorFeaturePresent(30); /* AES, SHA1, SHA2 */ return 0; } # if defined(_MSC_VER) && !defined(__clang__) && !defined(__BLST_DLL_MAIN__) # pragma 
section(".CRT$XCU",read) __declspec(allocate(".CRT$XCU")) static int (*p)(void) = __blst_cpuid; # endif # endif #endif #if defined(_WIN64) && defined(__BLST_DLL_MAIN__) # define IsProcessorFeaturePresent mask_IsProcessorFeaturePresent # define WIN32_LEAN_AND_MEAN # include BOOL WINAPI DllMain(HINSTANCE hinstDLL, DWORD dwReason, LPVOID lpvReserved) { if (dwReason == DLL_PROCESS_ATTACH) { DisableThreadLibraryCalls(hinstDLL); __blst_cpuid(); } return TRUE; (void)lpvReserved; } # if defined(_MSC_VER) /* * Even though we don't have memcpy/memset anywhere, MSVC compiler * generates calls to them as it recognizes corresponding patterns. */ #pragma function(memcpy) void *memcpy(unsigned char *dst, const unsigned char *src, size_t n) { void *ret = dst; while(n--) *dst++ = *src++; return ret; } #pragma function(memset) void *memset(unsigned char *dst, int c, size_t n) { void *ret = dst; while(n--) *dst++ = (unsigned char)c; return ret; } # endif #endif ================================================ FILE: src/e1.c ================================================ /* * Copyright Supranational LLC * Licensed under the Apache License, Version 2.0, see LICENSE for details. * SPDX-License-Identifier: Apache-2.0 */ #include "point.h" #include "fields.h" #include "errors.h" /* * y^2 = x^3 + B */ static const vec384 B_E1 = { /* (4 << 384) % P */ TO_LIMB_T(0xaa270000000cfff3), TO_LIMB_T(0x53cc0032fc34000a), TO_LIMB_T(0x478fe97a6b0a807f), TO_LIMB_T(0xb1d37ebee6ba24d7), TO_LIMB_T(0x8ec9733bbf78ab2f), TO_LIMB_T(0x09d645513d83de7e) }; const POINTonE1 BLS12_381_G1 = { /* generator point [in Montgomery] */ /* (0x17f1d3a73197d7942695638c4fa9ac0fc3688c4f9774b905 * a14e3a3f171bac586c55e83ff97a1aeffb3af00adb22c6bb << 384) % P */ { TO_LIMB_T(0x5cb38790fd530c16), TO_LIMB_T(0x7817fc679976fff5), TO_LIMB_T(0x154f95c7143ba1c1), TO_LIMB_T(0xf0ae6acdf3d0e747), TO_LIMB_T(0xedce6ecc21dbf440), TO_LIMB_T(0x120177419e0bfb75) }, /* (0x08b3f481e3aaa0f1a09e30ed741d8ae4fcf5e095d5d00af6 * 00db18cb2c04b3edd03cc744a2888ae40caa232946c5e7e1 << 384) % P */ { TO_LIMB_T(0xbaac93d50ce72271), TO_LIMB_T(0x8c22631a7918fd8e), TO_LIMB_T(0xdd595f13570725ce), TO_LIMB_T(0x51ac582950405194), TO_LIMB_T(0x0e1c8c3fad0059c0), TO_LIMB_T(0x0bbc3efc5008a26a) }, { ONE_MONT_P } }; const POINTonE1 BLS12_381_NEG_G1 = { /* negative generator [in Montgomery] */ /* (0x17f1d3a73197d7942695638c4fa9ac0fc3688c4f9774b905 * a14e3a3f171bac586c55e83ff97a1aeffb3af00adb22c6bb << 384) % P */ { TO_LIMB_T(0x5cb38790fd530c16), TO_LIMB_T(0x7817fc679976fff5), TO_LIMB_T(0x154f95c7143ba1c1), TO_LIMB_T(0xf0ae6acdf3d0e747), TO_LIMB_T(0xedce6ecc21dbf440), TO_LIMB_T(0x120177419e0bfb75) }, /* (0x114d1d6855d545a8aa7d76c8cf2e21f267816aef1db507c9 * 6655b9d5caac42364e6f38ba0ecb751bad54dcd6b939c2ca << 384) % P */ { TO_LIMB_T(0xff526c2af318883a), TO_LIMB_T(0x92899ce4383b0270), TO_LIMB_T(0x89d7738d9fa9d055), TO_LIMB_T(0x12caf35ba344c12a), TO_LIMB_T(0x3cff1b76964b5317), TO_LIMB_T(0x0e44d2ede9774430) }, { ONE_MONT_P } }; static inline void mul_by_b_onE1(vec384 out, const vec384 in) { lshift_fp(out, in, 2); } static inline void mul_by_4b_onE1(vec384 out, const vec384 in) { lshift_fp(out, in, 4); } static void POINTonE1_cneg(POINTonE1 *p, bool_t cbit) { cneg_fp(p->Y, p->Y, cbit); } void blst_p1_cneg(POINTonE1 *a, int cbit) { POINTonE1_cneg(a, is_zero(cbit) ^ 1); } static void POINTonE1_from_Jacobian(POINTonE1 *out, const POINTonE1 *in) { vec384 Z, ZZ; limb_t inf = vec_is_zero(in->Z, sizeof(in->Z)); reciprocal_fp(Z, in->Z); /* 1/Z */ sqr_fp(ZZ, Z); mul_fp(out->X, in->X, ZZ); /* X = X/Z^2 */ 
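    /* a Jacobian triple (X, Y, Z) denotes the affine point
     * (X/Z^2, Y/Z^3), so a single inversion of Z followed by a few
     * multiplications recovers both affine coordinates */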
mul_fp(ZZ, ZZ, Z); mul_fp(out->Y, in->Y, ZZ); /* Y = Y/Z^3 */ vec_select(out->Z, in->Z, BLS12_381_G1.Z, sizeof(BLS12_381_G1.Z), inf); /* Z = inf ? 0 : 1 */ } void blst_p1_from_jacobian(POINTonE1 *out, const POINTonE1 *a) { POINTonE1_from_Jacobian(out, a); } static void POINTonE1_to_affine(POINTonE1_affine *out, const POINTonE1 *in) { POINTonE1 p; if (!vec_is_equal(in->Z, BLS12_381_Rx.p, sizeof(in->Z))) { POINTonE1_from_Jacobian(&p, in); in = &p; } vec_copy(out, in, sizeof(*out)); } void blst_p1_to_affine(POINTonE1_affine *out, const POINTonE1 *a) { POINTonE1_to_affine(out, a); } void blst_p1_from_affine(POINTonE1 *out, const POINTonE1_affine *a) { vec_copy(out, a, sizeof(*a)); vec_select(out->Z, a->X, BLS12_381_Rx.p, sizeof(out->Z), vec_is_zero(a, sizeof(*a))); } static bool_t POINTonE1_affine_on_curve(const POINTonE1_affine *p) { vec384 XXX, YY; sqr_fp(XXX, p->X); mul_fp(XXX, XXX, p->X); /* X^3 */ add_fp(XXX, XXX, B_E1); /* X^3 + B */ sqr_fp(YY, p->Y); /* Y^2 */ return vec_is_equal(XXX, YY, sizeof(XXX)); } int blst_p1_affine_on_curve(const POINTonE1_affine *p) { return (int)(POINTonE1_affine_on_curve(p) | vec_is_zero(p, sizeof(*p))); } static bool_t POINTonE1_on_curve(const POINTonE1 *p) { vec384 XXX, YY, BZ6; limb_t inf = vec_is_zero(p->Z, sizeof(p->Z)); sqr_fp(BZ6, p->Z); mul_fp(BZ6, BZ6, p->Z); sqr_fp(BZ6, BZ6); /* Z^6 */ mul_by_b_onE1(BZ6, BZ6); /* B*Z^6 */ sqr_fp(XXX, p->X); mul_fp(XXX, XXX, p->X); /* X^3 */ add_fp(XXX, XXX, BZ6); /* X^3 + B*Z^6 */ sqr_fp(YY, p->Y); /* Y^2 */ return vec_is_equal(XXX, YY, sizeof(XXX)) | inf; } int blst_p1_on_curve(const POINTonE1 *p) { return (int)POINTonE1_on_curve(p); } static limb_t POINTonE1_affine_Serialize_BE(unsigned char out[96], const POINTonE1_affine *in) { vec384 temp; from_fp(temp, in->X); be_bytes_from_limbs(out, temp, sizeof(temp)); from_fp(temp, in->Y); be_bytes_from_limbs(out + 48, temp, sizeof(temp)); return sgn0_pty_mod_384(temp, BLS12_381_P); } void blst_p1_affine_serialize(unsigned char out[96], const POINTonE1_affine *in) { if (vec_is_zero(in->X, 2*sizeof(in->X))) { bytes_zero(out, 96); out[0] = 0x40; /* infinity bit */ } else { (void)POINTonE1_affine_Serialize_BE(out, in); } } static limb_t POINTonE1_Serialize_BE(unsigned char out[96], const POINTonE1 *in) { POINTonE1 p; if (!vec_is_equal(in->Z, BLS12_381_Rx.p, sizeof(in->Z))) { POINTonE1_from_Jacobian(&p, in); in = &p; } return POINTonE1_affine_Serialize_BE(out, (const POINTonE1_affine *)in); } static void POINTonE1_Serialize(unsigned char out[96], const POINTonE1 *in) { if (vec_is_zero(in->Z, sizeof(in->Z))) { bytes_zero(out, 96); out[0] = 0x40; /* infinity bit */ } else { (void)POINTonE1_Serialize_BE(out, in); } } void blst_p1_serialize(unsigned char out[96], const POINTonE1 *in) { POINTonE1_Serialize(out, in); } static limb_t POINTonE1_affine_Compress_BE(unsigned char out[48], const POINTonE1_affine *in) { vec384 temp; from_fp(temp, in->X); be_bytes_from_limbs(out, temp, sizeof(temp)); return sgn0_pty_mont_384(in->Y, BLS12_381_P, p0); } void blst_p1_affine_compress(unsigned char out[48], const POINTonE1_affine *in) { if (vec_is_zero(in->X, 2*sizeof(in->X))) { bytes_zero(out, 48); out[0] = 0xc0; /* compressed and infinity bits */ } else { limb_t sign = POINTonE1_affine_Compress_BE(out, in); out[0] |= (unsigned char)(0x80 | ((sign & 2) << 4)); } } static limb_t POINTonE1_Compress_BE(unsigned char out[48], const POINTonE1 *in) { POINTonE1 p; if (!vec_is_equal(in->Z, BLS12_381_Rx.p, sizeof(in->Z))) { POINTonE1_from_Jacobian(&p, in); in = &p; } return 
POINTonE1_affine_Compress_BE(out, (const POINTonE1_affine *)in); } void blst_p1_compress(unsigned char out[48], const POINTonE1 *in) { if (vec_is_zero(in->Z, sizeof(in->Z))) { bytes_zero(out, 48); out[0] = 0xc0; /* compressed and infinity bits */ } else { limb_t sign = POINTonE1_Compress_BE(out, in); out[0] |= (unsigned char)(0x80 | ((sign & 2) << 4)); } } static limb_t POINTonE1_Uncompress_BE(POINTonE1_affine *out, const unsigned char in[48]) { POINTonE1_affine ret; vec384 temp; limbs_from_be_bytes(ret.X, in, sizeof(ret.X)); /* clear top 3 bits in case caller was conveying some information there */ ret.X[sizeof(ret.X)/sizeof(limb_t)-1] &= ((limb_t)0-1) >> 3; add_fp(temp, ret.X, ZERO_384); /* less than modulus? */ if (!vec_is_equal(temp, ret.X, sizeof(temp))) return (limb_t)0 - BLST_BAD_ENCODING; mul_fp(ret.X, ret.X, BLS12_381_RR); sqr_fp(ret.Y, ret.X); mul_fp(ret.Y, ret.Y, ret.X); add_fp(ret.Y, ret.Y, B_E1); /* X^3 + B */ if (!sqrt_fp(ret.Y, ret.Y)) return (limb_t)0 - BLST_POINT_NOT_ON_CURVE; vec_copy(out, &ret, sizeof(ret)); return sgn0_pty_mont_384(out->Y, BLS12_381_P, p0); } static BLST_ERROR POINTonE1_Uncompress_Z(POINTonE1_affine *out, const unsigned char in[48]) { unsigned char in0 = in[0]; limb_t sgn0_pty; if ((in0 & 0x80) == 0) /* compressed bit */ return BLST_BAD_ENCODING; if (in0 & 0x40) { /* infinity bit */ if (byte_is_zero(in0 & 0x3f) & bytes_are_zero(in+1, 47)) { vec_zero(out, sizeof(*out)); return BLST_SUCCESS; } else { return BLST_BAD_ENCODING; } } sgn0_pty = POINTonE1_Uncompress_BE(out, in); if (sgn0_pty > 3) return (BLST_ERROR)(0 - sgn0_pty); /* POINT_NOT_ON_CURVE */ sgn0_pty >>= 1; /* skip over parity bit */ sgn0_pty ^= (in0 & 0x20) >> 5; cneg_fp(out->Y, out->Y, sgn0_pty); /* (0,±2) is not in group, but application might want to ignore? */ return vec_is_zero(out->X, sizeof(out->X)) ? BLST_POINT_NOT_IN_GROUP : BLST_SUCCESS; } BLST_ERROR blst_p1_uncompress(POINTonE1_affine *out, const unsigned char in[48]) { return POINTonE1_Uncompress_Z(out, in); } static BLST_ERROR POINTonE1_Deserialize_BE(POINTonE1_affine *out, const unsigned char in[96]) { POINTonE1_affine ret; vec384 temp; limbs_from_be_bytes(ret.X, in, sizeof(ret.X)); limbs_from_be_bytes(ret.Y, in + 48, sizeof(ret.Y)); /* clear top 3 bits in case caller was conveying some information there */ ret.X[sizeof(ret.X)/sizeof(limb_t)-1] &= ((limb_t)0-1) >> 3; add_fp(temp, ret.X, ZERO_384); /* less than modulus? */ if (!vec_is_equal(temp, ret.X, sizeof(temp))) return BLST_BAD_ENCODING; add_fp(temp, ret.Y, ZERO_384); /* less than modulus? */ if (!vec_is_equal(temp, ret.Y, sizeof(temp))) return BLST_BAD_ENCODING; mul_fp(ret.X, ret.X, BLS12_381_RR); mul_fp(ret.Y, ret.Y, BLS12_381_RR); if (!POINTonE1_affine_on_curve(&ret)) return BLST_POINT_NOT_ON_CURVE; vec_copy(out, &ret, sizeof(ret)); /* (0,±2) is not in group, but application might want to ignore? */ return vec_is_zero(out->X, sizeof(out->X)) ? 
BLST_POINT_NOT_IN_GROUP : BLST_SUCCESS; } static BLST_ERROR POINTonE1_Deserialize_Z(POINTonE1_affine *out, const unsigned char in[96]) { unsigned char in0 = in[0]; if ((in0 & 0xe0) == 0) return POINTonE1_Deserialize_BE(out, in); if (in0 & 0x80) /* compressed bit */ return POINTonE1_Uncompress_Z(out, in); if (in0 & 0x40) { /* infinity bit */ if (byte_is_zero(in0 & 0x3f) & bytes_are_zero(in+1, 95)) { vec_zero(out, sizeof(*out)); return BLST_SUCCESS; } } return BLST_BAD_ENCODING; } BLST_ERROR blst_p1_deserialize(POINTonE1_affine *out, const unsigned char in[96]) { return POINTonE1_Deserialize_Z(out, in); } #include "ec_ops.h" POINT_DADD_IMPL(POINTonE1, 384, fp) POINT_DADD_AFFINE_IMPL_A0(POINTonE1, 384, fp, BLS12_381_Rx.p) POINT_ADD_IMPL(POINTonE1, 384, fp) POINT_ADD_AFFINE_IMPL(POINTonE1, 384, fp, BLS12_381_Rx.p) POINT_DOUBLE_IMPL_A0(POINTonE1, 384, fp) POINT_IS_EQUAL_IMPL(POINTonE1, 384, fp) void blst_p1_add(POINTonE1 *out, const POINTonE1 *a, const POINTonE1 *b) { POINTonE1_add(out, a, b); } void blst_p1_add_or_double(POINTonE1 *out, const POINTonE1 *a, const POINTonE1 *b) { POINTonE1_dadd(out, a, b, NULL); } void blst_p1_add_affine(POINTonE1 *out, const POINTonE1 *a, const POINTonE1_affine *b) { POINTonE1_add_affine(out, a, b); } void blst_p1_add_or_double_affine(POINTonE1 *out, const POINTonE1 *a, const POINTonE1_affine *b) { POINTonE1_dadd_affine(out, a, b); } void blst_p1_double(POINTonE1 *out, const POINTonE1 *a) { POINTonE1_double(out, a); } int blst_p1_is_equal(const POINTonE1 *a, const POINTonE1 *b) { return (int)POINTonE1_is_equal(a, b); } #include "ec_mult.h" POINT_MULT_SCALAR_WX_IMPL(POINTonE1, 4) POINT_MULT_SCALAR_WX_IMPL(POINTonE1, 5) #ifdef __BLST_PRIVATE_TESTMODE__ POINT_AFFINE_MULT_SCALAR_IMPL(POINTonE1) DECLARE_PRIVATE_POINTXZ(POINTonE1, 384) POINT_LADDER_PRE_IMPL(POINTonE1, 384, fp) POINT_LADDER_STEP_IMPL_A0(POINTonE1, 384, fp, onE1) POINT_LADDER_POST_IMPL_A0(POINTonE1, 384, fp, onE1) POINT_MULT_SCALAR_LADDER_IMPL(POINTonE1) #endif static const vec384 beta = { /* such that beta^3 - 1 = 0 */ /* -1/2 * (1 + sqrt(-3)) = ((P-2)^(P-2)) * (1 + (P-3)^((P+1)/4)) */ /* (0x1a0111ea397fe699ec02408663d4de85aa0d857d89759ad4 897d29650fb85f9b409427eb4f49fffd8bfd00000000aaac << 384) % P */ TO_LIMB_T(0xcd03c9e48671f071), TO_LIMB_T(0x5dab22461fcda5d2), TO_LIMB_T(0x587042afd3851b95), TO_LIMB_T(0x8eb60ebe01bacb9e), TO_LIMB_T(0x03f97d6e83d050d2), TO_LIMB_T(0x18f0206554638741) }; static void sigma(POINTonE1 *out, const POINTonE1 *in) { vec_copy(out->X, in->X, 2*sizeof(out->X)); mul_fp(out->Z, in->Z, beta); } /* Gallant-Lambert-Vanstone, ~45% faster than POINTonE1_mult_w5 */ static void POINTonE1_mult_glv(POINTonE1 *out, const POINTonE1 *in, const pow256 SK) { union { vec256 l; pow256 s; } val; /* SK/z^2 [in constant time] */ limbs_from_le_bytes(val.l, SK, 32); div_by_zz(val.l); le_bytes_from_limbs(val.s, val.l, 32); { const byte *scalars[2] = { val.s+16, val.s }; POINTonE1 table[2][1<<(5-1)]; /* 4.5KB */ size_t i; POINTonE1_precompute_w5(table[0], in); for (i = 0; i < 1<<(5-1); i++) { mul_fp(table[1][i].X, table[0][i].X, beta); cneg_fp(table[1][i].Y, table[0][i].Y, 1); vec_copy(table[1][i].Z, table[0][i].Z, sizeof(table[1][i].Z)); } POINTonE1s_mult_w5(out, NULL, 2, scalars, 128, table); POINTonE1_cneg(out, 1); mul_fp(out->Z, out->Z, beta); mul_fp(out->Z, out->Z, beta); } vec_zero(val.l, sizeof(val)); /* scrub the copy of SK */ } static void POINTonE1_sign(POINTonE1 *out, const POINTonE1 *in, const pow256 SK) { vec384 Z, ZZ; limb_t inf; POINTonE1_mult_glv(out, in, SK); /* convert to affine to 
remove possible bias in out->Z */ inf = vec_is_zero(out->Z, sizeof(out->Z)); #ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION flt_reciprocal_fp(Z, out->Z); /* 1/Z */ #else reciprocal_fp(Z, out->Z); /* 1/Z */ #endif sqr_fp(ZZ, Z); mul_fp(out->X, out->X, ZZ); /* X = X/Z^2 */ mul_fp(ZZ, ZZ, Z); mul_fp(out->Y, out->Y, ZZ); /* Y = Y/Z^3 */ vec_select(out->Z, out->Z, BLS12_381_G1.Z, sizeof(BLS12_381_G1.Z), inf); /* Z = inf ? 0 : 1 */ } void blst_sk_to_pk_in_g1(POINTonE1 *out, const pow256 SK) { POINTonE1_sign(out, &BLS12_381_G1, SK); } void blst_sign_pk_in_g2(POINTonE1 *out, const POINTonE1 *msg, const pow256 SK) { POINTonE1_sign(out, msg, SK); } void blst_sk_to_pk2_in_g1(unsigned char out[96], POINTonE1_affine *PK, const pow256 SK) { POINTonE1 P[1]; POINTonE1_sign(P, &BLS12_381_G1, SK); if (PK != NULL) vec_copy(PK, P, sizeof(*PK)); if (out != NULL) { limb_t sgn0_pty = POINTonE1_Serialize_BE(out, P); out[0] |= (sgn0_pty & 2) << 4; /* pre-decorate */ out[0] |= vec_is_zero(P->Z, sizeof(P->Z)) << 6; } } void blst_sign_pk2_in_g2(unsigned char out[96], POINTonE1_affine *sig, const POINTonE1 *hash, const pow256 SK) { POINTonE1 P[1]; POINTonE1_sign(P, hash, SK); if (sig != NULL) vec_copy(sig, P, sizeof(*sig)); if (out != NULL) { limb_t sgn0_pty = POINTonE1_Serialize_BE(out, P); out[0] |= (sgn0_pty & 2) << 4; /* pre-decorate */ out[0] |= vec_is_zero(P->Z, sizeof(P->Z)) << 6; } } void blst_p1_mult(POINTonE1 *out, const POINTonE1 *a, const byte *scalar, size_t nbits) { if (nbits < 176) { if (nbits) POINTonE1_mult_w4(out, a, scalar, nbits); else vec_zero(out, sizeof(*out)); } else if (nbits <= 256) { union { vec256 l; pow256 s; } val; size_t i, j, top, mask = (size_t)0 - 1; /* this is not about constant-time-ness, but branch optimization */ for (top = (nbits + 7)/8, i=0, j=0; i> (8*sizeof(top)-1)); j += 1 & mask; } if (check_mod_256(val.s, BLS12_381_r)) /* z^4 is the formal limit */ POINTonE1_mult_glv(out, a, val.s); else /* should never be the case, added for formal completeness */ POINTonE1_mult_w5(out, a, scalar, nbits); vec_zero(val.l, sizeof(val)); } else { /* should never be the case, added for formal completeness */ POINTonE1_mult_w5(out, a, scalar, nbits); } } void blst_p1_unchecked_mult(POINTonE1 *out, const POINTonE1 *a, const byte *scalar, size_t nbits) { if (nbits) POINTonE1_mult_w4(out, a, scalar, nbits); else vec_zero(out, sizeof(*out)); } int blst_p1_affine_is_equal(const POINTonE1_affine *a, const POINTonE1_affine *b) { return (int)vec_is_equal(a, b, sizeof(*a)); } int blst_p1_is_inf(const POINTonE1 *p) { return (int)vec_is_zero(p->Z, sizeof(p->Z)); } const POINTonE1 *blst_p1_generator(void) { return &BLS12_381_G1; } int blst_p1_affine_is_inf(const POINTonE1_affine *p) { return (int)vec_is_zero(p, sizeof(*p)); } const POINTonE1_affine *blst_p1_affine_generator(void) { return (const POINTonE1_affine *)&BLS12_381_G1; } size_t blst_p1_sizeof(void) { return sizeof(POINTonE1); } size_t blst_p1_affine_sizeof(void) { return sizeof(POINTonE1_affine); } ================================================ FILE: src/e2.c ================================================ /* * Copyright Supranational LLC * Licensed under the Apache License, Version 2.0, see LICENSE for details. 
* SPDX-License-Identifier: Apache-2.0 */ #include "point.h" #include "fields.h" #include "errors.h" /* * y^2 = x^3 + B */ static const vec384x B_E2 = { /* 4 + 4*i */ { TO_LIMB_T(0xaa270000000cfff3), TO_LIMB_T(0x53cc0032fc34000a), TO_LIMB_T(0x478fe97a6b0a807f), TO_LIMB_T(0xb1d37ebee6ba24d7), TO_LIMB_T(0x8ec9733bbf78ab2f), TO_LIMB_T(0x09d645513d83de7e) }, { TO_LIMB_T(0xaa270000000cfff3), TO_LIMB_T(0x53cc0032fc34000a), TO_LIMB_T(0x478fe97a6b0a807f), TO_LIMB_T(0xb1d37ebee6ba24d7), TO_LIMB_T(0x8ec9733bbf78ab2f), TO_LIMB_T(0x09d645513d83de7e) } }; const POINTonE2 BLS12_381_G2 = { /* generator point [in Montgomery] */ { /* (0x024aa2b2f08f0a91260805272dc51051c6e47ad4fa403b02 b4510b647ae3d1770bac0326a805bbefd48056c8c121bdb8 << 384) % P */ { TO_LIMB_T(0xf5f28fa202940a10), TO_LIMB_T(0xb3f5fb2687b4961a), TO_LIMB_T(0xa1a893b53e2ae580), TO_LIMB_T(0x9894999d1a3caee9), TO_LIMB_T(0x6f67b7631863366b), TO_LIMB_T(0x058191924350bcd7) }, /* (0x13e02b6052719f607dacd3a088274f65596bd0d09920b61a b5da61bbdc7f5049334cf11213945d57e5ac7d055d042b7e << 384) % P */ { TO_LIMB_T(0xa5a9c0759e23f606), TO_LIMB_T(0xaaa0c59dbccd60c3), TO_LIMB_T(0x3bb17e18e2867806), TO_LIMB_T(0x1b1ab6cc8541b367), TO_LIMB_T(0xc2b6ed0ef2158547), TO_LIMB_T(0x11922a097360edf3) } }, { /* (0x0ce5d527727d6e118cc9cdc6da2e351aadfd9baa8cbdd3a7 6d429a695160d12c923ac9cc3baca289e193548608b82801 << 384) % P */ { TO_LIMB_T(0x4c730af860494c4a), TO_LIMB_T(0x597cfa1f5e369c5a), TO_LIMB_T(0xe7e6856caa0a635a), TO_LIMB_T(0xbbefb5e96e0d495f), TO_LIMB_T(0x07d3a975f0ef25a2), TO_LIMB_T(0x0083fd8e7e80dae5) }, /* (0x0606c4a02ea734cc32acd2b02bc28b99cb3e287e85a763af 267492ab572e99ab3f370d275cec1da1aaa9075ff05f79be << 384) % P */ { TO_LIMB_T(0xadc0fc92df64b05d), TO_LIMB_T(0x18aa270a2b1461dc), TO_LIMB_T(0x86adac6a3be4eba0), TO_LIMB_T(0x79495c4ec93da33a), TO_LIMB_T(0xe7175850a43ccaed), TO_LIMB_T(0x0b2bc2a163de1bf2) }, }, { { ONE_MONT_P }, { 0 } } }; const POINTonE2 BLS12_381_NEG_G2 = { /* negative generator [in Montgomery] */ { /* (0x024aa2b2f08f0a91260805272dc51051c6e47ad4fa403b02 b4510b647ae3d1770bac0326a805bbefd48056c8c121bdb8 << 384) % P */ { TO_LIMB_T(0xf5f28fa202940a10), TO_LIMB_T(0xb3f5fb2687b4961a), TO_LIMB_T(0xa1a893b53e2ae580), TO_LIMB_T(0x9894999d1a3caee9), TO_LIMB_T(0x6f67b7631863366b), TO_LIMB_T(0x058191924350bcd7) }, /* (0x13e02b6052719f607dacd3a088274f65596bd0d09920b61a b5da61bbdc7f5049334cf11213945d57e5ac7d055d042b7e << 384) % P */ { TO_LIMB_T(0xa5a9c0759e23f606), TO_LIMB_T(0xaaa0c59dbccd60c3), TO_LIMB_T(0x3bb17e18e2867806), TO_LIMB_T(0x1b1ab6cc8541b367), TO_LIMB_T(0xc2b6ed0ef2158547), TO_LIMB_T(0x11922a097360edf3) } }, { /* (0x0d1b3cc2c7027888be51d9ef691d77bcb679afda66c73f17 f9ee3837a55024f78c71363275a75d75d86bab79f74782aa << 384) % P */ { TO_LIMB_T(0x6d8bf5079fb65e61), TO_LIMB_T(0xc52f05df531d63a5), TO_LIMB_T(0x7f4a4d344ca692c9), TO_LIMB_T(0xa887959b8577c95f), TO_LIMB_T(0x4347fe40525c8734), TO_LIMB_T(0x197d145bbaff0bb5) }, /* (0x13fa4d4a0ad8b1ce186ed5061789213d993923066dddaf10 40bc3ff59f825c78df74f2d75467e25e0f55f8a00fa030ed << 384) % P */ { TO_LIMB_T(0x0c3e036d209afa4e), TO_LIMB_T(0x0601d8f4863f9e23), TO_LIMB_T(0xe0832636bacc0a84), TO_LIMB_T(0xeb2def362a476f84), TO_LIMB_T(0x64044f659f0ee1e9), TO_LIMB_T(0x0ed54f48d5a1caa7) } }, { { ONE_MONT_P }, { 0 } } }; static void mul_by_b_onE2(vec384x out, const vec384x in) { sub_fp(out[0], in[0], in[1]); add_fp(out[1], in[0], in[1]); lshift_fp(out[0], out[0], 2); lshift_fp(out[1], out[1], 2); } static void mul_by_4b_onE2(vec384x out, const vec384x in) { sub_fp(out[0], in[0], in[1]); add_fp(out[1], in[0], in[1]); 
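    /* B_E2 is 4+4*i = 4*(1+i), and (a+b*i)*(1+i) = (a-b) + (a+b)*i;
     * the left shifts below scale by 16, for a net multiply by 4*B */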
lshift_fp(out[0], out[0], 4); lshift_fp(out[1], out[1], 4); } static void POINTonE2_cneg(POINTonE2 *p, bool_t cbit) { cneg_fp2(p->Y, p->Y, cbit); } void blst_p2_cneg(POINTonE2 *a, int cbit) { POINTonE2_cneg(a, is_zero(cbit) ^ 1); } static void POINTonE2_from_Jacobian(POINTonE2 *out, const POINTonE2 *in) { vec384x Z, ZZ; limb_t inf = vec_is_zero(in->Z, sizeof(in->Z)); reciprocal_fp2(Z, in->Z); /* 1/Z */ sqr_fp2(ZZ, Z); mul_fp2(out->X, in->X, ZZ); /* X = X/Z^2 */ mul_fp2(ZZ, ZZ, Z); mul_fp2(out->Y, in->Y, ZZ); /* Y = Y/Z^3 */ vec_select(out->Z, in->Z, BLS12_381_G2.Z, sizeof(BLS12_381_G2.Z), inf); /* Z = inf ? 0 : 1 */ } void blst_p2_from_jacobian(POINTonE2 *out, const POINTonE2 *a) { POINTonE2_from_Jacobian(out, a); } static void POINTonE2_to_affine(POINTonE2_affine *out, const POINTonE2 *in) { POINTonE2 p; if (!vec_is_equal(in->Z, BLS12_381_Rx.p2, sizeof(in->Z))) { POINTonE2_from_Jacobian(&p, in); in = &p; } vec_copy(out, in, sizeof(*out)); } void blst_p2_to_affine(POINTonE2_affine *out, const POINTonE2 *a) { POINTonE2_to_affine(out, a); } void blst_p2_from_affine(POINTonE2 *out, const POINTonE2_affine *a) { vec_copy(out, a, sizeof(*a)); vec_select(out->Z, a->X, BLS12_381_Rx.p2, sizeof(out->Z), vec_is_zero(a, sizeof(*a))); } static bool_t POINTonE2_affine_on_curve(const POINTonE2_affine *p) { vec384x XXX, YY; sqr_fp2(XXX, p->X); mul_fp2(XXX, XXX, p->X); /* X^3 */ add_fp2(XXX, XXX, B_E2); /* X^3 + B */ sqr_fp2(YY, p->Y); /* Y^2 */ return vec_is_equal(XXX, YY, sizeof(XXX)); } int blst_p2_affine_on_curve(const POINTonE2_affine *p) { return (int)(POINTonE2_affine_on_curve(p) | vec_is_zero(p, sizeof(*p))); } static bool_t POINTonE2_on_curve(const POINTonE2 *p) { vec384x XXX, YY, BZ6; limb_t inf = vec_is_zero(p->Z, sizeof(p->Z)); sqr_fp2(BZ6, p->Z); mul_fp2(BZ6, BZ6, p->Z); sqr_fp2(XXX, BZ6); /* Z^6 */ mul_by_b_onE2(BZ6, XXX); /* B*Z^6 */ sqr_fp2(XXX, p->X); mul_fp2(XXX, XXX, p->X); /* X^3 */ add_fp2(XXX, XXX, BZ6); /* X^3 + B*Z^6 */ sqr_fp2(YY, p->Y); /* Y^2 */ return vec_is_equal(XXX, YY, sizeof(XXX)) | inf; } int blst_p2_on_curve(const POINTonE2 *p) { return (int)POINTonE2_on_curve(p); } static limb_t POINTonE2_affine_Serialize_BE(unsigned char out[192], const POINTonE2_affine *in) { vec384x temp; from_fp(temp[1], in->X[1]); be_bytes_from_limbs(out, temp[1], sizeof(temp[1])); from_fp(temp[0], in->X[0]); be_bytes_from_limbs(out + 48, temp[0], sizeof(temp[0])); from_fp(temp[1], in->Y[1]); be_bytes_from_limbs(out + 96, temp[1], sizeof(temp[1])); from_fp(temp[0], in->Y[0]); be_bytes_from_limbs(out + 144, temp[0], sizeof(temp[0])); return sgn0_pty_mod_384x(temp, BLS12_381_P); } void blst_p2_affine_serialize(unsigned char out[192], const POINTonE2_affine *in) { if (vec_is_zero(in->X, 2*sizeof(in->X))) { bytes_zero(out, 192); out[0] = 0x40; /* infinity bit */ } else { (void)POINTonE2_affine_Serialize_BE(out, in); } } static limb_t POINTonE2_Serialize_BE(unsigned char out[192], const POINTonE2 *in) { POINTonE2 p; if (!vec_is_equal(in->Z, BLS12_381_Rx.p2, sizeof(in->Z))) { POINTonE2_from_Jacobian(&p, in); in = &p; } return POINTonE2_affine_Serialize_BE(out, (const POINTonE2_affine *)in); } static void POINTonE2_Serialize(unsigned char out[192], const POINTonE2 *in) { if (vec_is_zero(in->Z, sizeof(in->Z))) { bytes_zero(out, 192); out[0] = 0x40; /* infinity bit */ } else { (void)POINTonE2_Serialize_BE(out, in); } } void blst_p2_serialize(unsigned char out[192], const POINTonE2 *in) { POINTonE2_Serialize(out, in); } static limb_t POINTonE2_affine_Compress_BE(unsigned char out[96], const POINTonE2_affine 
*in) { vec384 temp; from_fp(temp, in->X[1]); be_bytes_from_limbs(out, temp, sizeof(temp)); from_fp(temp, in->X[0]); be_bytes_from_limbs(out + 48, temp, sizeof(temp)); return sgn0_pty_mont_384x(in->Y, BLS12_381_P, p0); } void blst_p2_affine_compress(unsigned char out[96], const POINTonE2_affine *in) { if (vec_is_zero(in->X, 2*sizeof(in->X))) { bytes_zero(out, 96); out[0] = 0xc0; /* compressed and infinity bits */ } else { limb_t sign = POINTonE2_affine_Compress_BE(out, in); out[0] |= (unsigned char)(0x80 | ((sign & 2) << 4)); } } static limb_t POINTonE2_Compress_BE(unsigned char out[96], const POINTonE2 *in) { POINTonE2 p; if (!vec_is_equal(in->Z, BLS12_381_Rx.p, sizeof(in->Z))) { POINTonE2_from_Jacobian(&p, in); in = &p; } return POINTonE2_affine_Compress_BE(out, (const POINTonE2_affine *)in); } void blst_p2_compress(unsigned char out[96], const POINTonE2 *in) { if (vec_is_zero(in->Z, sizeof(in->Z))) { bytes_zero(out, 96); out[0] = 0xc0; /* compressed and infinity bits */ } else { limb_t sign = POINTonE2_Compress_BE(out, in); out[0] |= (unsigned char)(0x80 | ((sign & 2) << 4)); } } static limb_t POINTonE2_Uncompress_BE(POINTonE2_affine *out, const unsigned char in[96]) { POINTonE2_affine ret; vec384 temp; limbs_from_be_bytes(ret.X[1], in, sizeof(ret.X[1])); limbs_from_be_bytes(ret.X[0], in + 48, sizeof(ret.X[0])); /* clear top 3 bits in case caller was conveying some information there */ ret.X[1][sizeof(ret.X[1])/sizeof(limb_t)-1] &= ((limb_t)0-1) >> 3; add_fp(temp, ret.X[1], ZERO_384); /* less than modulus? */ if (!vec_is_equal(temp, ret.X[1], sizeof(temp))) return (limb_t)0 - BLST_BAD_ENCODING; add_fp(temp, ret.X[0], ZERO_384); /* less than modulus? */ if (!vec_is_equal(temp, ret.X[0], sizeof(temp))) return (limb_t)0 - BLST_BAD_ENCODING; mul_fp(ret.X[0], ret.X[0], BLS12_381_RR); mul_fp(ret.X[1], ret.X[1], BLS12_381_RR); sqr_fp2(ret.Y, ret.X); mul_fp2(ret.Y, ret.Y, ret.X); add_fp2(ret.Y, ret.Y, B_E2); /* X^3 + B */ if (!sqrt_fp2(ret.Y, ret.Y)) return (limb_t)0 - BLST_POINT_NOT_ON_CURVE; vec_copy(out, &ret, sizeof(ret)); return sgn0_pty_mont_384x(out->Y, BLS12_381_P, p0); } static BLST_ERROR POINTonE2_Uncompress_Z(POINTonE2_affine *out, const unsigned char in[96]) { unsigned char in0 = in[0]; limb_t sgn0_pty; if ((in0 & 0x80) == 0) /* compressed bit */ return BLST_BAD_ENCODING; if (in0 & 0x40) { /* infinity bit */ if (byte_is_zero(in0 & 0x3f) & bytes_are_zero(in+1, 95)) { vec_zero(out, sizeof(*out)); return BLST_SUCCESS; } else { return BLST_BAD_ENCODING; } } sgn0_pty = POINTonE2_Uncompress_BE(out, in); if (sgn0_pty > 3) return (BLST_ERROR)(0 - sgn0_pty); /* POINT_NOT_ON_CURVE */ sgn0_pty >>= 1; /* skip over parity bit */ sgn0_pty ^= (in0 & 0x20) >> 5; cneg_fp2(out->Y, out->Y, sgn0_pty); return BLST_SUCCESS; } BLST_ERROR blst_p2_uncompress(POINTonE2_affine *out, const unsigned char in[96]) { return POINTonE2_Uncompress_Z(out, in); } static BLST_ERROR POINTonE2_Deserialize_BE(POINTonE2_affine *out, const unsigned char in[192]) { POINTonE2_affine ret; vec384 temp; limbs_from_be_bytes(ret.X[1], in, sizeof(ret.X[1])); limbs_from_be_bytes(ret.X[0], in + 48, sizeof(ret.X[0])); limbs_from_be_bytes(ret.Y[1], in + 96, sizeof(ret.Y[1])); limbs_from_be_bytes(ret.Y[0], in + 144, sizeof(ret.Y[0])); /* clear top 3 bits in case caller was conveying some information there */ ret.X[1][sizeof(ret.X[1])/sizeof(limb_t)-1] &= ((limb_t)0-1) >> 3; add_fp(temp, ret.X[1], ZERO_384); /* less than modulus? 
*/ if (!vec_is_equal(temp, ret.X[1], sizeof(temp))) return BLST_BAD_ENCODING; add_fp(temp, ret.X[0], ZERO_384); /* less than modulus? */ if (!vec_is_equal(temp, ret.X[0], sizeof(temp))) return BLST_BAD_ENCODING; add_fp(temp, ret.Y[1], ZERO_384); /* less than modulus? */ if (!vec_is_equal(temp, ret.Y[1], sizeof(temp))) return BLST_BAD_ENCODING; add_fp(temp, ret.Y[0], ZERO_384); /* less than modulus? */ if (!vec_is_equal(temp, ret.Y[0], sizeof(temp))) return BLST_BAD_ENCODING; mul_fp(ret.X[0], ret.X[0], BLS12_381_RR); mul_fp(ret.X[1], ret.X[1], BLS12_381_RR); mul_fp(ret.Y[0], ret.Y[0], BLS12_381_RR); mul_fp(ret.Y[1], ret.Y[1], BLS12_381_RR); if (!POINTonE2_affine_on_curve(&ret)) return BLST_POINT_NOT_ON_CURVE; vec_copy(out, &ret, sizeof(ret)); return BLST_SUCCESS; } static BLST_ERROR POINTonE2_Deserialize_Z(POINTonE2_affine *out, const unsigned char in[192]) { unsigned char in0 = in[0]; if ((in0 & 0xe0) == 0) return POINTonE2_Deserialize_BE(out, in); if (in0 & 0x80) /* compressed bit */ return POINTonE2_Uncompress_Z(out, in); if (in0 & 0x40) { /* infinity bit */ if (byte_is_zero(in0 & 0x3f) & bytes_are_zero(in+1, 191)) { vec_zero(out, sizeof(*out)); return BLST_SUCCESS; } } return BLST_BAD_ENCODING; } BLST_ERROR blst_p2_deserialize(POINTonE2_affine *out, const unsigned char in[192]) { return POINTonE2_Deserialize_Z(out, in); } #include "ec_ops.h" POINT_DADD_IMPL(POINTonE2, 384x, fp2) POINT_DADD_AFFINE_IMPL_A0(POINTonE2, 384x, fp2, BLS12_381_Rx.p2) POINT_ADD_IMPL(POINTonE2, 384x, fp2) POINT_ADD_AFFINE_IMPL(POINTonE2, 384x, fp2, BLS12_381_Rx.p2) POINT_DOUBLE_IMPL_A0(POINTonE2, 384x, fp2) POINT_IS_EQUAL_IMPL(POINTonE2, 384x, fp2) void blst_p2_add(POINTonE2 *out, const POINTonE2 *a, const POINTonE2 *b) { POINTonE2_add(out, a, b); } void blst_p2_add_or_double(POINTonE2 *out, const POINTonE2 *a, const POINTonE2 *b) { POINTonE2_dadd(out, a, b, NULL); } void blst_p2_add_affine(POINTonE2 *out, const POINTonE2 *a, const POINTonE2_affine *b) { POINTonE2_add_affine(out, a, b); } void blst_p2_add_or_double_affine(POINTonE2 *out, const POINTonE2 *a, const POINTonE2_affine *b) { POINTonE2_dadd_affine(out, a, b); } void blst_p2_double(POINTonE2 *out, const POINTonE2 *a) { POINTonE2_double(out, a); } int blst_p2_is_equal(const POINTonE2 *a, const POINTonE2 *b) { return (int)POINTonE2_is_equal(a, b); } #include "ec_mult.h" POINT_MULT_SCALAR_WX_IMPL(POINTonE2, 4) POINT_MULT_SCALAR_WX_IMPL(POINTonE2, 5) #ifdef __BLST_PRIVATE_TESTMODE__ POINT_AFFINE_MULT_SCALAR_IMPL(POINTonE2) DECLARE_PRIVATE_POINTXZ(POINTonE2, 384x) POINT_LADDER_PRE_IMPL(POINTonE2, 384x, fp2) POINT_LADDER_STEP_IMPL_A0(POINTonE2, 384x, fp2, onE2) POINT_LADDER_POST_IMPL_A0(POINTonE2, 384x, fp2, onE2) POINT_MULT_SCALAR_LADDER_IMPL(POINTonE2) #endif static void psi(POINTonE2 *out, const POINTonE2 *in) { static const vec384x frobenius_x = { /* 1/(1 + i)^((P-1)/3) */ { 0 }, { /* (0x1a0111ea397fe699ec02408663d4de85aa0d857d89759ad4 897d29650fb85f9b409427eb4f49fffd8bfd00000000aaad << 384) % P */ TO_LIMB_T(0x890dc9e4867545c3), TO_LIMB_T(0x2af322533285a5d5), TO_LIMB_T(0x50880866309b7e2c), TO_LIMB_T(0xa20d1b8c7e881024), TO_LIMB_T(0x14e4f04fe2db9068), TO_LIMB_T(0x14e56d3f1564853a) } }; static const vec384x frobenius_y = { /* 1/(1 + i)^((P-1)/2) */ { /* (0x135203e60180a68ee2e9c448d77a2cd91c3dedd930b1cf60 ef396489f61eb45e304466cf3e67fa0af1ee7b04121bdea2 << 384) % P */ TO_LIMB_T(0x3e2f585da55c9ad1), TO_LIMB_T(0x4294213d86c18183), TO_LIMB_T(0x382844c88b623732), TO_LIMB_T(0x92ad2afd19103e18), TO_LIMB_T(0x1d794e4fac7cf0b9), TO_LIMB_T(0x0bd592fc7d825ec8) }, { /* 
(0x06af0e0437ff400b6831e36d6bd17ffe48395dabc2d3435e 77f76e17009241c5ee67992f72ec05f4c81084fbede3cc09 << 384) % P */ TO_LIMB_T(0x7bcfa7a25aa30fda), TO_LIMB_T(0xdc17dec12a927e7c), TO_LIMB_T(0x2f088dd86b4ebef1), TO_LIMB_T(0xd1ca2087da74d4a7), TO_LIMB_T(0x2da2596696cebc1d), TO_LIMB_T(0x0e2b7eedbbfd87d2) }, }; vec_copy(out, in, sizeof(*out)); cneg_fp(out->X[1], out->X[1], 1); mul_fp2(out->X, out->X, frobenius_x); cneg_fp(out->Y[1], out->Y[1], 1); mul_fp2(out->Y, out->Y, frobenius_y); cneg_fp(out->Z[1], out->Z[1], 1); } /* Galbraith-Lin-Scott, ~67% faster than POINTonE2_mul_w5 */ static void POINTonE2_mult_gls(POINTonE2 *out, const POINTonE2 *in, const pow256 SK) { union { vec256 l; pow256 s; } val; /* break down SK to "digits" with |z| as radix [in constant time] */ limbs_from_le_bytes(val.l, SK, 32); div_by_zz(val.l); div_by_z(val.l); div_by_z(val.l + NLIMBS(256)/2); le_bytes_from_limbs(val.s, val.l, 32); { const byte *scalars[2] = { val.s, NULL }; POINTonE2 table[4][1<<(5-1)]; /* 18KB */ size_t i; POINTonE2_precompute_w5(table[0], in); for (i = 0; i < 1<<(5-1); i++) { psi(&table[1][i], &table[0][i]); psi(&table[2][i], &table[1][i]); psi(&table[3][i], &table[2][i]); POINTonE2_cneg(&table[1][i], 1); /* account for z being negative */ POINTonE2_cneg(&table[3][i], 1); } POINTonE2s_mult_w5(out, NULL, 4, scalars, 64, table); } vec_zero(val.l, sizeof(val)); /* scrub the copy of SK */ } static void POINTonE2_sign(POINTonE2 *out, const POINTonE2 *in, const pow256 SK) { vec384x Z, ZZ; limb_t inf; POINTonE2_mult_gls(out, in, SK); /* convert to affine to remove possible bias in out->Z */ inf = vec_is_zero(out->Z, sizeof(out->Z)); #ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION flt_reciprocal_fp2(Z, out->Z); /* 1/Z */ #else reciprocal_fp2(Z, out->Z); /* 1/Z */ #endif sqr_fp2(ZZ, Z); mul_fp2(out->X, out->X, ZZ); /* X = X/Z^2 */ mul_fp2(ZZ, ZZ, Z); mul_fp2(out->Y, out->Y, ZZ); /* Y = Y/Z^3 */ vec_select(out->Z, out->Z, BLS12_381_G2.Z, sizeof(BLS12_381_G2.Z), inf); /* Z = inf ? 
0 : 1 */
}

void blst_sk_to_pk_in_g2(POINTonE2 *out, const pow256 SK)
{   POINTonE2_sign(out, &BLS12_381_G2, SK);   }

void blst_sign_pk_in_g1(POINTonE2 *out, const POINTonE2 *msg,
                        const pow256 SK)
{   POINTonE2_sign(out, msg, SK);   }

void blst_sk_to_pk2_in_g2(unsigned char out[192], POINTonE2_affine *PK,
                          const pow256 SK)
{
    POINTonE2 P[1];

    POINTonE2_sign(P, &BLS12_381_G2, SK);
    if (PK != NULL)
        vec_copy(PK, P, sizeof(*PK));
    if (out != NULL) {
        limb_t sgn0_pty = POINTonE2_Serialize_BE(out, P);
        out[0] |= (sgn0_pty & 2) << 4;  /* pre-decorate */
        out[0] |= vec_is_zero(P->Z, sizeof(P->Z)) << 6;
    }
}

void blst_sign_pk2_in_g1(unsigned char out[192], POINTonE2_affine *sig,
                         const POINTonE2 *hash, const pow256 SK)
{
    POINTonE2 P[1];

    POINTonE2_sign(P, hash, SK);
    if (sig != NULL)
        vec_copy(sig, P, sizeof(*sig));
    if (out != NULL) {
        limb_t sgn0_pty = POINTonE2_Serialize_BE(out, P);
        out[0] |= (sgn0_pty & 2) << 4;  /* pre-decorate */
        out[0] |= vec_is_zero(P->Z, sizeof(P->Z)) << 6;
    }
}

void blst_p2_mult(POINTonE2 *out, const POINTonE2 *a,
                  const byte *scalar, size_t nbits)
{
    if (nbits < 144) {
        if (nbits)
            POINTonE2_mult_w4(out, a, scalar, nbits);
        else
            vec_zero(out, sizeof(*out));
    } else if (nbits <= 256) {
        union { vec256 l; pow256 s; } val;
        size_t i, j, top, mask = (size_t)0 - 1;

        /* this is not about constant-time-ness, but branch optimization */
        for (top = (nbits + 7)/8, i=0, j=0; i<sizeof(val.s);) {
            val.s[i++] = scalar[j] & mask;
            mask = (size_t)0 - ((i - top) >> (8*sizeof(top)-1));
            j += 1 & mask;
        }

        if (check_mod_256(val.s, BLS12_381_r))  /* z^4 is the formal limit */
            POINTonE2_mult_gls(out, a, val.s);
        else    /* should never be the case, added for formal completeness */
            POINTonE2_mult_w5(out, a, scalar, nbits);

        vec_zero(val.l, sizeof(val));
    } else {    /* should never be the case, added for formal completeness */
        POINTonE2_mult_w5(out, a, scalar, nbits);
    }
}

void blst_p2_unchecked_mult(POINTonE2 *out, const POINTonE2 *a,
                            const byte *scalar, size_t nbits)
{
    if (nbits)
        POINTonE2_mult_w4(out, a, scalar, nbits);
    else
        vec_zero(out, sizeof(*out));
}

int blst_p2_affine_is_equal(const POINTonE2_affine *a,
                            const POINTonE2_affine *b)
{   return (int)vec_is_equal(a, b, sizeof(*a));   }

int blst_p2_is_inf(const POINTonE2 *p)
{   return (int)vec_is_zero(p->Z, sizeof(p->Z));   }

const POINTonE2 *blst_p2_generator(void)
{   return &BLS12_381_G2;   }

int blst_p2_affine_is_inf(const POINTonE2_affine *p)
{   return (int)vec_is_zero(p, sizeof(*p));   }

const POINTonE2_affine *blst_p2_affine_generator(void)
{   return (const POINTonE2_affine *)&BLS12_381_G2;   }

size_t blst_p2_sizeof(void)
{   return sizeof(POINTonE2);   }

size_t blst_p2_affine_sizeof(void)
{   return sizeof(POINTonE2_affine);   }

================================================
FILE: src/ec_mult.h
================================================
/*
 * Copyright Supranational LLC
 * Licensed under the Apache License, Version 2.0, see LICENSE for details.
 * SPDX-License-Identifier: Apache-2.0
 */
#ifndef __BLS12_381_ASM_EC_MULT_H__
#define __BLS12_381_ASM_EC_MULT_H__

#include "point.h"

/* Works up to 9 bits */
static limb_t get_wval(const byte *d, size_t off, size_t bits)
{
    size_t top = off + bits - 1;
    limb_t ret;

    ret = ((limb_t)d[top / 8] << 8) | d[off / 8];

    return ret >> (off%8);
}
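#if 0   /* editorial illustration, not part of upstream blst */
/*
 * A minimal sketch of what get_wval() returns: |bits| bits of the
 * little-endian scalar starting at bit offset |off|. The extra low
 * bit requested by the callers below is the Booth "round-up" carry.
 * For the hypothetical scalar 0x1e3, bits [3..7] come out as 0x1c.
 */
static void get_wval_demo(void)
{
    static const byte scalar[2] = { 0xe3, 0x01 }; /* 0x1e3, little-endian */
    limb_t w = get_wval(scalar, 3, 5) & 0x1f;     /* w == 0x1c */
    (void)w;
}
#endif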
/* Works up to 25 bits. */
static limb_t get_wval_limb(const byte *d, size_t off, size_t bits)
{
    size_t i, top = (off + bits - 1)/8;
    limb_t ret, mask = (limb_t)0 - 1;

    d += off/8;
    top -= off/8-1;

    /* this is not about constant-time-ness, but branch optimization */
    for (ret=0, i=0; i<4;) {
        ret |= (*d & mask) << (8*i);
        mask = (limb_t)0 - ((++i - top) >> (8*sizeof(top)-1));
        d += 1 & mask;
    }

    return ret >> (off%8);
}

/*
 * Window value encoding that utilizes the fact that -P is trivially
 * calculated, which allows to halve the size of pre-computed table,
 * is attributed to A. D. Booth, hence the name of the subroutines...
 */
static limb_t booth_encode(limb_t wval, size_t sz)
{
    limb_t mask = 0 - (wval >> sz);     /* "sign" bit -> mask */

    launder(mask);
    wval = (wval + 1) >> 1;
    wval = (wval ^ mask) - mask;

    /* &0x1f, but <=0x10, is index in table, rest is extended "sign" bit */
    return wval;
}

/*
 * Key feature of these constant-time subroutines is that they tolerate
 * zeros in most significant bit positions of the scalar[s], or in other
 * words, zero-padded scalar values. This means that one can and should
 * pass order's bit-length, which is customarily publicly known, instead
 * of the factual scalars' bit-lengths. This is facilitated by point
 * addition subroutines implemented to handle points at infinity, which
 * are encoded as Z==0. [Doubling algorithms handle such points at
 * infinity "naturally," since resulting Z is product of original Z.]
 */
#define POINT_MULT_SCALAR_WX_IMPL(ptype, SZ) \
static bool_t ptype##_gather_booth_w##SZ(ptype *restrict p, \
                                         const ptype table[1<<(SZ-1)], \
                                         limb_t booth_idx) \
{ \
    size_t i; \
    bool_t booth_sign = (booth_idx >> SZ) & 1; \
\
    booth_idx &= (1<<SZ) - 1; \
    vec_zero(p, sizeof(ptype));     /* implicit infinity at table[-1] */\
    for (i = 0; i < 1<<(SZ-1); i++) \
        ptype##_ccopy(p, &table[i], byte_is_zero((byte)(booth_idx ^ (i+1)))); \
    ptype##_cneg(p, booth_sign); \
\
    return vec_is_zero(p, sizeof(ptype)); \
} \
\
static void ptype##_precompute_w##SZ(ptype row[], const ptype *point) \
{ \
    size_t i, j; \
                                    /* row[-1] is implicit infinity */\
    vec_copy(row++, point, sizeof(ptype));           /* row[0]=p*1 */\
    ptype##_double(row, point);                      /* row[1]=p*2 */\
    for (i = 2, j = 1; i < 1<<(SZ-1); i += 2, j++) \
        ptype##_add(row + i, row + j, row + j - 1),  /* row[2]=p*3 */\
        ptype##_double(row + i + 1, row + j);        /* row[3]=p*4 */\
} \
\
static void ptype##s_mult_w##SZ(ptype *ret, \
                                const ptype *points[], size_t npoints, \
                                const byte *scalars[], size_t bits, \
                                ptype table[][1<<(SZ-1)]) \
{ \
    limb_t wmask, wval; \
    size_t i, j, window, nbytes; \
    const byte *scalar, **scalar_s = scalars; \
    ptype sum[1], row[1]; \
    bool_t sum_is_inf, row_is_inf, ret_is_inf; \
\
    if (points != NULL) { \
        const ptype *point = NULL; \
        for (i = 0; i < npoints; i++) \
            point = *points ? *points++ : point + 1, \
            ptype##_precompute_w##SZ(table[i], point); \
    } \
\
    nbytes = (bits + 7)/8; /* convert |bits| to bytes */\
    scalar = *scalar_s++; \
\
    /* top excess bits modulo target window size */ \
    window = bits % SZ; /* yes, it may be zero */ \
    wmask = ((limb_t)1 << (window + 1)) - 1; \
\
    bits -= window; \
    if (bits > 0) \
        wval = get_wval(scalar, bits - 1, window + 1) & wmask; \
    else \
        wval = (scalar[0] << 1) & wmask; \
\
    wval = booth_encode(wval, SZ); \
    ret_is_inf = ptype##_gather_booth_w##SZ(ret, table[0], wval); \
\
    i = 1; \
    while (bits > 0) { \
        for (; i < npoints; i++) { \
            scalar = *scalar_s ? *scalar_s++ : scalar+nbytes; \
            wval = get_wval(scalar, bits - 1, window + 1) & wmask; \
            wval = booth_encode(wval, SZ); \
            row_is_inf = ptype##_gather_booth_w##SZ(row, table[i], wval); \
            ptype##_dadd(sum, ret, row, NULL); \
            ptype##_ccopy(ret, sum, (ret_is_inf | row_is_inf) ^ 1); \
            sum_is_inf = vec_is_zero(ret->Z, sizeof(ret->Z)); \
            ret_is_inf |= sum_is_inf; \
            row_is_inf |= sum_is_inf; \
            ptype##_ccopy(ret, row, ret_is_inf); \
            ret_is_inf &= row_is_inf; \
        } \
\
        for (j = 0; j < SZ; j++) \
            ptype##_double(ret, ret); \
\
        window = SZ; \
        wmask = ((limb_t)1 << (window + 1)) - 1; \
        bits -= window; \
        i = 0; scalar_s = scalars; \
    } \
\
    for (; i < npoints; i++) { \
        scalar = *scalar_s ?
*scalar_s++ : scalar+nbytes; \ wval = (scalar[0] << 1) & wmask; \ wval = booth_encode(wval, SZ); \ row_is_inf = ptype##_gather_booth_w##SZ(row, table[i], wval); \ ptype##_dadd(sum, ret, row, NULL); \ ptype##_ccopy(ret, sum, (ret_is_inf | row_is_inf) ^ 1); \ sum_is_inf = vec_is_zero(ret->Z, sizeof(ret->Z)); \ ret_is_inf |= sum_is_inf; \ row_is_inf |= sum_is_inf; \ ptype##_ccopy(ret, row, ret_is_inf); \ ret_is_inf &= row_is_inf; \ } \ \ vec_czero(ret->Z, sizeof(ret->Z), ret_is_inf); \ } \ \ static void ptype##_mult_w##SZ(ptype *ret, const ptype *point, \ const byte *scalar, size_t bits) \ { \ limb_t wmask, wval; \ size_t j, window; \ ptype sum[1], row[1]; \ bool_t sum_is_inf, row_is_inf, ret_is_inf; \ ptype table[1<<(SZ-1)]; \ \ ptype##_precompute_w##SZ(table, point); \ \ /* top excess bits modulo target window size */ \ window = bits % SZ; /* yes, it may be zero */ \ wmask = ((limb_t)1 << (window + 1)) - 1; \ \ bits -= window; \ wval = bits ? get_wval(scalar, bits - 1, window + 1) \ : (limb_t)scalar[0] << 1; \ wval &= wmask; \ wval = booth_encode(wval, SZ); \ ret_is_inf = ptype##_gather_booth_w##SZ(ret, table, wval); \ \ while (bits > 0) { \ for (j = 0; j < SZ; j++) \ ptype##_double(ret, ret); \ \ window = SZ; \ wmask = ((limb_t)1 << (window + 1)) - 1; \ bits -= window; \ \ wval = bits ? get_wval(scalar, bits - 1, window + 1) \ : (limb_t)scalar[0] << 1; \ wval &= wmask; \ wval = booth_encode(wval, SZ); \ row_is_inf = ptype##_gather_booth_w##SZ(row, table, wval); \ ptype##_dadd(sum, ret, row, NULL); \ ptype##_ccopy(ret, sum, (ret_is_inf | row_is_inf) ^ 1); \ sum_is_inf = vec_is_zero(ret->Z, sizeof(ret->Z)); \ ret_is_inf |= sum_is_inf; \ row_is_inf |= sum_is_inf; \ ptype##_ccopy(ret, row, ret_is_inf); \ ret_is_inf &= row_is_inf; \ } \ \ vec_czero(ret->Z, sizeof(ret->Z), ret_is_inf); \ } #if 0 /* ~50%, or ~2x[!] slower than w5... */ #define POINT_MULT_SCALAR_LADDER_IMPL(ptype) \ static void ptype##_mult_ladder(ptype *ret, const ptype *p, \ const byte *scalar, size_t bits) \ { \ ptype sum[1]; \ bool_t bit, pbit = 0; \ \ vec_copy(sum, p, sizeof(ptype)); \ vec_zero(ret, sizeof(ptype)); /* infinity */ \ \ while (bits--) { \ bit = is_bit_set(scalar, bits); \ bit ^= pbit; \ ptype##_cswap(ret, sum, bit); \ ptype##_add(sum, sum, ret); \ ptype##_double(ret, ret); \ pbit ^= bit; \ } \ ptype##_cswap(ret, sum, pbit); \ } #else /* >40% better performance than above, [and ~30% slower than w5]... */ #define POINT_MULT_SCALAR_LADDER_IMPL(ptype) \ static void ptype##_mult_ladder(ptype *out, const ptype *p, \ const byte *scalar, size_t bits) \ { \ ptype##xz sum[1]; \ ptype##xz pxz[1]; \ ptype##xz ret[1]; \ bool_t bit, pbit = 0; \ \ ptype##xz_ladder_pre(pxz, p); \ vec_copy(sum, pxz, sizeof(ptype##xz)); \ vec_zero(ret, sizeof(ptype##xz)); /* infinity */ \ \ while (bits--) { \ bit = is_bit_set(scalar, bits); \ bit ^= pbit; \ ptype##xz_cswap(ret, sum, bit); \ ptype##xz_ladder_step(ret, sum, pxz); \ pbit ^= bit; \ } \ ptype##xz_cswap(ret, sum, pbit); \ ptype##xz_ladder_post(out, ret, sum, pxz, p->Y); \ } #endif /* * Sole reason for existence of this implementation is that addition * with affine point renders a share of multiplications redundant by * virtue of Z==1. And since pre-defined generator point can be and * customarily is instantiated affine, it would be hardly appropriate * to pass on this opportunity. Though while it's faster than the * generic ladder implementation, by ~25%, it's not faster than XZ one * above, <15% slower. 
Just in case, it's faster than generic ladder * even if one accounts for prior conversion to affine coordinates, * so that choice [for resource-constrained case] is actually between * this plus said conversion and XZ ladder... * * To summarize, if ptype##_mult_w5 executed in one unit of time, then * - naive ptype##_mult_ladder would execute in ~2; * - XZ version above - in ~1.4; * - ptype##_affine_mult_ladder below - in ~1.65; * - [small-footprint ptype##_to_affine would run in ~0.18]. * * Caveat lector, |p_affine|*(order+2) produces wrong result, because * addition doesn't handle doubling. Indeed, P*(order+1) is P and it * fails to add with itself producing infinity in last addition. But * as long as |scalar| is reduced modulo order, as it should be, it's * not a problem... */ #define POINT_AFFINE_MULT_SCALAR_IMPL(ptype) \ static void ptype##_affine_mult_ladder(ptype *ret, \ const ptype##_affine *p_affine, \ const byte *scalar, size_t bits) \ { \ ptype sum[1]; \ bool_t bit; \ \ vec_zero(ret, sizeof(ptype)); /* infinity */ \ \ while (bits--) { \ ptype##_double(ret, ret); \ ptype##_add_affine(sum, ret, p_affine); \ bit = (scalar[bits / LIMB_T_BITS] >> (bits % LIMB_T_BITS)) & 1; \ ptype##_ccopy(ret, sum, bit); \ } \ } #endif ================================================ FILE: src/ec_ops.h ================================================ /* * Copyright Supranational LLC * Licensed under the Apache License, Version 2.0, see LICENSE for details. * SPDX-License-Identifier: Apache-2.0 */ #ifndef __BLS12_384_ASM_EC_OPS_H__ #define __BLS12_384_ASM_EC_OPS_H__ /* * Addition that can handle doubling [as well as points at infinity, * which are encoded as Z==0] in constant time. It naturally comes at * cost, but this subroutine should be called only when independent * points are processed, which is considered reasonable compromise. * For example, ptype##s_mult_w5 calls it, but since *major* gain is * result of pure doublings being effectively divided by amount of * points, slightly slower addition can be tolerated. But what is the * additional cost more specifically? Best addition result is 11M+5S, * while this routine takes 13M+5S (+1M+1S if a4!=0), as per * * -------------+------------- * addition | doubling * -------------+------------- * U1 = X1*Z2^2 | U1 = X1 * U2 = X2*Z1^2 | * S1 = Y1*Z2^3 | S1 = Y1 * S2 = Y2*Z1^3 | * zz = Z1*Z2 | zz = Z1 * H = U2-U1 | H' = 2*Y1 * R = S2-S1 | R' = 3*X1^2[+a*Z1^4] * sx = U1+U2 | sx = X1+X1 * -------------+------------- * H!=0 || R!=0 | H==0 && R==0 * * X3 = R^2-H^2*sx * Y3 = R*(H^2*U1-X3)-H^3*S1 * Z3 = H*zz * * As for R!=0 condition in context of H==0, a.k.a. P-P. The result is * infinity by virtue of Z3 = (U2-U1)*zz = H*zz = 0*zz == 0. 
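 */

#if 0   /* editorial illustration, not part of upstream blst */
/*
 * Sketch of how the generated subroutine is consumed: e2.c above
 * instantiates POINT_DADD_IMPL(POINTonE2, 384x, fp2) and wraps the
 * result as blst_p2_add_or_double(). Passing a4==NULL skips the
 * a*Z1^4 term, which is correct for BLS12-381 where a==0.
 */
static void dadd_usage_sketch(POINTonE2 *dbl, const POINTonE2 *p)
{   POINTonE2_dadd(dbl, p, p, NULL);    /* p+p takes the doubling path */   }
#endif

/*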
*/ #define POINT_DADD_IMPL(ptype, bits, field) \ static void ptype##_dadd(ptype *out, const ptype *p1, const ptype *p2, \ const vec##bits a4) \ { \ ptype p3; /* starts as (U1, S1, zz) from addition side */\ struct { vec##bits H, R, sx; } add, dbl; \ bool_t p1inf, p2inf, is_dbl; \ \ add_##field(dbl.sx, p1->X, p1->X); /* sx = X1+X1 */\ sqr_##field(dbl.R, p1->X); /* X1^2 */\ mul_by_3_##field(dbl.R, dbl.R); /* R = 3*X1^2 */\ add_##field(dbl.H, p1->Y, p1->Y); /* H = 2*Y1 */\ \ p2inf = vec_is_zero(p2->Z, sizeof(p2->Z)); \ sqr_##field(p3.X, p2->Z); /* Z2^2 */\ mul_##field(p3.Z, p1->Z, p2->Z); /* Z1*Z2 */\ p1inf = vec_is_zero(p1->Z, sizeof(p1->Z)); \ sqr_##field(add.H, p1->Z); /* Z1^2 */\ \ if (a4 != NULL) { \ sqr_##field(p3.Y, add.H); /* Z1^4, [borrow p3.Y] */\ mul_##field(p3.Y, p3.Y, a4); \ add_##field(dbl.R, dbl.R, p3.Y);/* R = 3*X1^2+a*Z1^4 */\ } \ \ mul_##field(p3.Y, p1->Y, p2->Z); \ mul_##field(p3.Y, p3.Y, p3.X); /* S1 = Y1*Z2^3 */\ mul_##field(add.R, p2->Y, p1->Z); \ mul_##field(add.R, add.R, add.H); /* S2 = Y2*Z1^3 */\ sub_##field(add.R, add.R, p3.Y); /* R = S2-S1 */\ \ mul_##field(p3.X, p3.X, p1->X); /* U1 = X1*Z2^2 */\ mul_##field(add.H, add.H, p2->X); /* U2 = X2*Z1^2 */\ \ add_##field(add.sx, add.H, p3.X); /* sx = U1+U2 */\ sub_##field(add.H, add.H, p3.X); /* H = U2-U1 */\ \ /* make the choice between addition and doubling */\ is_dbl = vec_is_zero(add.H, 2*sizeof(add.H)); \ vec_select(&p3, p1, &p3, sizeof(p3), is_dbl); \ vec_select(&add, &dbl, &add, sizeof(add), is_dbl); \ /* |p3| and |add| hold all inputs now, |p3| will hold output */\ \ mul_##field(p3.Z, p3.Z, add.H); /* Z3 = H*Z1*Z2 */\ \ sqr_##field(dbl.H, add.H); /* H^2 */\ mul_##field(dbl.R, dbl.H, add.H); /* H^3 */\ mul_##field(dbl.R, dbl.R, p3.Y); /* H^3*S1 */\ mul_##field(p3.Y, dbl.H, p3.X); /* H^2*U1 */\ \ mul_##field(dbl.H, dbl.H, add.sx); /* H^2*sx */\ sqr_##field(p3.X, add.R); /* R^2 */\ sub_##field(p3.X, p3.X, dbl.H); /* X3 = R^2-H^2*sx */\ \ sub_##field(p3.Y, p3.Y, p3.X); /* H^2*U1-X3 */\ mul_##field(p3.Y, p3.Y, add.R); /* R*(H^2*U1-X3) */\ sub_##field(p3.Y, p3.Y, dbl.R); /* Y3 = R*(H^2*U1-X3)-H^3*S1 */\ \ vec_select(&p3, p1, &p3, sizeof(ptype), p2inf); \ vec_select(out, p2, &p3, sizeof(ptype), p1inf); \ } /* * Addition with affine point that can handle doubling [as well as * points at infinity, with |p1| being encoded as Z==0 and |p2| as * X,Y==0] in constant time. But at what additional cost? Best * addition result is 7M+4S, while this routine takes 8M+5S, as per * * -------------+------------- * addition | doubling * -------------+------------- * U1 = X1 | U1 = X2 * U2 = X2*Z1^2 | * S1 = Y1 | S1 = Y2 * S2 = Y2*Z1^3 | * H = U2-X1 | H' = 2*Y2 * R = S2-Y1 | R' = 3*X2^2[+a] * sx = X1+U2 | sx = X2+X2 * zz = H*Z1 | zz = H' * -------------+------------- * H!=0 || R!=0 | H==0 && R==0 * * X3 = R^2-H^2*sx * Y3 = R*(H^2*U1-X3)-H^3*S1 * Z3 = zz * * As for R!=0 condition in context of H==0, a.k.a. P-P. The result is * infinity by virtue of Z3 = (U2-U1)*zz = H*zz = 0*zz == 0. 
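 * The premium for the constant-time unification is thus 1M+1S over
 * the dedicated 7M+4S mixed addition.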
*/ #define POINT_DADD_AFFINE_IMPL_A0(ptype, bits, field, one) \ static void ptype##_dadd_affine(ptype *out, const ptype *p1, \ const ptype##_affine *p2) \ { \ ptype p3; /* starts as (,, H*Z1) from addition side */\ struct { vec##bits H, R, sx; } add, dbl; \ bool_t p1inf, p2inf, is_dbl; \ \ p2inf = vec_is_zero(p2->X, 2*sizeof(p2->X)); \ add_##field(dbl.sx, p2->X, p2->X); /* sx = X2+X2 */\ sqr_##field(dbl.R, p2->X); /* X2^2 */\ mul_by_3_##field(dbl.R, dbl.R); /* R = 3*X2^2 */\ add_##field(dbl.H, p2->Y, p2->Y); /* H = 2*Y2 */\ \ p1inf = vec_is_zero(p1->Z, sizeof(p1->Z)); \ sqr_##field(add.H, p1->Z); /* Z1^2 */\ mul_##field(add.R, add.H, p1->Z); /* Z1^3 */\ mul_##field(add.R, add.R, p2->Y); /* S2 = Y2*Z1^3 */\ sub_##field(add.R, add.R, p1->Y); /* R = S2-Y1 */\ \ mul_##field(add.H, add.H, p2->X); /* U2 = X2*Z1^2 */\ \ add_##field(add.sx, add.H, p1->X); /* sx = X1+U2 */\ sub_##field(add.H, add.H, p1->X); /* H = U2-X1 */\ \ mul_##field(p3.Z, add.H, p1->Z); /* Z3 = H*Z1 */\ \ /* make the choice between addition and doubling */ \ is_dbl = vec_is_zero(add.H, 2*sizeof(add.H)); \ vec_select(p3.X, p2, p1, 2*sizeof(p3.X), is_dbl); \ vec_select(p3.Z, dbl.H, p3.Z, sizeof(p3.Z), is_dbl);\ vec_select(&add, &dbl, &add, sizeof(add), is_dbl); \ /* |p3| and |add| hold all inputs now, |p3| will hold output */\ \ sqr_##field(dbl.H, add.H); /* H^2 */\ mul_##field(dbl.R, dbl.H, add.H); /* H^3 */\ mul_##field(dbl.R, dbl.R, p3.Y); /* H^3*S1 */\ mul_##field(p3.Y, dbl.H, p3.X); /* H^2*U1 */\ \ mul_##field(dbl.H, dbl.H, add.sx); /* H^2*sx */\ sqr_##field(p3.X, add.R); /* R^2 */\ sub_##field(p3.X, p3.X, dbl.H); /* X3 = R^2-H^2*sx */\ \ sub_##field(p3.Y, p3.Y, p3.X); /* H^2*U1-X3 */\ mul_##field(p3.Y, p3.Y, add.R); /* R*(H^2*U1-X3) */\ sub_##field(p3.Y, p3.Y, dbl.R); /* Y3 = R*(H^2*U1-X3)-H^3*S1 */\ \ vec_select(p3.X, p2, p3.X, 2*sizeof(p3.X), p1inf); \ vec_select(p3.Z, one, p3.Z, sizeof(p3.Z), p1inf); \ vec_select(out, p1, &p3, sizeof(ptype), p2inf); \ } /* * https://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-0.html#addition-add-2007-bl * with twist to handle either input at infinity, which are encoded as Z==0. 
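 * Note that, unlike the dadd flavour above, this formula does not
 * handle p1==p2 [H==0 and r==0 degenerate to Z3==0]; use ptype##_dadd
 * when the inputs may coincide.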
*/ #define POINT_ADD_IMPL(ptype, bits, field) \ static void ptype##_add(ptype *out, const ptype *p1, const ptype *p2) \ { \ ptype p3; \ vec##bits Z1Z1, Z2Z2, U1, S1, H, I, J; \ bool_t p1inf, p2inf; \ \ p1inf = vec_is_zero(p1->Z, sizeof(p1->Z)); \ sqr_##field(Z1Z1, p1->Z); /* Z1Z1 = Z1^2 */\ \ mul_##field(p3.Z, Z1Z1, p1->Z); /* Z1*Z1Z1 */\ mul_##field(p3.Z, p3.Z, p2->Y); /* S2 = Y2*Z1*Z1Z1 */\ \ p2inf = vec_is_zero(p2->Z, sizeof(p2->Z)); \ sqr_##field(Z2Z2, p2->Z); /* Z2Z2 = Z2^2 */\ \ mul_##field(S1, Z2Z2, p2->Z); /* Z2*Z2Z2 */\ mul_##field(S1, S1, p1->Y); /* S1 = Y1*Z2*Z2Z2 */\ \ sub_##field(p3.Z, p3.Z, S1); /* S2-S1 */\ add_##field(p3.Z, p3.Z, p3.Z); /* r = 2*(S2-S1) */\ \ mul_##field(U1, p1->X, Z2Z2); /* U1 = X1*Z2Z2 */\ mul_##field(H, p2->X, Z1Z1); /* U2 = X2*Z1Z1 */\ \ sub_##field(H, H, U1); /* H = U2-U1 */\ \ add_##field(I, H, H); /* 2*H */\ sqr_##field(I, I); /* I = (2*H)^2 */\ \ mul_##field(J, H, I); /* J = H*I */\ mul_##field(S1, S1, J); /* S1*J */\ \ mul_##field(p3.Y, U1, I); /* V = U1*I */\ \ sqr_##field(p3.X, p3.Z); /* r^2 */\ sub_##field(p3.X, p3.X, J); /* r^2-J */\ sub_##field(p3.X, p3.X, p3.Y); \ sub_##field(p3.X, p3.X, p3.Y); /* X3 = r^2-J-2*V */\ \ sub_##field(p3.Y, p3.Y, p3.X); /* V-X3 */\ mul_##field(p3.Y, p3.Y, p3.Z); /* r*(V-X3) */\ sub_##field(p3.Y, p3.Y, S1); \ sub_##field(p3.Y, p3.Y, S1); /* Y3 = r*(V-X3)-2*S1*J */\ \ add_##field(p3.Z, p1->Z, p2->Z); /* Z1+Z2 */\ sqr_##field(p3.Z, p3.Z); /* (Z1+Z2)^2 */\ sub_##field(p3.Z, p3.Z, Z1Z1); /* (Z1+Z2)^2-Z1Z1 */\ sub_##field(p3.Z, p3.Z, Z2Z2); /* (Z1+Z2)^2-Z1Z1-Z2Z2 */\ mul_##field(p3.Z, p3.Z, H); /* Z3 = ((Z1+Z2)^2-Z1Z1-Z2Z2)*H */\ \ vec_select(&p3, p1, &p3, sizeof(ptype), p2inf); \ vec_select(out, p2, &p3, sizeof(ptype), p1inf); \ } /* * https://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-0.html#addition-madd-2007-bl * with twist to handle either input at infinity, with |p1| encoded as Z==0, * and |p2| as X==Y==0. 
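 * [The all-zero affine pair can stand in for infinity because (0,0)
 * never satisfies y^2 = x^3 + B when B != 0.]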
*/ #define POINT_ADD_AFFINE_IMPL(ptype, bits, field, one) \ static void ptype##_add_affine(ptype *out, const ptype *p1, \ const ptype##_affine *p2) \ { \ ptype p3; \ vec##bits Z1Z1, H, HH, I, J; \ bool_t p1inf, p2inf; \ \ p1inf = vec_is_zero(p1->Z, sizeof(p1->Z)); \ \ sqr_##field(Z1Z1, p1->Z); /* Z1Z1 = Z1^2 */\ \ mul_##field(p3.Z, Z1Z1, p1->Z); /* Z1*Z1Z1 */\ mul_##field(p3.Z, p3.Z, p2->Y); /* S2 = Y2*Z1*Z1Z1 */\ \ p2inf = vec_is_zero(p2->X, 2*sizeof(p2->X)); \ \ mul_##field(H, p2->X, Z1Z1); /* U2 = X2*Z1Z1 */\ sub_##field(H, H, p1->X); /* H = U2-X1 */\ \ sqr_##field(HH, H); /* HH = H^2 */\ add_##field(I, HH, HH); \ add_##field(I, I, I); /* I = 4*HH */\ \ mul_##field(p3.Y, p1->X, I); /* V = X1*I */\ mul_##field(J, H, I); /* J = H*I */\ mul_##field(I, J, p1->Y); /* Y1*J */\ \ sub_##field(p3.Z, p3.Z, p1->Y); /* S2-Y1 */\ add_##field(p3.Z, p3.Z, p3.Z); /* r = 2*(S2-Y1) */\ \ sqr_##field(p3.X, p3.Z); /* r^2 */\ sub_##field(p3.X, p3.X, J); /* r^2-J */\ sub_##field(p3.X, p3.X, p3.Y); \ sub_##field(p3.X, p3.X, p3.Y); /* X3 = r^2-J-2*V */\ \ sub_##field(p3.Y, p3.Y, p3.X); /* V-X3 */\ mul_##field(p3.Y, p3.Y, p3.Z); /* r*(V-X3) */\ sub_##field(p3.Y, p3.Y, I); \ sub_##field(p3.Y, p3.Y, I); /* Y3 = r*(V-X3)-2*Y1*J */\ \ add_##field(p3.Z, p1->Z, H); /* Z1+H */\ sqr_##field(p3.Z, p3.Z); /* (Z1+H)^2 */\ sub_##field(p3.Z, p3.Z, Z1Z1); /* (Z1+H)^2-Z1Z1 */\ sub_##field(p3.Z, p3.Z, HH); /* Z3 = (Z1+H)^2-Z1Z1-HH */\ \ vec_select(p3.Z, one, p3.Z, sizeof(p3.Z), p1inf); \ vec_select(p3.X, p2, p3.X, 2*sizeof(p3.X), p1inf); \ vec_select(out, p1, &p3, sizeof(ptype), p2inf); \ } /* * https://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-0.html#doubling-dbl-2009-l */ #define POINT_DOUBLE_IMPL_A0(ptype, bits, field) \ static void ptype##_double(ptype *p3, const ptype *p1) \ { \ vec##bits A, B, C; \ \ sqr_##field(A, p1->X); /* A = X1^2 */\ sqr_##field(B, p1->Y); /* B = Y1^2 */\ sqr_##field(C, B); /* C = B^2 */\ \ add_##field(B, B, p1->X); /* X1+B */\ sqr_##field(B, B); /* (X1+B)^2 */\ sub_##field(B, B, A); /* (X1+B)^2-A */\ sub_##field(B, B, C); /* (X1+B)^2-A-C */\ add_##field(B, B, B); /* D = 2*((X1+B)^2-A-C) */\ \ mul_by_3_##field(A, A); /* E = 3*A */\ \ sqr_##field(p3->X, A); /* F = E^2 */\ sub_##field(p3->X, p3->X, B); \ sub_##field(p3->X, p3->X, B); /* X3 = F-2*D */\ \ add_##field(p3->Z, p1->Z, p1->Z); /* 2*Z1 */\ mul_##field(p3->Z, p3->Z, p1->Y); /* Z3 = 2*Z1*Y1 */\ \ mul_by_8_##field(C, C); /* 8*C */\ sub_##field(p3->Y, B, p3->X); /* D-X3 */\ mul_##field(p3->Y, p3->Y, A); /* E*(D-X3) */\ sub_##field(p3->Y, p3->Y, C); /* Y3 = E*(D-X3)-8*C */\ } #define POINT_LADDER_PRE_IMPL(ptype, bits, field) \ static void ptype##xz_ladder_pre(ptype##xz *pxz, const ptype *p) \ { \ mul_##field(pxz->X, p->X, p->Z); /* X2 = X1*Z1 */\ sqr_##field(pxz->Z, p->Z); \ mul_##field(pxz->Z, pxz->Z, p->Z); /* Z2 = Z1^3 */\ } /* * https://hyperelliptic.org/EFD/g1p/auto-shortw-xz.html#ladder-ladd-2002-it-3 * with twist to handle either input at infinity, which are encoded as Z==0. * Just in case, order of doubling and addition is reverse in comparison to * hyperelliptic.org entry. This was done to minimize temporary storage. * * XZ1 is |p|, XZ2&XZ4 are in&out |r|, XZ3&XZ5 are in&out |s|. 
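 * Throughout the ladder the x-only difference between |s| and |r|
 * remains |p|, which is what makes the y-recovery in the post-step
 * below possible.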
*/ #define POINT_LADDER_STEP_IMPL_A0(ptype, bits, field, suffix4b) \ static void ptype##xz_ladder_step(ptype##xz *r, ptype##xz *s, \ const ptype##xz *p) \ { \ ptype##xz p5; \ vec##bits A, B, C, D, XX, ZZ; \ bool_t r_inf, s_inf; \ /* s += r */\ mul_##field(A, r->X, s->X); /* A = X2*X3 */\ mul_##field(B, r->Z, s->Z); /* B = Z2*Z3 */\ mul_##field(C, r->X, s->Z); /* C = X2*Z3 */\ mul_##field(D, r->Z, s->X); /* D = X3*Z2 */\ \ sqr_##field(A, A); /* (A[-a*B])^2 */\ add_##field(p5.X, C, D); /* C+D */\ mul_##field(p5.X, p5.X, B); /* B*(C+D) */\ mul_by_4b_##suffix4b(B, p5.X); /* b4*B*(C+D) */\ sub_##field(p5.X, A, B); /* (A[-a*B])^2-b4*B*(C+D) */\ mul_##field(p5.X, p5.X, p->Z); /* X5 = Z1*((A[-a*B])^2-b4*B*(C+D)) */\ \ sub_##field(p5.Z, C, D); /* C-D */\ sqr_##field(p5.Z, p5.Z); /* (C-D)^2 */\ mul_##field(p5.Z, p5.Z, p->X); /* Z5 = X1*(C-D)^2 */\ \ r_inf = vec_is_zero(r->Z, sizeof(r->Z)); \ s_inf = vec_is_zero(s->Z, sizeof(s->Z)); \ \ vec_select(&p5, r, &p5, sizeof(ptype##xz), s_inf); \ vec_select(s, s, &p5, sizeof(ptype##xz), r_inf); \ /* r *= 2 */\ sqr_##field(XX, r->X); /* XX = X2^2 */\ sqr_##field(ZZ, r->Z); /* ZZ = Z2^2 */\ \ add_##field(r->Z, r->X, r->Z); /* X2+Z2 */\ sqr_##field(r->Z, r->Z); /* (X2+Z2)^2 */\ sub_##field(r->Z, r->Z, XX); /* (X2+Z2)^2-XX */\ sub_##field(r->Z, r->Z, ZZ); /* E = (X2+Z2)^2-XX-ZZ */\ \ sqr_##field(A, XX); /* (XX[-a*ZZ])^2 */\ mul_##field(B, r->Z, ZZ); /* E*ZZ */\ mul_by_4b_##suffix4b(C, B); /* b4*E*ZZ */\ sub_##field(r->X, A, C); /* X4 = (XX[-a*ZZ])^2-b4*E*ZZ */\ \ sqr_##field(ZZ, ZZ); /* ZZ^2 */\ mul_by_4b_##suffix4b(B, ZZ); /* b4*ZZ^2 */\ mul_##field(r->Z, r->Z, XX); /* E*(XX[+a*ZZ]) */\ add_##field(r->Z, r->Z, r->Z); /* 2*E*(XX[+a*ZZ]) */\ add_##field(r->Z, r->Z, B); /* Z4 = 2*E*(XX[+a*ZZ])+b4*ZZ^2 */\ } /* * Recover the |r|'s y-coordinate using Eq. (8) from Brier-Joye, * "Weierstraß Elliptic Curves and Side-Channel Attacks", with XZ twist * and conversion to Jacobian coordinates from /.../ecp_smpl.c, * and with twist to recover from |s| at infinity [which occurs when * multiplying by (order-1)]. * * X4 = 2*Y1*X2*Z3*Z1*Z2 * Y4 = 2*b*Z3*(Z1*Z2)^2 + Z3*(a*Z1*Z2+X1*X2)*(X1*Z2+X2*Z1) - X3*(X1*Z2-X2*Z1)^2 * Z4 = 2*Y1*Z3*Z2^2*Z1 * * Z3x2 = 2*Z3 * Y1Z3x2 = Y1*Z3x2 * Z1Z2 = Z1*Z2 * X1Z2 = X1*Z2 * X2Z1 = X2*Z1 * X4 = Y1Z3x2*X2*Z1Z2 * A = b*Z3x2*(Z1Z2)^2 * B = Z3*(a*Z1Z2+X1*X2)*(X1Z2+X2Z1) * C = X3*(X1Z2-X2Z1)^2 * Y4 = A+B-C * Z4 = Y1Z3x2*Z1Z2*Z2 * * XZ1 is |p|, XZ2 is |r|, XZ3 is |s|, 'a' is 0. 
*/ #define POINT_LADDER_POST_IMPL_A0(ptype, bits, field, suffixb) \ static void ptype##xz_ladder_post(ptype *p4, \ const ptype##xz *r, const ptype##xz *s, \ const ptype##xz *p, const vec##bits Y1) \ { \ vec##bits Z3x2, Y1Z3x2, Z1Z2, X1Z2, X2Z1, A, B, C; \ bool_t s_inf; \ \ add_##field(Z3x2, s->Z, s->Z); /* Z3x2 = 2*Z3 */\ mul_##field(Y1Z3x2, Y1, Z3x2); /* Y1Z3x2 = Y1*Z3x2 */\ mul_##field(Z1Z2, p->Z, r->Z); /* Z1Z2 = Z1*Z2 */\ mul_##field(X1Z2, p->X, r->Z); /* X1Z2 = X1*Z2 */\ mul_##field(X2Z1, r->X, p->Z); /* X2Z1 = X2*Z1 */\ \ mul_##field(p4->X, Y1Z3x2, r->X); /* Y1Z3x2*X2 */\ mul_##field(p4->X, p4->X, Z1Z2); /* X4 = Y1Z3x2*X2*Z1Z2 */\ \ sqr_##field(A, Z1Z2); /* (Z1Z2)^2 */\ mul_##field(B, A, Z3x2); /* Z3x2*(Z1Z2)^2 */\ mul_by_b_##suffixb(A, B); /* A = b*Z3x2*(Z1Z2)^2 */\ \ mul_##field(B, p->X, r->X); /* [a*Z1Z2+]X1*X2 */\ mul_##field(B, B, s->Z); /* Z3*([a*Z1Z2+]X1*X2) */\ add_##field(C, X1Z2, X2Z1); /* X1Z2+X2Z1 */\ mul_##field(B, B, C); /* B = Z3*([a*Z2Z1+]X1*X2)*(X1Z2+X2Z1) */\ \ sub_##field(C, X1Z2, X2Z1); /* X1Z2-X2Z1 */\ sqr_##field(C, C); /* (X1Z2-X2Z1)^2 */\ mul_##field(C, C, s->X); /* C = X3*(X1Z2-X2Z1)^2 */\ \ add_##field(A, A, B); /* A+B */\ sub_##field(A, A, C); /* Y4 = A+B-C */\ \ mul_##field(p4->Z, Z1Z2, r->Z); /* Z1Z2*Z2 */\ mul_##field(p4->Z, p4->Z, Y1Z3x2); /* Y1Z3x2*Z1Z2*Z2 */\ \ s_inf = vec_is_zero(s->Z, sizeof(s->Z)); \ vec_select(p4->X, p->X, p4->X, sizeof(p4->X), s_inf); \ vec_select(p4->Y, Y1, A, sizeof(p4->Y), s_inf); \ vec_select(p4->Z, p->Z, p4->Z, sizeof(p4->Z), s_inf); \ ptype##_cneg(p4, s_inf); \ /* to Jacobian */\ mul_##field(p4->X, p4->X, p4->Z); /* X4 = X4*Z4 */\ sqr_##field(B, p4->Z); \ mul_##field(p4->Y, p4->Y, B); /* Y4 = Y4*Z4^2 */\ } #define POINT_IS_EQUAL_IMPL(ptype, bits, field) \ static limb_t ptype##_is_equal(const ptype *p1, const ptype *p2) \ { \ vec##bits Z1Z1, Z2Z2; \ ptype##_affine a1, a2; \ bool_t is_inf1 = vec_is_zero(p1->Z, sizeof(p1->Z)); \ bool_t is_inf2 = vec_is_zero(p2->Z, sizeof(p2->Z)); \ \ sqr_##field(Z1Z1, p1->Z); /* Z1Z1 = Z1^2 */\ sqr_##field(Z2Z2, p2->Z); /* Z2Z2 = Z2^2 */\ \ mul_##field(a1.X, p1->X, Z2Z2); /* U1 = X1*Z2Z2 */\ mul_##field(a2.X, p2->X, Z1Z1); /* U2 = X2*Z1Z1 */\ \ mul_##field(a1.Y, p1->Y, p2->Z); /* Y1*Z2 */\ mul_##field(a2.Y, p2->Y, p1->Z); /* Y2*Z1 */\ \ mul_##field(a1.Y, a1.Y, Z2Z2); /* S1 = Y1*Z2*Z2Z2 */\ mul_##field(a2.Y, a2.Y, Z1Z1); /* S2 = Y2*Z1*Z1Z1 */\ \ return vec_is_equal(&a1, &a2, sizeof(a1)) & (is_inf1 ^ is_inf2 ^ 1); \ } /* * https://eprint.iacr.org/2015/1060, algorithm 7 with a twist to handle * |p3| pointing at either |p1| or |p2|. This is resolved by adding |t5| * and replacing few first references to |X3| in the formula, up to step * 21, with it. 12M[+27A], doubling and infinity are handled by the * formula itself. Infinity is to be encoded as [0, !0, 0]. */ #define POINT_PROJ_DADD_IMPL_A0(ptype, bits, field, suffixb) \ static void ptype##proj_dadd(ptype##proj *p3, const ptype##proj *p1, \ const ptype##proj *p2) \ { \ vec##bits t0, t1, t2, t3, t4, t5; \ \ mul_##field(t0, p1->X, p2->X); /* 1. t0 = X1*X2 */\ mul_##field(t1, p1->Y, p2->Y); /* 2. t1 = Y1*Y2 */\ mul_##field(t2, p1->Z, p2->Z); /* 3. t2 = Z1*Z2 */\ add_##field(t3, p1->X, p1->Y); /* 4. t3 = X1+Y1 */\ add_##field(t4, p2->X, p2->Y); /* 5. t4 = X2+Y2 */\ mul_##field(t3, t3, t4); /* 6. t3 = t3*t4 */\ add_##field(t4, t0, t1); /* 7. t4 = t0+t1 */\ sub_##field(t3, t3, t4); /* 8. t3 = t3-t4 */\ add_##field(t4, p1->Y, p1->Z); /* 9. t4 = Y1+Z1 */\ add_##field(t5, p2->Y, p2->Z); /* 10. t5 = Y2+Z2 */\ mul_##field(t4, t4, t5); /* 11. 
t4 = t4*t5 */\ add_##field(t5, t1, t2); /* 12. t5 = t1+t2 */\ sub_##field(t4, t4, t5); /* 13. t4 = t4-t5 */\ add_##field(t5, p1->X, p1->Z); /* 14. t5 = X1+Z1 */\ add_##field(p3->Y, p2->X, p2->Z); /* 15. Y3 = X2+Z2 */\ mul_##field(t5, t5, p3->Y); /* 16. t5 = t5*Y3 */\ add_##field(p3->Y, t0, t2); /* 17. Y3 = t0+t2 */\ sub_##field(p3->Y, t5, p3->Y); /* 18. Y3 = t5-Y3 */\ mul_by_3_##field(t0, t0); /* 19-20. t0 = 3*t0 */\ mul_by_3_##field(t5, t2); /* 21. t5 = 3*t2 */\ mul_by_b_##suffixb(t2, t5); /* 21. t2 = b*t5 */\ add_##field(p3->Z, t1, t2); /* 22. Z3 = t1+t2 */\ sub_##field(t1, t1, t2); /* 23. t1 = t1-t2 */\ mul_by_3_##field(t5, p3->Y); /* 24. t5 = 3*Y3 */\ mul_by_b_##suffixb(p3->Y, t5); /* 24. Y3 = b*t5 */\ mul_##field(p3->X, t4, p3->Y); /* 25. X3 = t4*Y3 */\ mul_##field(t2, t3, t1); /* 26. t2 = t3*t1 */\ sub_##field(p3->X, t2, p3->X); /* 27. X3 = t2-X3 */\ mul_##field(p3->Y, p3->Y, t0); /* 28. Y3 = Y3*t0 */\ mul_##field(t1, t1, p3->Z); /* 29. t1 = t1*Z3 */\ add_##field(p3->Y, t1, p3->Y); /* 30. Y3 = t1+Y3 */\ mul_##field(t0, t0, t3); /* 31. t0 = t0*t3 */\ mul_##field(p3->Z, p3->Z, t4); /* 32. Z3 = Z3*t4 */\ add_##field(p3->Z, p3->Z, t0); /* 33. Z3 = Z3+t0 */\ } /* * https://eprint.iacr.org/2015/1060, algorithm 8 with a twist to handle * |p2| being infinity encoded as [0, 0]. 11M[+21A]. */ #define POINT_PROJ_DADD_AFFINE_IMPL_A0(ptype, bits, field, suffixb) \ static void ptype##proj_dadd_affine(ptype##proj *out, const ptype##proj *p1, \ const ptype##_affine *p2) \ { \ ptype##proj p3[1]; \ vec##bits t0, t1, t2, t3, t4; \ limb_t p2inf = vec_is_zero(p2, sizeof(*p2)); \ \ mul_##field(t0, p1->X, p2->X); /* 1. t0 = X1*X2 */\ mul_##field(t1, p1->Y, p2->Y); /* 2. t1 = Y1*Y2 */\ add_##field(t3, p1->X, p1->Y); /* 3. t3 = X1+Y1 */\ add_##field(t4, p2->X, p2->Y); /* 4. t4 = X2+Y2 */\ mul_##field(t3, t3, t4); /* 5. t3 = t3*t4 */\ add_##field(t4, t0, t1); /* 6. t4 = t0+t1 */\ sub_##field(t3, t3, t4); /* 7. t3 = t3-t4 */\ mul_##field(t4, p2->Y, p1->Z); /* 8. t4 = Y2*Z1 */\ add_##field(t4, t4, p1->Y); /* 9. t4 = t4+Y1 */\ mul_##field(p3->Y, p2->X, p1->Z); /* 10. Y3 = X2*Z1 */\ add_##field(p3->Y, p3->Y, p1->X); /* 11. Y3 = Y3+X1 */\ mul_by_3_##field(t0, t0); /* 12-13. t0 = 3*t0 */\ mul_by_b_##suffixb(t2, p1->Z); /* 14. t2 = b*Z1 */\ mul_by_3_##field(t2, t2); /* 14. t2 = 3*t2 */\ add_##field(p3->Z, t1, t2); /* 15. Z3 = t1+t2 */\ sub_##field(t1, t1, t2); /* 16. t1 = t1-t2 */\ mul_by_b_##suffixb(t2, p3->Y); /* 17. t2 = b*Y3 */\ mul_by_3_##field(p3->Y, t2); /* 17. Y3 = 3*t2 */\ mul_##field(p3->X, t4, p3->Y); /* 18. X3 = t4*Y3 */\ mul_##field(t2, t3, t1); /* 19. t2 = t3*t1 */\ sub_##field(p3->X, t2, p3->X); /* 20. X3 = t2-X3 */\ mul_##field(p3->Y, p3->Y, t0); /* 21. Y3 = Y3*t0 */\ mul_##field(t1, t1, p3->Z); /* 22. t1 = t1*Z3 */\ add_##field(p3->Y, t1, p3->Y); /* 23. Y3 = t1+Y3 */\ mul_##field(t0, t0, t3); /* 24. t0 = t0*t3 */\ mul_##field(p3->Z, p3->Z, t4); /* 25. Z3 = Z3*t4 */\ add_##field(p3->Z, p3->Z, t0); /* 26. Z3 = Z3+t0 */\ \ vec_select(out, p1, p3, sizeof(*out), p2inf); \ } /* * https://eprint.iacr.org/2015/1060, algorithm 9 with a twist to handle * |p3| pointing at |p1|. This is resolved by adding |t3| to hold X*Y * and reordering operations to bring references to |p1| forward. * 6M+2S[+13A]. */ #define POINT_PROJ_DOUBLE_IMPL_A0(ptype, bits, field, suffixb) \ static void ptype##proj_double(ptype##proj *p3, const ptype##proj *p1) \ { \ vec##bits t0, t1, t2, t3; \ \ sqr_##field(t0, p1->Y); /* 1. t0 = Y*Y */\ mul_##field(t1, p1->Y, p1->Z); /* 5. t1 = Y*Z */\ sqr_##field(t2, p1->Z); /* 6. 
t2 = Z*Z */\ mul_##field(t3, p1->X, p1->Y); /* 16. t3 = X*Y */\ lshift_##field(p3->Z, t0, 3); /* 2-4. Z3 = 8*t0 */\ mul_by_b_##suffixb(p3->X, t2); /* 7. t2 = b*t2 */\ mul_by_3_##field(t2, p3->X); /* 7. t2 = 3*t2 */\ mul_##field(p3->X, t2, p3->Z); /* 8. X3 = t2*Z3 */\ add_##field(p3->Y, t0, t2); /* 9. Y3 = t0+t2 */\ mul_##field(p3->Z, t1, p3->Z); /* 10. Z3 = t1*Z3 */\ mul_by_3_##field(t2, t2); /* 11-12. t2 = 3*t2 */\ sub_##field(t0, t0, t2); /* 13. t0 = t0-t2 */\ mul_##field(p3->Y, t0, p3->Y); /* 14. Y3 = t0*Y3 */\ add_##field(p3->Y, p3->X, p3->Y); /* 15. Y3 = X3+Y3 */\ mul_##field(p3->X, t0, t3); /* 17. X3 = t0*t3 */\ add_##field(p3->X, p3->X, p3->X); /* 18. X3 = X3+X3 */\ } #define POINT_PROJ_TO_JACOBIAN_IMPL(ptype, bits, field) \ static void ptype##proj_to_Jacobian(ptype *out, const ptype##proj *in) \ { \ vec##bits ZZ; \ \ sqr_##field(ZZ, in->Z); \ mul_##field(out->X, in->X, in->Z); \ mul_##field(out->Y, in->Y, ZZ); \ vec_copy(out->Z, in->Z, sizeof(out->Z)); \ } #define POINT_TO_PROJECTIVE_IMPL(ptype, bits, field, one) \ static void ptype##_to_projective(ptype##proj *out, const ptype *in) \ { \ vec##bits ZZ; \ limb_t is_inf = vec_is_zero(in->Z, sizeof(in->Z)); \ \ sqr_##field(ZZ, in->Z); \ mul_##field(out->X, in->X, in->Z); \ vec_select(out->Y, one, in->Y, sizeof(out->Y), is_inf); \ mul_##field(out->Z, ZZ, in->Z); \ } /******************* !!!!! NOT CONSTANT TIME !!!!! *******************/ /* * http://hyperelliptic.org/EFD/g1p/auto-shortw-xyzz.html#addition-add-2008-s * http://hyperelliptic.org/EFD/g1p/auto-shortw-xyzz.html#doubling-dbl-2008-s-1 * with twist to handle either input at infinity. Addition costs 12M+2S, * while conditional doubling - 4M+6M+3S. */ #define POINTXYZZ_DADD_IMPL(ptype, bits, field) \ static void ptype##xyzz_dadd(ptype##xyzz *p3, const ptype##xyzz *p1, \ const ptype##xyzz *p2) \ { \ vec##bits U, S, P, R; \ \ if (vec_is_zero(p2->ZZZ, 2*sizeof(p2->ZZZ))) { \ vec_copy(p3, p1, sizeof(*p3)); \ return; \ } else if (vec_is_zero(p1->ZZZ, 2*sizeof(p1->ZZZ))) { \ vec_copy(p3, p2, sizeof(*p3)); \ return; \ } \ \ mul_##field(U, p1->X, p2->ZZ); /* U1 = X1*ZZ2 */\ mul_##field(S, p1->Y, p2->ZZZ); /* S1 = Y1*ZZZ2 */\ mul_##field(P, p2->X, p1->ZZ); /* U2 = X2*ZZ1 */\ mul_##field(R, p2->Y, p1->ZZZ); /* S2 = Y2*ZZZ1 */\ sub_##field(P, P, U); /* P = U2-U1 */\ sub_##field(R, R, S); /* R = S2-S1 */\ \ if (!vec_is_zero(P, sizeof(P))) { /* X1!=X2 */\ vec##bits PP, PPP, Q; /* add |p1| and |p2| */\ \ sqr_##field(PP, P); /* PP = P^2 */\ mul_##field(PPP, PP, P); /* PPP = P*PP */\ mul_##field(Q, U, PP); /* Q = U1*PP */\ sqr_##field(p3->X, R); /* R^2 */\ add_##field(P, Q, Q); \ sub_##field(p3->X, p3->X, PPP); /* R^2-PPP */\ sub_##field(p3->X, p3->X, P); /* X3 = R^2-PPP-2*Q */\ sub_##field(Q, Q, p3->X); \ mul_##field(Q, Q, R); /* R*(Q-X3) */\ mul_##field(p3->Y, S, PPP); /* S1*PPP */\ sub_##field(p3->Y, Q, p3->Y); /* Y3 = R*(Q-X3)-S1*PPP */\ mul_##field(p3->ZZ, p1->ZZ, p2->ZZ); /* ZZ1*ZZ2 */\ mul_##field(p3->ZZZ, p1->ZZZ, p2->ZZZ); /* ZZZ1*ZZZ2 */\ mul_##field(p3->ZZ, p3->ZZ, PP); /* ZZ3 = ZZ1*ZZ2*PP */\ mul_##field(p3->ZZZ, p3->ZZZ, PPP); /* ZZZ3 = ZZZ1*ZZZ2*PPP */\ } else if (vec_is_zero(R, sizeof(R))) { /* X1==X2 && Y1==Y2 */\ vec##bits V, W, M; /* double |p1| */\ \ add_##field(U, p1->Y, p1->Y); /* U = 2*Y1 */\ sqr_##field(V, U); /* V = U^2 */\ mul_##field(W, V, U); /* W = U*V */\ mul_##field(S, p1->X, V); /* S = X1*V */\ sqr_##field(M, p1->X); \ mul_by_3_##field(M, M); /* M = 3*X1^2[+a*ZZ1^2] */\ sqr_##field(p3->X, M); \ add_##field(U, S, S); /* 2*S */\ sub_##field(p3->X, p3->X, U); /* X3 = 
M^2-2*S */\ mul_##field(p3->Y, W, p1->Y); /* W*Y1 */\ sub_##field(S, S, p3->X); \ mul_##field(S, S, M); /* M*(S-X3) */\ sub_##field(p3->Y, S, p3->Y); /* Y3 = M*(S-X3)-W*Y1 */\ mul_##field(p3->ZZ, p1->ZZ, V); /* ZZ3 = V*ZZ1 */\ mul_##field(p3->ZZZ, p1->ZZZ, W); /* ZZ3 = W*ZZZ1 */\ } else { /* X1==X2 && Y1==-Y2 */\ vec_zero(p3->ZZZ, 2*sizeof(p3->ZZZ)); /* set |p3| to infinity */\ } \ } /* * http://hyperelliptic.org/EFD/g1p/auto-shortw-xyzz.html#addition-madd-2008-s * http://hyperelliptic.org/EFD/g1p/auto-shortw-xyzz.html#doubling-mdbl-2008-s-1 * with twists to handle even subtractions and either input at infinity. * Addition costs 8M+2S, while conditional doubling - 2M+4M+3S. */ #define POINTXYZZ_DADD_AFFINE_IMPL(ptype, bits, field, one) \ static void ptype##xyzz_dadd_affine(ptype##xyzz *p3, const ptype##xyzz *p1, \ const ptype##_affine *p2, \ bool_t subtract) \ { \ vec##bits P, R; \ \ if (vec_is_zero(p2, sizeof(*p2))) { \ vec_copy(p3, p1, sizeof(*p3)); \ return; \ } else if (vec_is_zero(p1->ZZZ, 2*sizeof(p1->ZZZ))) { \ vec_copy(p3->X, p2->X, 2*sizeof(p3->X));\ cneg_##field(p3->ZZZ, one, subtract); \ vec_copy(p3->ZZ, one, sizeof(p3->ZZ)); \ return; \ } \ \ mul_##field(P, p2->X, p1->ZZ); /* U2 = X2*ZZ1 */\ mul_##field(R, p2->Y, p1->ZZZ); /* S2 = Y2*ZZZ1 */\ cneg_##field(R, R, subtract); \ sub_##field(P, P, p1->X); /* P = U2-X1 */\ sub_##field(R, R, p1->Y); /* R = S2-Y1 */\ \ if (!vec_is_zero(P, sizeof(P))) { /* X1!=X2 */\ vec##bits PP, PPP, Q; /* add |p2| to |p1| */\ \ sqr_##field(PP, P); /* PP = P^2 */\ mul_##field(PPP, PP, P); /* PPP = P*PP */\ mul_##field(Q, p1->X, PP); /* Q = X1*PP */\ sqr_##field(p3->X, R); /* R^2 */\ add_##field(P, Q, Q); \ sub_##field(p3->X, p3->X, PPP); /* R^2-PPP */\ sub_##field(p3->X, p3->X, P); /* X3 = R^2-PPP-2*Q */\ sub_##field(Q, Q, p3->X); \ mul_##field(Q, Q, R); /* R*(Q-X3) */\ mul_##field(p3->Y, p1->Y, PPP); /* Y1*PPP */\ sub_##field(p3->Y, Q, p3->Y); /* Y3 = R*(Q-X3)-Y1*PPP */\ mul_##field(p3->ZZ, p1->ZZ, PP); /* ZZ3 = ZZ1*PP */\ mul_##field(p3->ZZZ, p1->ZZZ, PPP); /* ZZZ3 = ZZZ1*PPP */\ } else if (vec_is_zero(R, sizeof(R))) { /* X1==X2 && Y1==Y2 */\ vec##bits U, S, M; /* double |p2| */\ \ add_##field(U, p2->Y, p2->Y); /* U = 2*Y1 */\ sqr_##field(p3->ZZ, U); /* [ZZ3 =] V = U^2 */\ mul_##field(p3->ZZZ, p3->ZZ, U); /* [ZZZ3 =] W = U*V */\ mul_##field(S, p2->X, p3->ZZ); /* S = X1*V */\ sqr_##field(M, p2->X); \ mul_by_3_##field(M, M); /* M = 3*X1^2[+a] */\ sqr_##field(p3->X, M); \ add_##field(U, S, S); /* 2*S */\ sub_##field(p3->X, p3->X, U); /* X3 = M^2-2*S */\ mul_##field(p3->Y, p3->ZZZ, p2->Y); /* W*Y1 */\ sub_##field(S, S, p3->X); \ mul_##field(S, S, M); /* M*(S-X3) */\ sub_##field(p3->Y, S, p3->Y); /* Y3 = M*(S-X3)-W*Y1 */\ cneg_##field(p3->ZZZ, p3->ZZZ, subtract); \ } else { /* X1==X2 && Y1==-Y2 */\ vec_zero(p3->ZZZ, 2*sizeof(p3->ZZZ)); /* set |p3| to infinity */\ } \ } #define POINTXYZZ_TO_JACOBIAN_IMPL(ptype, bits, field) \ static void ptype##xyzz_to_Jacobian(ptype *out, const ptype##xyzz *in) \ { \ mul_##field(out->X, in->X, in->ZZ); \ mul_##field(out->Y, in->Y, in->ZZZ); \ vec_copy(out->Z, in->ZZ, sizeof(out->Z)); \ } #define POINT_TO_XYZZ_IMPL(ptype, bits, field) \ static void ptype##_to_xyzz(ptype##xyzz *out, const ptype *in) \ { \ vec_copy(out->X, in->X, 2*sizeof(out->X)); \ sqr_##field(out->ZZ, in->Z); \ mul_##field(out->ZZZ, out->ZZ, in->Z); \ } #endif ================================================ FILE: src/errors.h ================================================ /* * Copyright Supranational LLC * Licensed under the Apache License, Version 
2.0, see LICENSE for details. * SPDX-License-Identifier: Apache-2.0 */ #ifndef __BLS12_381_ASM_ERRORS_H__ #define __BLS12_381_ASM_ERRORS_H__ typedef enum { BLST_SUCCESS = 0, BLST_BAD_ENCODING, BLST_POINT_NOT_ON_CURVE, BLST_POINT_NOT_IN_GROUP, BLST_AGGR_TYPE_MISMATCH, BLST_VERIFY_FAIL, BLST_PK_IS_INFINITY, } BLST_ERROR; #endif ================================================ FILE: src/exp.c ================================================ /* * Copyright Supranational LLC * Licensed under the Apache License, Version 2.0, see LICENSE for details. * SPDX-License-Identifier: Apache-2.0 */ #include "vect.h" /* * |out| = |inp|^|pow|, small footprint, public exponent */ static void exp_mont_384(vec384 out, const vec384 inp, const byte *pow, size_t pow_bits, const vec384 p, limb_t n0) { #if 1 vec384 ret; vec_copy(ret, inp, sizeof(ret)); /* ret = inp^1 */ --pow_bits; /* most significant bit is set, skip over */ while (pow_bits--) { sqr_mont_384(ret, ret, p, n0); if (is_bit_set(pow, pow_bits)) mul_mont_384(ret, ret, inp, p, n0); } vec_copy(out, ret, sizeof(ret)); /* out = ret */ #else unsigned int i; vec384 sqr; vec_copy(sqr, inp, sizeof(sqr)); for (i = 0; !is_bit_set(pow, i++);) sqr_mont_384(sqr, sqr, sqr, p, n0); vec_copy(out, sqr, sizeof(sqr)); for (; i < pow_bits; i++) { sqr_mont_384(sqr, sqr, sqr, p, n0); if (is_bit_set(pow, i)) mul_mont_384(out, out, sqr, p, n0); } #endif } static void exp_mont_384x(vec384x out, const vec384x inp, const byte *pow, size_t pow_bits, const vec384 p, limb_t n0) { vec384x ret; vec_copy(ret, inp, sizeof(ret)); /* |ret| = |inp|^1 */ --pow_bits; /* most significant bit is accounted for, skip over */ while (pow_bits--) { sqr_mont_384x(ret, ret, p, n0); if (is_bit_set(pow, pow_bits)) mul_mont_384x(ret, ret, inp, p, n0); } vec_copy(out, ret, sizeof(ret)); /* |out| = |ret| */ } ================================================ FILE: src/exports.c ================================================ /* * Copyright Supranational LLC * Licensed under the Apache License, Version 2.0, see LICENSE for details. * SPDX-License-Identifier: Apache-2.0 */ /* * Why this file? Overall goal is to ensure that all internal calls * remain internal after linking application. This is to both * * a) minimize possibility of external name conflicts (since all * non-blst-prefixed and [assembly subroutines] remain static); * b) preclude possibility of unintentional internal reference * overload in shared library context (one can achieve same * effect with -Bsymbolic, but we don't want to rely on end-user * to remember to use it); */ #include "fields.h" #include "bytes.h" /* * BLS12-381-specific Fr shortcuts to assembly. 
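 */

#if 0   /* editorial illustration, not part of upstream blst */
/*
 * Typical Fr round trip: operands live in Montgomery form, so convert
 * in with blst_fr_to() and back out with blst_fr_from().
 */
static void fr_mul_demo(vec256 out, const vec256 a, const vec256 b)
{
    vec256 ta, tb;

    blst_fr_to(ta, a);          /* a*R mod r */
    blst_fr_to(tb, b);          /* b*R mod r */
    blst_fr_mul(out, ta, tb);   /* a*b*R mod r */
    blst_fr_from(out, out);     /* a*b mod r */
}
#endif

/*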
*/ void blst_fr_add(vec256 ret, const vec256 a, const vec256 b) { add_mod_256(ret, a, b, BLS12_381_r); } void blst_fr_sub(vec256 ret, const vec256 a, const vec256 b) { sub_mod_256(ret, a, b, BLS12_381_r); } void blst_fr_mul_by_3(vec256 ret, const vec256 a) { mul_by_3_mod_256(ret, a, BLS12_381_r); } void blst_fr_lshift(vec256 ret, const vec256 a, size_t count) { lshift_mod_256(ret, a, count, BLS12_381_r); } void blst_fr_rshift(vec256 ret, const vec256 a, size_t count) { rshift_mod_256(ret, a, count, BLS12_381_r); } void blst_fr_mul(vec256 ret, const vec256 a, const vec256 b) { mul_mont_sparse_256(ret, a, b, BLS12_381_r, r0); } void blst_fr_ct_bfly(vec256 x0, vec256 x1, const vec256 twiddle) { vec256 x2; mul_mont_sparse_256(x2, x1, twiddle, BLS12_381_r, r0); sub_mod_256(x1, x0, x2, BLS12_381_r); add_mod_256(x0, x0, x2, BLS12_381_r); } void blst_fr_gs_bfly(vec256 x0, vec256 x1, const vec256 twiddle) { vec256 x2; sub_mod_256(x2, x0, x1, BLS12_381_r); add_mod_256(x0, x0, x1, BLS12_381_r); mul_mont_sparse_256(x1, x2, twiddle, BLS12_381_r, r0); } void blst_fr_sqr(vec256 ret, const vec256 a) { sqr_mont_sparse_256(ret, a, BLS12_381_r, r0); } void blst_fr_cneg(vec256 ret, const vec256 a, int flag) { cneg_mod_256(ret, a, is_zero(flag) ^ 1, BLS12_381_r); } void blst_fr_to(vec256 ret, const vec256 a) { mul_mont_sparse_256(ret, a, BLS12_381_rRR, BLS12_381_r, r0); } void blst_fr_from(vec256 ret, const vec256 a) { from_mont_256(ret, a, BLS12_381_r, r0); } void blst_fr_from_scalar(vec256 ret, const pow256 a) { const union { long one; char little; } is_endian = { 1 }; if ((uptr_t)ret == (uptr_t)a && is_endian.little) { mul_mont_sparse_256(ret, (const limb_t *)a, BLS12_381_rRR, BLS12_381_r, r0); } else { vec256 out; limbs_from_le_bytes(out, a, 32); mul_mont_sparse_256(ret, out, BLS12_381_rRR, BLS12_381_r, r0); vec_zero(out, sizeof(out)); } } void blst_scalar_from_fr(pow256 ret, const vec256 a) { const union { long one; char little; } is_endian = { 1 }; if ((uptr_t)ret == (uptr_t)a && is_endian.little) { from_mont_256((limb_t *)ret, a, BLS12_381_r, r0); } else { vec256 out; from_mont_256(out, a, BLS12_381_r, r0); le_bytes_from_limbs(ret, out, 32); vec_zero(out, sizeof(out)); } } int blst_scalar_fr_check(const pow256 a) { return (int)(check_mod_256(a, BLS12_381_r) | bytes_are_zero(a, sizeof(pow256))); } int blst_sk_check(const pow256 a) { return (int)check_mod_256(a, BLS12_381_r); } int blst_sk_add_n_check(pow256 ret, const pow256 a, const pow256 b) { return (int)add_n_check_mod_256(ret, a, b, BLS12_381_r); } int blst_sk_sub_n_check(pow256 ret, const pow256 a, const pow256 b) { return (int)sub_n_check_mod_256(ret, a, b, BLS12_381_r); } int blst_sk_mul_n_check(pow256 ret, const pow256 a, const pow256 b) { vec256 t[2]; const union { long one; char little; } is_endian = { 1 }; bool_t is_zero; if (((size_t)a|(size_t)b)%sizeof(limb_t) != 0 || !is_endian.little) { limbs_from_le_bytes(t[0], a, sizeof(pow256)); limbs_from_le_bytes(t[1], b, sizeof(pow256)); a = (const byte *)t[0]; b = (const byte *)t[1]; } mul_mont_sparse_256(t[0], BLS12_381_rRR, (const limb_t *)a, BLS12_381_r, r0); mul_mont_sparse_256(t[0], t[0], (const limb_t *)b, BLS12_381_r, r0); le_bytes_from_limbs(ret, t[0], sizeof(pow256)); is_zero = vec_is_zero(t[0], sizeof(vec256)); vec_zero(t, sizeof(t)); return (int)(is_zero^1); } void blst_sk_inverse(pow256 ret, const pow256 a) { const union { long one; char little; } is_endian = { 1 }; if (((size_t)a|(size_t)ret)%sizeof(limb_t) == 0 && is_endian.little) { limb_t *out = (limb_t *)ret; 
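/* in place: enter the Montgomery domain, invert there, then leave it */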
mul_mont_sparse_256(out, (const limb_t *)a, BLS12_381_rRR, BLS12_381_r, r0); reciprocal_fr(out, out); from_mont_256(out, out, BLS12_381_r, r0); } else { vec256 out; limbs_from_le_bytes(out, a, 32); mul_mont_sparse_256(out, out, BLS12_381_rRR, BLS12_381_r, r0); reciprocal_fr(out, out); from_mont_256(out, out, BLS12_381_r, r0); le_bytes_from_limbs(ret, out, 32); vec_zero(out, sizeof(out)); } } /* * BLS12-381-specific Fp shortcuts to assembly. */ void blst_fp_add(vec384 ret, const vec384 a, const vec384 b) { add_fp(ret, a, b); } void blst_fp_sub(vec384 ret, const vec384 a, const vec384 b) { sub_fp(ret, a, b); } void blst_fp_mul_by_3(vec384 ret, const vec384 a) { mul_by_3_fp(ret, a); } void blst_fp_mul_by_8(vec384 ret, const vec384 a) { mul_by_8_fp(ret, a); } void blst_fp_lshift(vec384 ret, const vec384 a, size_t count) { lshift_fp(ret, a, count); } void blst_fp_mul(vec384 ret, const vec384 a, const vec384 b) { mul_fp(ret, a, b); } void blst_fp_sqr(vec384 ret, const vec384 a) { sqr_fp(ret, a); } void blst_fp_cneg(vec384 ret, const vec384 a, int flag) { cneg_fp(ret, a, is_zero(flag) ^ 1); } void blst_fp_to(vec384 ret, const vec384 a) { mul_fp(ret, a, BLS12_381_RR); } void blst_fp_from(vec384 ret, const vec384 a) { from_fp(ret, a); } /* * Fp serialization/deserialization. */ void blst_fp_from_uint32(vec384 ret, const unsigned int a[12]) { if (sizeof(limb_t) == 8) { int i; for (i = 0; i < 6; i++) ret[i] = a[2*i] | ((limb_t)a[2*i+1] << (32 & (8*sizeof(limb_t)-1))); a = (const unsigned int *)ret; } mul_fp(ret, (const limb_t *)a, BLS12_381_RR); } void blst_uint32_from_fp(unsigned int ret[12], const vec384 a) { if (sizeof(limb_t) == 4) { from_fp((limb_t *)ret, a); } else { vec384 out; int i; from_fp(out, a); for (i = 0; i < 6; i++) { limb_t limb = out[i]; ret[2*i] = (unsigned int)limb; ret[2*i+1] = (unsigned int)(limb >> (32 & (8*sizeof(limb_t)-1))); } } } void blst_fp_from_uint64(vec384 ret, const unsigned long long a[6]) { const union { long one; char little; } is_endian = { 1 }; if (sizeof(limb_t) == 4 && !is_endian.little) { int i; for (i = 0; i < 6; i++) { unsigned long long limb = a[i]; ret[2*i] = (limb_t)limb; ret[2*i+1] = (limb_t)(limb >> 32); } a = (const unsigned long long *)ret; } mul_fp(ret, (const limb_t *)a, BLS12_381_RR); } void blst_uint64_from_fp(unsigned long long ret[6], const vec384 a) { const union { long one; char little; } is_endian = { 1 }; if (sizeof(limb_t) == 8 || is_endian.little) { from_fp((limb_t *)ret, a); } else { vec384 out; int i; from_fp(out, a); for (i = 0; i < 6; i++) ret[i] = out[2*i] | ((unsigned long long)out[2*i+1] << 32); } } void blst_fp_from_bendian(vec384 ret, const unsigned char a[48]) { vec384 out; limbs_from_be_bytes(out, a, sizeof(vec384)); mul_fp(ret, out, BLS12_381_RR); } void blst_bendian_from_fp(unsigned char ret[48], const vec384 a) { vec384 out; from_fp(out, a); be_bytes_from_limbs(ret, out, sizeof(vec384)); } void blst_fp_from_lendian(vec384 ret, const unsigned char a[48]) { vec384 out; limbs_from_le_bytes(out, a, sizeof(vec384)); mul_fp(ret, out, BLS12_381_RR); } void blst_lendian_from_fp(unsigned char ret[48], const vec384 a) { vec384 out; from_fp(out, a); le_bytes_from_limbs(ret, out, sizeof(vec384)); } /* * BLS12-381-specific Fp2 shortcuts to assembly. 
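 * [Fp2 is the quadratic extension GF(p^2) = GF(p)[i]/(i^2+1); each
 * vec384x carries the "real" half in [0] and the "imaginary" half
 * in [1].]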
*/ void blst_fp2_add(vec384x ret, const vec384x a, const vec384x b) { add_fp2(ret, a, b); } void blst_fp2_sub(vec384x ret, const vec384x a, const vec384x b) { sub_fp2(ret, a, b); } void blst_fp2_mul_by_3(vec384x ret, const vec384x a) { mul_by_3_fp2(ret, a); } void blst_fp2_mul_by_8(vec384x ret, const vec384x a) { mul_by_8_fp2(ret, a); } void blst_fp2_lshift(vec384x ret, const vec384x a, size_t count) { lshift_fp2(ret, a, count); } void blst_fp2_mul(vec384x ret, const vec384x a, const vec384x b) { mul_fp2(ret, a, b); } void blst_fp2_sqr(vec384x ret, const vec384x a) { sqr_fp2(ret, a); } void blst_fp2_cneg(vec384x ret, const vec384x a, int flag) { cneg_fp2(ret, a, is_zero(flag) ^ 1); } /* * Scalar serialization/deserialization. */ void blst_scalar_from_uint32(pow256 ret, const unsigned int a[8]) { const union { long one; char little; } is_endian = { 1 }; size_t i; if ((uptr_t)ret==(uptr_t)a && is_endian.little) return; for(i = 0; i < 8; i++) { unsigned int w = a[i]; *ret++ = (byte)w; *ret++ = (byte)(w >> 8); *ret++ = (byte)(w >> 16); *ret++ = (byte)(w >> 24); } } void blst_uint32_from_scalar(unsigned int ret[8], const pow256 a) { const union { long one; char little; } is_endian = { 1 }; size_t i; if ((uptr_t)ret==(uptr_t)a && is_endian.little) return; for(i = 0; i < 8; i++) { unsigned int w = (unsigned int)(*a++); w |= (unsigned int)(*a++) << 8; w |= (unsigned int)(*a++) << 16; w |= (unsigned int)(*a++) << 24; ret[i] = w; } } void blst_scalar_from_uint64(pow256 ret, const unsigned long long a[4]) { const union { long one; char little; } is_endian = { 1 }; size_t i; if ((uptr_t)ret==(uptr_t)a && is_endian.little) return; for(i = 0; i < 4; i++) { unsigned long long w = a[i]; *ret++ = (byte)w; *ret++ = (byte)(w >> 8); *ret++ = (byte)(w >> 16); *ret++ = (byte)(w >> 24); *ret++ = (byte)(w >> 32); *ret++ = (byte)(w >> 40); *ret++ = (byte)(w >> 48); *ret++ = (byte)(w >> 56); } } void blst_uint64_from_scalar(unsigned long long ret[4], const pow256 a) { const union { long one; char little; } is_endian = { 1 }; size_t i; if ((uptr_t)ret==(uptr_t)a && is_endian.little) return; for(i = 0; i < 4; i++) { unsigned long long w = (unsigned long long)(*a++); w |= (unsigned long long)(*a++) << 8; w |= (unsigned long long)(*a++) << 16; w |= (unsigned long long)(*a++) << 24; w |= (unsigned long long)(*a++) << 32; w |= (unsigned long long)(*a++) << 40; w |= (unsigned long long)(*a++) << 48; w |= (unsigned long long)(*a++) << 56; ret[i] = w; } } void blst_scalar_from_bendian(pow256 ret, const unsigned char a[32]) { vec256 out; limbs_from_be_bytes(out, a, sizeof(out)); le_bytes_from_limbs(ret, out, sizeof(out)); vec_zero(out, sizeof(out)); } void blst_bendian_from_scalar(unsigned char ret[32], const pow256 a) { vec256 out; limbs_from_le_bytes(out, a, sizeof(out)); be_bytes_from_limbs(ret, out, sizeof(out)); vec_zero(out, sizeof(out)); } void blst_scalar_from_lendian(pow256 ret, const unsigned char a[32]) { size_t i; if ((uptr_t)ret==(uptr_t)a) return; for (i = 0; i < 32; i++) ret[i] = a[i]; } void blst_lendian_from_scalar(unsigned char ret[32], const pow256 a) { size_t i; if ((uptr_t)ret==(uptr_t)a) return; for (i = 0; i < 32; i++) ret[i] = a[i]; } void blst_fr_from_uint64(vec256 ret, const unsigned long long a[4]) { const union { long one; char little; } is_endian = { 1 }; if (sizeof(limb_t) == 4 && !is_endian.little) { int i; for (i = 0; i < 4; i++) { unsigned long long limb = a[i]; ret[2*i] = (limb_t)limb; ret[2*i+1] = (limb_t)(limb >> 32); } a = (const unsigned long long *)ret; } mul_mont_sparse_256(ret, (const 
limb_t *)a, BLS12_381_rRR, BLS12_381_r, r0); } void blst_uint64_from_fr(unsigned long long ret[4], const vec256 a) { const union { long one; char little; } is_endian = { 1 }; if (sizeof(limb_t) == 8 || is_endian.little) { from_mont_256((limb_t *)ret, a, BLS12_381_r, r0); } else { vec256 out; int i; from_mont_256(out, a, BLS12_381_r, r0); for (i = 0; i < 4; i++) ret[i] = out[2*i] | ((unsigned long long)out[2*i+1] << 32); vec_zero(out, sizeof(out)); } } int blst_scalar_from_le_bytes(pow256 out, const unsigned char *bytes, size_t n) { size_t rem = n ? ((n - 1) % 32 + 1) : 0; struct { vec256 out, digit; } t; limb_t ret; vec_zero(t.out, sizeof(t.out)); n -= rem; limbs_from_le_bytes(t.out, bytes += n, rem); mul_mont_sparse_256(t.out, BLS12_381_rRR, t.out, BLS12_381_r, r0); while (n) { limbs_from_le_bytes(t.digit, bytes -= 32, 32); add_mod_256(t.out, t.out, t.digit, BLS12_381_r); mul_mont_sparse_256(t.out, BLS12_381_rRR, t.out, BLS12_381_r, r0); n -= 32; } from_mont_256(t.out, t.out, BLS12_381_r, r0); ret = vec_is_zero(t.out, sizeof(t.out)); le_bytes_from_limbs(out, t.out, 32); vec_zero(&t, sizeof(t)); return (int)(ret^1); } int blst_scalar_from_be_bytes(pow256 out, const unsigned char *bytes, size_t n) { size_t rem = n ? ((n - 1) % 32 + 1) : 0; struct { vec256 out, digit; } t; limb_t ret; vec_zero(t.out, sizeof(t.out)); limbs_from_be_bytes(t.out, bytes, rem); mul_mont_sparse_256(t.out, BLS12_381_rRR, t.out, BLS12_381_r, r0); while (n -= rem) { limbs_from_be_bytes(t.digit, bytes += rem, 32); add_mod_256(t.out, t.out, t.digit, BLS12_381_r); mul_mont_sparse_256(t.out, BLS12_381_rRR, t.out, BLS12_381_r, r0); rem = 32; } from_mont_256(t.out, t.out, BLS12_381_r, r0); ret = vec_is_zero(t.out, sizeof(t.out)); le_bytes_from_limbs(out, t.out, 32); vec_zero(&t, sizeof(t)); return (int)(ret^1); } void blst_fp_from_le_bytes(vec384 out, const unsigned char *bytes, size_t n) { size_t rem = n ? ((n - 1) % 48 + 1) : 0; vec384 digit; vec_zero(out, sizeof(vec384)); n -= rem; limbs_from_le_bytes(out, bytes += n, rem); mul_mont_384(out, BLS12_381_RR, out, BLS12_381_P, p0); while (n) { limbs_from_le_bytes(digit, bytes -= 48, 48); add_mod_384(out, out, digit, BLS12_381_P); mul_mont_384(out, BLS12_381_RR, out, BLS12_381_P, p0); n -= 48; } } void blst_fp_from_be_bytes(vec384 out, const unsigned char *bytes, size_t n) { size_t rem = n ? ((n - 1) % 48 + 1) : 0; vec384 digit; vec_zero(out, sizeof(vec384)); limbs_from_be_bytes(out, bytes, rem); mul_mont_384(out, BLS12_381_RR, out, BLS12_381_P, p0); while (n -= rem) { limbs_from_be_bytes(digit, bytes += rem, 48); add_mod_384(out, out, digit, BLS12_381_P); mul_mont_384(out, BLS12_381_RR, out, BLS12_381_P, p0); rem = 48; } } /* * Single-shot SHA-256 hash function. */ #include "sha256.h" void blst_sha256(unsigned char md[32], const void *msg, size_t len) { SHA256_CTX ctx; sha256_init(&ctx); sha256_update(&ctx, msg, len); sha256_final(md, &ctx); } /* * Test facilitator. 
*/ void blst_scalar_from_hexascii(pow256 ret, const char *hex) { bytes_from_hexascii(ret, sizeof(pow256), hex); } void blst_fr_from_hexascii(vec256 ret, const char *hex) { limbs_from_hexascii(ret, sizeof(vec256), hex); mul_mont_sparse_256(ret, ret, BLS12_381_rRR, BLS12_381_r, r0); } void blst_fp_from_hexascii(vec384 ret, const char *hex) { limbs_from_hexascii(ret, sizeof(vec384), hex); mul_fp(ret, ret, BLS12_381_RR); } ================================================ FILE: src/fields.h ================================================ /* * Copyright Supranational LLC * Licensed under the Apache License, Version 2.0, see LICENSE for details. * SPDX-License-Identifier: Apache-2.0 */ #ifndef __BLS12_381_ASM_FIELDS_H__ #define __BLS12_381_ASM_FIELDS_H__ #include "vect.h" #include "consts.h" /* * BLS12-381-specific Fp shortcuts to assembly. */ static inline void add_fp(vec384 ret, const vec384 a, const vec384 b) { add_mod_384(ret, a, b, BLS12_381_P); } static inline void sub_fp(vec384 ret, const vec384 a, const vec384 b) { sub_mod_384(ret, a, b, BLS12_381_P); } static inline void mul_by_3_fp(vec384 ret, const vec384 a) { mul_by_3_mod_384(ret, a, BLS12_381_P); } static inline void mul_by_8_fp(vec384 ret, const vec384 a) { mul_by_8_mod_384(ret, a, BLS12_381_P); } static inline void lshift_fp(vec384 ret, const vec384 a, size_t count) { lshift_mod_384(ret, a, count, BLS12_381_P); } static inline void rshift_fp(vec384 ret, const vec384 a, size_t count) { rshift_mod_384(ret, a, count, BLS12_381_P); } static inline void div_by_2_fp(vec384 ret, const vec384 a) { div_by_2_mod_384(ret, a, BLS12_381_P); } static inline void mul_fp(vec384 ret, const vec384 a, const vec384 b) { mul_mont_384(ret, a, b, BLS12_381_P, p0); } static inline void sqr_fp(vec384 ret, const vec384 a) { sqr_mont_384(ret, a, BLS12_381_P, p0); } static inline void cneg_fp(vec384 ret, const vec384 a, bool_t flag) { cneg_mod_384(ret, a, flag, BLS12_381_P); } static inline void from_fp(vec384 ret, const vec384 a) { from_mont_384(ret, a, BLS12_381_P, p0); } static inline void redc_fp(vec384 ret, const vec768 a) { redc_mont_384(ret, a, BLS12_381_P, p0); } /* * BLS12-381-specific Fp2 shortcuts to assembly. 
*/ static inline void add_fp2(vec384x ret, const vec384x a, const vec384x b) { add_mod_384x(ret, a, b, BLS12_381_P); } static inline void sub_fp2(vec384x ret, const vec384x a, const vec384x b) { sub_mod_384x(ret, a, b, BLS12_381_P); } static inline void mul_by_3_fp2(vec384x ret, const vec384x a) { mul_by_3_mod_384x(ret, a, BLS12_381_P); } static inline void mul_by_8_fp2(vec384x ret, const vec384x a) { mul_by_8_mod_384x(ret, a, BLS12_381_P); } static inline void lshift_fp2(vec384x ret, const vec384x a, size_t count) { lshift_mod_384(ret[0], a[0], count, BLS12_381_P); lshift_mod_384(ret[1], a[1], count, BLS12_381_P); } static inline void mul_fp2(vec384x ret, const vec384x a, const vec384x b) { mul_mont_384x(ret, a, b, BLS12_381_P, p0); } static inline void sqr_fp2(vec384x ret, const vec384x a) { sqr_mont_384x(ret, a, BLS12_381_P, p0); } static inline void cneg_fp2(vec384x ret, const vec384x a, bool_t flag) { cneg_mod_384(ret[0], a[0], flag, BLS12_381_P); cneg_mod_384(ret[1], a[1], flag, BLS12_381_P); } #define vec_load_global vec_copy static void reciprocal_fp(vec384 out, const vec384 inp); static void flt_reciprocal_fp(vec384 out, const vec384 inp); static bool_t recip_sqrt_fp(vec384 out, const vec384 inp); static bool_t sqrt_fp(vec384 out, const vec384 inp); static void reciprocal_fp2(vec384x out, const vec384x inp); static void flt_reciprocal_fp2(vec384x out, const vec384x inp); static bool_t recip_sqrt_fp2(vec384x out, const vec384x inp, const vec384x recip_ZZZ, const vec384x magic_ZZZ); static bool_t sqrt_fp2(vec384x out, const vec384x inp); static bool_t sqrt_align_fp2(vec384x out, const vec384x ret, const vec384x sqrt, const vec384x inp); typedef vec384x vec384fp2; typedef vec384fp2 vec384fp6[3]; typedef vec384fp6 vec384fp12[2]; static void sqr_fp12(vec384fp12 ret, const vec384fp12 a); static void cyclotomic_sqr_fp12(vec384fp12 ret, const vec384fp12 a); static void mul_fp12(vec384fp12 ret, const vec384fp12 a, const vec384fp12 b); static void mul_by_xy00z0_fp12(vec384fp12 ret, const vec384fp12 a, const vec384fp6 xy00z0); static void conjugate_fp12(vec384fp12 a); static void inverse_fp12(vec384fp12 ret, const vec384fp12 a); /* caveat lector! |n| has to be non-zero and not more than 3! */ static void frobenius_map_fp12(vec384fp12 ret, const vec384fp12 a, size_t n); #define neg_fp(r,a) cneg_fp((r),(a),1) #define neg_fp2(r,a) cneg_fp2((r),(a),1) #endif /* __BLS12_381_ASM_FIELDS_H__ */ ================================================ FILE: src/fp12_tower.c ================================================ /* * Copyright Supranational LLC * Licensed under the Apache License, Version 2.0, see LICENSE for details. * SPDX-License-Identifier: Apache-2.0 */ #include "fields.h" /* * Fp2 = Fp[u] / (u^2 + 1) * Fp6 = Fp2[v] / (v^3 - u - 1) * Fp12 = Fp6[w] / (w^2 - v) */ static inline void mul_by_u_plus_1_fp2(vec384x ret, const vec384x a) { mul_by_1_plus_i_mod_384x(ret, a, BLS12_381_P); } #if 1 && !defined(__BLST_NO_ASM__) #define __FP2x2__ /* * Fp2x2 is a "widened" version of Fp2, which allows consolidation of * reductions from several multiplications. In other words, instead of * "mul_redc-mul_redc-add" we get "mul-mul-add-redc," where the latter * addition is double-width... To be more specific, this gives ~7-10% * faster pairing depending on platform... 
*/ typedef vec768 vec768x[2]; static inline void add_fp2x2(vec768x ret, const vec768x a, const vec768x b) { add_mod_384x384(ret[0], a[0], b[0], BLS12_381_P); add_mod_384x384(ret[1], a[1], b[1], BLS12_381_P); } static inline void sub_fp2x2(vec768x ret, const vec768x a, const vec768x b) { sub_mod_384x384(ret[0], a[0], b[0], BLS12_381_P); sub_mod_384x384(ret[1], a[1], b[1], BLS12_381_P); } static inline void mul_by_u_plus_1_fp2x2(vec768x ret, const vec768x a) { /* caveat lector! |ret| may not be same as |a| */ sub_mod_384x384(ret[0], a[0], a[1], BLS12_381_P); add_mod_384x384(ret[1], a[0], a[1], BLS12_381_P); } static inline void redc_fp2x2(vec384x ret, const vec768x a) { redc_mont_384(ret[0], a[0], BLS12_381_P, p0); redc_mont_384(ret[1], a[1], BLS12_381_P, p0); } static void mul_fp2x2(vec768x ret, const vec384x a, const vec384x b) { #if 1 mul_382x(ret, a, b, BLS12_381_P); /* +~6% in Miller loop */ #else union { vec384 x[2]; vec768 x2; } t; add_mod_384(t.x[0], a[0], a[1], BLS12_381_P); add_mod_384(t.x[1], b[0], b[1], BLS12_381_P); mul_384(ret[1], t.x[0], t.x[1]); mul_384(ret[0], a[0], b[0]); mul_384(t.x2, a[1], b[1]); sub_mod_384x384(ret[1], ret[1], ret[0], BLS12_381_P); sub_mod_384x384(ret[1], ret[1], t.x2, BLS12_381_P); sub_mod_384x384(ret[0], ret[0], t.x2, BLS12_381_P); #endif } static void sqr_fp2x2(vec768x ret, const vec384x a) { #if 1 sqr_382x(ret, a, BLS12_381_P); /* +~5% in final exponentiation */ #else vec384 t0, t1; add_mod_384(t0, a[0], a[1], BLS12_381_P); sub_mod_384(t1, a[0], a[1], BLS12_381_P); mul_384(ret[1], a[0], a[1]); add_mod_384x384(ret[1], ret[1], ret[1], BLS12_381_P); mul_384(ret[0], t0, t1); #endif } #endif /* __FP2x2__ */ /* * Fp6 extension */ #if defined(__FP2x2__) /* ~10-13% improvement for mul_fp12 and sqr_fp12 */ typedef vec768x vec768fp6[3]; static inline void sub_fp6x2(vec768fp6 ret, const vec768fp6 a, const vec768fp6 b) { sub_fp2x2(ret[0], a[0], b[0]); sub_fp2x2(ret[1], a[1], b[1]); sub_fp2x2(ret[2], a[2], b[2]); } static void mul_fp6x2(vec768fp6 ret, const vec384fp6 a, const vec384fp6 b) { vec768x t0, t1, t2; vec384x aa, bb; mul_fp2x2(t0, a[0], b[0]); mul_fp2x2(t1, a[1], b[1]); mul_fp2x2(t2, a[2], b[2]); /* ret[0] = ((a1 + a2)*(b1 + b2) - a1*b1 - a2*b2)*(u+1) + a0*b0 = (a1*b2 + a2*b1)*(u+1) + a0*b0 */ add_fp2(aa, a[1], a[2]); add_fp2(bb, b[1], b[2]); mul_fp2x2(ret[0], aa, bb); sub_fp2x2(ret[0], ret[0], t1); sub_fp2x2(ret[0], ret[0], t2); mul_by_u_plus_1_fp2x2(ret[1], ret[0]); /* borrow ret[1] for a moment */ add_fp2x2(ret[0], ret[1], t0); /* ret[1] = (a0 + a1)*(b0 + b1) - a0*b0 - a1*b1 + a2*b2*(u+1) = a0*b1 + a1*b0 + a2*b2*(u+1) */ add_fp2(aa, a[0], a[1]); add_fp2(bb, b[0], b[1]); mul_fp2x2(ret[1], aa, bb); sub_fp2x2(ret[1], ret[1], t0); sub_fp2x2(ret[1], ret[1], t1); mul_by_u_plus_1_fp2x2(ret[2], t2); /* borrow ret[2] for a moment */ add_fp2x2(ret[1], ret[1], ret[2]); /* ret[2] = (a0 + a2)*(b0 + b2) - a0*b0 - a2*b2 + a1*b1 = a0*b2 + a2*b0 + a1*b1 */ add_fp2(aa, a[0], a[2]); add_fp2(bb, b[0], b[2]); mul_fp2x2(ret[2], aa, bb); sub_fp2x2(ret[2], ret[2], t0); sub_fp2x2(ret[2], ret[2], t2); add_fp2x2(ret[2], ret[2], t1); } static inline void redc_fp6x2(vec384fp6 ret, const vec768fp6 a) { redc_fp2x2(ret[0], a[0]); redc_fp2x2(ret[1], a[1]); redc_fp2x2(ret[2], a[2]); } static void mul_fp6(vec384fp6 ret, const vec384fp6 a, const vec384fp6 b) { vec768fp6 r; mul_fp6x2(r, a, b); redc_fp6x2(ret, r); /* narrow to normal width */ } static void sqr_fp6(vec384fp6 ret, const vec384fp6 a) { vec768x s0, m01, m12, s2, rx; sqr_fp2x2(s0, a[0]); mul_fp2x2(m01, a[0], a[1]); 
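/* note that m01 = 2*(a0*a1) and m12 = 2*(a1*a2) below are kept in the double-width domain, so each doubling is a mere 768-bit modular addition, and the Montgomery reductions are deferred to the three per-coefficient redc_fp2x2 calls at the end */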
add_fp2x2(m01, m01, m01); mul_fp2x2(m12, a[1], a[2]); add_fp2x2(m12, m12, m12); sqr_fp2x2(s2, a[2]); /* ret[2] = (a0 + a1 + a2)^2 - a0^2 - a2^2 - 2*(a0*a1) - 2*(a1*a2) = a1^2 + 2*(a0*a2) */ add_fp2(ret[2], a[2], a[1]); add_fp2(ret[2], ret[2], a[0]); sqr_fp2x2(rx, ret[2]); sub_fp2x2(rx, rx, s0); sub_fp2x2(rx, rx, s2); sub_fp2x2(rx, rx, m01); sub_fp2x2(rx, rx, m12); redc_fp2x2(ret[2], rx); /* ret[0] = a0^2 + 2*(a1*a2)*(u+1) */ mul_by_u_plus_1_fp2x2(rx, m12); add_fp2x2(rx, rx, s0); redc_fp2x2(ret[0], rx); /* ret[1] = a2^2*(u+1) + 2*(a0*a1) */ mul_by_u_plus_1_fp2x2(rx, s2); add_fp2x2(rx, rx, m01); redc_fp2x2(ret[1], rx); } #else static void mul_fp6(vec384fp6 ret, const vec384fp6 a, const vec384fp6 b) { vec384x t0, t1, t2, t3, t4, t5; mul_fp2(t0, a[0], b[0]); mul_fp2(t1, a[1], b[1]); mul_fp2(t2, a[2], b[2]); /* ret[0] = ((a1 + a2)*(b1 + b2) - a1*b1 - a2*b2)*(u+1) + a0*b0 = (a1*b2 + a2*b1)*(u+1) + a0*b0 */ add_fp2(t4, a[1], a[2]); add_fp2(t5, b[1], b[2]); mul_fp2(t3, t4, t5); sub_fp2(t3, t3, t1); sub_fp2(t3, t3, t2); mul_by_u_plus_1_fp2(t3, t3); /* add_fp2(ret[0], t3, t0); considering possible aliasing... */ /* ret[1] = (a0 + a1)*(b0 + b1) - a0*b0 - a1*b1 + a2*b2*(u+1) = a0*b1 + a1*b0 + a2*b2*(u+1) */ add_fp2(t4, a[0], a[1]); add_fp2(t5, b[0], b[1]); mul_fp2(ret[1], t4, t5); sub_fp2(ret[1], ret[1], t0); sub_fp2(ret[1], ret[1], t1); mul_by_u_plus_1_fp2(t4, t2); add_fp2(ret[1], ret[1], t4); /* ret[2] = (a0 + a2)*(b0 + b2) - a0*b0 - a2*b2 + a1*b1 = a0*b2 + a2*b0 + a1*b1 */ add_fp2(t4, a[0], a[2]); add_fp2(t5, b[0], b[2]); mul_fp2(ret[2], t4, t5); sub_fp2(ret[2], ret[2], t0); sub_fp2(ret[2], ret[2], t2); add_fp2(ret[2], ret[2], t1); add_fp2(ret[0], t3, t0); /* ... moved from above */ } static void sqr_fp6(vec384fp6 ret, const vec384fp6 a) { vec384x s0, m01, m12, s2; sqr_fp2(s0, a[0]); mul_fp2(m01, a[0], a[1]); add_fp2(m01, m01, m01); mul_fp2(m12, a[1], a[2]); add_fp2(m12, m12, m12); sqr_fp2(s2, a[2]); /* ret[2] = (a0 + a1 + a2)^2 - a0^2 - a2^2 - 2*(a0*a1) - 2*(a1*a2) = a1^2 + 2*(a0*a2) */ add_fp2(ret[2], a[2], a[1]); add_fp2(ret[2], ret[2], a[0]); sqr_fp2(ret[2], ret[2]); sub_fp2(ret[2], ret[2], s0); sub_fp2(ret[2], ret[2], s2); sub_fp2(ret[2], ret[2], m01); sub_fp2(ret[2], ret[2], m12); /* ret[0] = a0^2 + 2*(a1*a2)*(u+1) */ mul_by_u_plus_1_fp2(ret[0], m12); add_fp2(ret[0], ret[0], s0); /* ret[1] = a2^2*(u+1) + 2*(a0*a1) */ mul_by_u_plus_1_fp2(ret[1], s2); add_fp2(ret[1], ret[1], m01); } #endif static void add_fp6(vec384fp6 ret, const vec384fp6 a, const vec384fp6 b) { add_fp2(ret[0], a[0], b[0]); add_fp2(ret[1], a[1], b[1]); add_fp2(ret[2], a[2], b[2]); } static void sub_fp6(vec384fp6 ret, const vec384fp6 a, const vec384fp6 b) { sub_fp2(ret[0], a[0], b[0]); sub_fp2(ret[1], a[1], b[1]); sub_fp2(ret[2], a[2], b[2]); } static void neg_fp6(vec384fp6 ret, const vec384fp6 a) { neg_fp2(ret[0], a[0]); neg_fp2(ret[1], a[1]); neg_fp2(ret[2], a[2]); } #if 0 #define mul_by_v_fp6 mul_by_v_fp6 static void mul_by_v_fp6(vec384fp6 ret, const vec384fp6 a) { vec384x t; mul_by_u_plus_1_fp2(t, a[2]); vec_copy(ret[2], a[1], sizeof(a[1])); vec_copy(ret[1], a[0], sizeof(a[0])); vec_copy(ret[0], t, sizeof(t)); } #endif /* * Fp12 extension */ #if defined(__FP2x2__) static void mul_fp12(vec384fp12 ret, const vec384fp12 a, const vec384fp12 b) { vec768fp6 t0, t1, rx; vec384fp6 t2; mul_fp6x2(t0, a[0], b[0]); mul_fp6x2(t1, a[1], b[1]); /* ret[1] = (a0 + a1)*(b0 + b1) - a0*b0 - a1*b1 = a0*b1 + a1*b0 */ add_fp6(t2, a[0], a[1]); add_fp6(ret[1], b[0], b[1]); mul_fp6x2(rx, ret[1], t2); sub_fp6x2(rx, rx, t0); sub_fp6x2(rx, rx, t1); 
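/* at this point rx holds a0*b1 + a1*b0 in double-width form, and a single redc_fp6x2 narrows it to ret[1] */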
redc_fp6x2(ret[1], rx); /* ret[0] = a0*b0 + a1*b1*v */ mul_by_u_plus_1_fp2x2(rx[0], t1[2]); add_fp2x2(rx[0], t0[0], rx[0]); add_fp2x2(rx[1], t0[1], t1[0]); add_fp2x2(rx[2], t0[2], t1[1]); redc_fp6x2(ret[0], rx); } static inline void mul_by_0y0_fp6x2(vec768fp6 ret, const vec384fp6 a, const vec384fp2 b) { mul_fp2x2(ret[1], a[2], b); /* borrow ret[1] for a moment */ mul_by_u_plus_1_fp2x2(ret[0], ret[1]); mul_fp2x2(ret[1], a[0], b); mul_fp2x2(ret[2], a[1], b); } static void mul_by_xy0_fp6x2(vec768fp6 ret, const vec384fp6 a, const vec384fp6 b) { vec768x t0, t1; vec384x aa, bb; mul_fp2x2(t0, a[0], b[0]); mul_fp2x2(t1, a[1], b[1]); /* ret[0] = ((a1 + a2)*(b1 + 0) - a1*b1 - a2*0)*(u+1) + a0*b0 = (a1*0 + a2*b1)*(u+1) + a0*b0 */ mul_fp2x2(ret[1], a[2], b[1]); /* borrow ret[1] for a moment */ mul_by_u_plus_1_fp2x2(ret[0], ret[1]); add_fp2x2(ret[0], ret[0], t0); /* ret[1] = (a0 + a1)*(b0 + b1) - a0*b0 - a1*b1 + a2*0*(u+1) = a0*b1 + a1*b0 + a2*0*(u+1) */ add_fp2(aa, a[0], a[1]); add_fp2(bb, b[0], b[1]); mul_fp2x2(ret[1], aa, bb); sub_fp2x2(ret[1], ret[1], t0); sub_fp2x2(ret[1], ret[1], t1); /* ret[2] = (a0 + a2)*(b0 + 0) - a0*b0 - a2*0 + a1*b1 = a0*0 + a2*b0 + a1*b1 */ mul_fp2x2(ret[2], a[2], b[0]); add_fp2x2(ret[2], ret[2], t1); } static void mul_by_xy00z0_fp12(vec384fp12 ret, const vec384fp12 a, const vec384fp6 xy00z0) { vec768fp6 t0, t1, rr; vec384fp6 t2; mul_by_xy0_fp6x2(t0, a[0], xy00z0); mul_by_0y0_fp6x2(t1, a[1], xy00z0[2]); /* ret[1] = (a0 + a1)*(b0 + b1) - a0*b0 - a1*b1 = a0*b1 + a1*b0 */ vec_copy(t2[0], xy00z0[0], sizeof(t2[0])); add_fp2(t2[1], xy00z0[1], xy00z0[2]); add_fp6(ret[1], a[0], a[1]); mul_by_xy0_fp6x2(rr, ret[1], t2); sub_fp6x2(rr, rr, t0); sub_fp6x2(rr, rr, t1); redc_fp6x2(ret[1], rr); /* ret[0] = a0*b0 + a1*b1*v */ mul_by_u_plus_1_fp2x2(rr[0], t1[2]); add_fp2x2(rr[0], t0[0], rr[0]); add_fp2x2(rr[1], t0[1], t1[0]); add_fp2x2(rr[2], t0[2], t1[1]); redc_fp6x2(ret[0], rr); } #else static void mul_fp12(vec384fp12 ret, const vec384fp12 a, const vec384fp12 b) { vec384fp6 t0, t1, t2; mul_fp6(t0, a[0], b[0]); mul_fp6(t1, a[1], b[1]); /* ret[1] = (a0 + a1)*(b0 + b1) - a0*b0 - a1*b1 = a0*b1 + a1*b0 */ add_fp6(t2, a[0], a[1]); add_fp6(ret[1], b[0], b[1]); mul_fp6(ret[1], ret[1], t2); sub_fp6(ret[1], ret[1], t0); sub_fp6(ret[1], ret[1], t1); /* ret[0] = a0*b0 + a1*b1*v */ #ifdef mul_by_v_fp6 mul_by_v_fp6(t1, t1); add_fp6(ret[0], t0, t1); #else mul_by_u_plus_1_fp2(t1[2], t1[2]); add_fp2(ret[0][0], t0[0], t1[2]); add_fp2(ret[0][1], t0[1], t1[0]); add_fp2(ret[0][2], t0[2], t1[1]); #endif } static inline void mul_by_0y0_fp6(vec384fp6 ret, const vec384fp6 a, const vec384fp2 b) { vec384x t; mul_fp2(t, a[2], b); mul_fp2(ret[2], a[1], b); mul_fp2(ret[1], a[0], b); mul_by_u_plus_1_fp2(ret[0], t); } static void mul_by_xy0_fp6(vec384fp6 ret, const vec384fp6 a, const vec384fp6 b) { vec384x t0, t1, /*t2,*/ t3, t4, t5; mul_fp2(t0, a[0], b[0]); mul_fp2(t1, a[1], b[1]); /* ret[0] = ((a1 + a2)*(b1 + 0) - a1*b1 - a2*0)*(u+1) + a0*b0 = (a1*0 + a2*b1)*(u+1) + a0*b0 */ mul_fp2(t3, a[2], b[1]); mul_by_u_plus_1_fp2(t3, t3); /* add_fp2(ret[0], t3, t0); considering possible aliasing... */ /* ret[1] = (a0 + a1)*(b0 + b1) - a0*b0 - a1*b1 + a2*0*(u+1) = a0*b1 + a1*b0 + a2*0*(u+1) */ add_fp2(t4, a[0], a[1]); add_fp2(t5, b[0], b[1]); mul_fp2(ret[1], t4, t5); sub_fp2(ret[1], ret[1], t0); sub_fp2(ret[1], ret[1], t1); /* ret[2] = (a0 + a2)*(b0 + 0) - a0*b0 - a2*0 + a1*b1 = a0*0 + a2*b0 + a1*b1 */ mul_fp2(ret[2], a[2], b[0]); add_fp2(ret[2], ret[2], t1); add_fp2(ret[0], t3, t0); /* ... 
moved from above */ } static void mul_by_xy00z0_fp12(vec384fp12 ret, const vec384fp12 a, const vec384fp6 xy00z0) { vec384fp6 t0, t1, t2; mul_by_xy0_fp6(t0, a[0], xy00z0); mul_by_0y0_fp6(t1, a[1], xy00z0[2]); /* ret[1] = (a0 + a1)*(b0 + b1) - a0*b0 - a1*b1 = a0*b1 + a1*b0 */ vec_copy(t2[0], xy00z0[0], sizeof(t2[0])); add_fp2(t2[1], xy00z0[1], xy00z0[2]); add_fp6(ret[1], a[0], a[1]); mul_by_xy0_fp6(ret[1], ret[1], t2); sub_fp6(ret[1], ret[1], t0); sub_fp6(ret[1], ret[1], t1); /* ret[0] = a0*b0 + a1*b1*v */ #ifdef mul_by_v_fp6 mul_by_v_fp6(t1, t1); add_fp6(ret[0], t0, t1); #else mul_by_u_plus_1_fp2(t1[2], t1[2]); add_fp2(ret[0][0], t0[0], t1[2]); add_fp2(ret[0][1], t0[1], t1[0]); add_fp2(ret[0][2], t0[2], t1[1]); #endif } #endif static void sqr_fp12(vec384fp12 ret, const vec384fp12 a) { vec384fp6 t0, t1; add_fp6(t0, a[0], a[1]); #ifdef mul_by_v_fp6 mul_by_v_fp6(t1, a[1]); add_fp6(t1, a[0], t1); #else mul_by_u_plus_1_fp2(t1[2], a[1][2]); add_fp2(t1[0], a[0][0], t1[2]); add_fp2(t1[1], a[0][1], a[1][0]); add_fp2(t1[2], a[0][2], a[1][1]); #endif mul_fp6(t0, t0, t1); mul_fp6(t1, a[0], a[1]); /* ret[1] = 2*(a0*a1) */ add_fp6(ret[1], t1, t1); /* ret[0] = (a0 + a1)*(a0 + a1*v) - a0*a1 - a0*a1*v = a0^2 + a1^2*v */ sub_fp6(ret[0], t0, t1); #ifdef mul_by_v_fp6 mul_by_v_fp6(t1, t1); sub_fp6(ret[0], ret[0], t1); #else mul_by_u_plus_1_fp2(t1[2], t1[2]); sub_fp2(ret[0][0], ret[0][0], t1[2]); sub_fp2(ret[0][1], ret[0][1], t1[0]); sub_fp2(ret[0][2], ret[0][2], t1[1]); #endif } static void conjugate_fp12(vec384fp12 a) { neg_fp6(a[1], a[1]); } static void inverse_fp6(vec384fp6 ret, const vec384fp6 a) { vec384x c0, c1, c2, t0, t1; /* c0 = a0^2 - (a1*a2)*(u+1) */ sqr_fp2(c0, a[0]); mul_fp2(t0, a[1], a[2]); mul_by_u_plus_1_fp2(t0, t0); sub_fp2(c0, c0, t0); /* c1 = a2^2*(u+1) - (a0*a1) */ sqr_fp2(c1, a[2]); mul_by_u_plus_1_fp2(c1, c1); mul_fp2(t0, a[0], a[1]); sub_fp2(c1, c1, t0); /* c2 = a1^2 - a0*a2 */ sqr_fp2(c2, a[1]); mul_fp2(t0, a[0], a[2]); sub_fp2(c2, c2, t0); /* (a2*c1 + a1*c2)*(u+1) + a0*c0 */ mul_fp2(t0, c1, a[2]); mul_fp2(t1, c2, a[1]); add_fp2(t0, t0, t1); mul_by_u_plus_1_fp2(t0, t0); mul_fp2(t1, c0, a[0]); add_fp2(t0, t0, t1); reciprocal_fp2(t1, t0); mul_fp2(ret[0], c0, t1); mul_fp2(ret[1], c1, t1); mul_fp2(ret[2], c2, t1); } static void inverse_fp12(vec384fp12 ret, const vec384fp12 a) { vec384fp6 t0, t1; sqr_fp6(t0, a[0]); sqr_fp6(t1, a[1]); #ifdef mul_by_v_fp6 mul_by_v_fp6(t1, t1); sub_fp6(t0, t0, t1); #else mul_by_u_plus_1_fp2(t1[2], t1[2]); sub_fp2(t0[0], t0[0], t1[2]); sub_fp2(t0[1], t0[1], t1[0]); sub_fp2(t0[2], t0[2], t1[1]); #endif inverse_fp6(t1, t0); mul_fp6(ret[0], a[0], t1); mul_fp6(ret[1], a[1], t1); neg_fp6(ret[1], ret[1]); } typedef vec384x vec384fp4[2]; #if defined(__FP2x2__) static void sqr_fp4(vec384fp4 ret, const vec384x a0, const vec384x a1) { vec768x t0, t1, t2; sqr_fp2x2(t0, a0); sqr_fp2x2(t1, a1); add_fp2(ret[1], a0, a1); mul_by_u_plus_1_fp2x2(t2, t1); add_fp2x2(t2, t2, t0); redc_fp2x2(ret[0], t2); sqr_fp2x2(t2, ret[1]); sub_fp2x2(t2, t2, t0); sub_fp2x2(t2, t2, t1); redc_fp2x2(ret[1], t2); } #else static void sqr_fp4(vec384fp4 ret, const vec384x a0, const vec384x a1) { vec384x t0, t1; sqr_fp2(t0, a0); sqr_fp2(t1, a1); add_fp2(ret[1], a0, a1); mul_by_u_plus_1_fp2(ret[0], t1); add_fp2(ret[0], ret[0], t0); sqr_fp2(ret[1], ret[1]); sub_fp2(ret[1], ret[1], t0); sub_fp2(ret[1], ret[1], t1); } #endif static void cyclotomic_sqr_fp12(vec384fp12 ret, const vec384fp12 a) { vec384fp4 t0, t1, t2; sqr_fp4(t0, a[0][0], a[1][1]); sqr_fp4(t1, a[1][0], a[0][2]); sqr_fp4(t2, a[0][1], a[1][2]); 
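/* compressed squaring in the cyclotomic subgroup, cf. Granger-Scott, "Faster squaring in the cyclotomic subgroup of sixth degree extensions": the element is viewed as the three Fp4 pairs squared above, and each output coefficient below is reassembled as 3*t - 2*a for the ret[0] row and 3*t + 2*a for the ret[1] row, with t2[1] pre-multiplied by (u+1) */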
sub_fp2(ret[0][0], t0[0], a[0][0]); add_fp2(ret[0][0], ret[0][0], ret[0][0]); add_fp2(ret[0][0], ret[0][0], t0[0]); sub_fp2(ret[0][1], t1[0], a[0][1]); add_fp2(ret[0][1], ret[0][1], ret[0][1]); add_fp2(ret[0][1], ret[0][1], t1[0]); sub_fp2(ret[0][2], t2[0], a[0][2]); add_fp2(ret[0][2], ret[0][2], ret[0][2]); add_fp2(ret[0][2], ret[0][2], t2[0]); mul_by_u_plus_1_fp2(t2[1], t2[1]); add_fp2(ret[1][0], t2[1], a[1][0]); add_fp2(ret[1][0], ret[1][0], ret[1][0]); add_fp2(ret[1][0], ret[1][0], t2[1]); add_fp2(ret[1][1], t0[1], a[1][1]); add_fp2(ret[1][1], ret[1][1], ret[1][1]); add_fp2(ret[1][1], ret[1][1], t0[1]); add_fp2(ret[1][2], t1[1], a[1][2]); add_fp2(ret[1][2], ret[1][2], ret[1][2]); add_fp2(ret[1][2], ret[1][2], t1[1]); } /* * caveat lector! |n| has to be non-zero and not more than 3! */ static inline void frobenius_map_fp2(vec384x ret, const vec384x a, size_t n) { vec_copy(ret[0], a[0], sizeof(ret[0])); cneg_fp(ret[1], a[1], n & 1); } static void frobenius_map_fp6(vec384fp6 ret, const vec384fp6 a, size_t n) { static const vec384x coeffs1[] = { /* (u + 1)^((P^n - 1) / 3) */ { { 0 }, { TO_LIMB_T(0xcd03c9e48671f071), TO_LIMB_T(0x5dab22461fcda5d2), TO_LIMB_T(0x587042afd3851b95), TO_LIMB_T(0x8eb60ebe01bacb9e), TO_LIMB_T(0x03f97d6e83d050d2), TO_LIMB_T(0x18f0206554638741) } }, { { TO_LIMB_T(0x30f1361b798a64e8), TO_LIMB_T(0xf3b8ddab7ece5a2a), TO_LIMB_T(0x16a8ca3ac61577f7), TO_LIMB_T(0xc26a2ff874fd029b), TO_LIMB_T(0x3636b76660701c6e), TO_LIMB_T(0x051ba4ab241b6160) } }, { { 0 }, { ONE_MONT_P } } }; static const vec384 coeffs2[] = { /* (u + 1)^((2P^n - 2) / 3) */ { TO_LIMB_T(0x890dc9e4867545c3), TO_LIMB_T(0x2af322533285a5d5), TO_LIMB_T(0x50880866309b7e2c), TO_LIMB_T(0xa20d1b8c7e881024), TO_LIMB_T(0x14e4f04fe2db9068), TO_LIMB_T(0x14e56d3f1564853a) }, { TO_LIMB_T(0xcd03c9e48671f071), TO_LIMB_T(0x5dab22461fcda5d2), TO_LIMB_T(0x587042afd3851b95), TO_LIMB_T(0x8eb60ebe01bacb9e), TO_LIMB_T(0x03f97d6e83d050d2), TO_LIMB_T(0x18f0206554638741) }, { TO_LIMB_T(0x43f5fffffffcaaae), TO_LIMB_T(0x32b7fff2ed47fffd), TO_LIMB_T(0x07e83a49a2e99d69), TO_LIMB_T(0xeca8f3318332bb7a), TO_LIMB_T(0xef148d1ea0f4c069), TO_LIMB_T(0x040ab3263eff0206) } }; frobenius_map_fp2(ret[0], a[0], n); frobenius_map_fp2(ret[1], a[1], n); frobenius_map_fp2(ret[2], a[2], n); --n; /* implied ONE_MONT_P at index 0 */ mul_fp2(ret[1], ret[1], coeffs1[n]); mul_fp(ret[2][0], ret[2][0], coeffs2[n]); mul_fp(ret[2][1], ret[2][1], coeffs2[n]); } static void frobenius_map_fp12(vec384fp12 ret, const vec384fp12 a, size_t n) { static const vec384x coeffs[] = { /* (u + 1)^((P^n - 1) / 6) */ { { TO_LIMB_T(0x07089552b319d465), TO_LIMB_T(0xc6695f92b50a8313), TO_LIMB_T(0x97e83cccd117228f), TO_LIMB_T(0xa35baecab2dc29ee), TO_LIMB_T(0x1ce393ea5daace4d), TO_LIMB_T(0x08f2220fb0fb66eb) }, { TO_LIMB_T(0xb2f66aad4ce5d646), TO_LIMB_T(0x5842a06bfc497cec), TO_LIMB_T(0xcf4895d42599d394), TO_LIMB_T(0xc11b9cba40a8e8d0), TO_LIMB_T(0x2e3813cbe5a0de89), TO_LIMB_T(0x110eefda88847faf) } }, { { TO_LIMB_T(0xecfb361b798dba3a), TO_LIMB_T(0xc100ddb891865a2c), TO_LIMB_T(0x0ec08ff1232bda8e), TO_LIMB_T(0xd5c13cc6f1ca4721), TO_LIMB_T(0x47222a47bf7b5c04), TO_LIMB_T(0x0110f184e51c5f59) } }, { { TO_LIMB_T(0x3e2f585da55c9ad1), TO_LIMB_T(0x4294213d86c18183), TO_LIMB_T(0x382844c88b623732), TO_LIMB_T(0x92ad2afd19103e18), TO_LIMB_T(0x1d794e4fac7cf0b9), TO_LIMB_T(0x0bd592fc7d825ec8) }, { TO_LIMB_T(0x7bcfa7a25aa30fda), TO_LIMB_T(0xdc17dec12a927e7c), TO_LIMB_T(0x2f088dd86b4ebef1), TO_LIMB_T(0xd1ca2087da74d4a7), TO_LIMB_T(0x2da2596696cebc1d), TO_LIMB_T(0x0e2b7eedbbfd87d2) } }, }; 
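/* coeffs[] above is indexed by n-1, hence the --n below; the n==2 constant lies in Fp, so only its "real" half is spelled out, the imaginary one being implicitly zero */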
frobenius_map_fp6(ret[0], a[0], n); frobenius_map_fp6(ret[1], a[1], n); --n; /* implied ONE_MONT_P at index 0 */ mul_fp2(ret[1][0], ret[1][0], coeffs[n]); mul_fp2(ret[1][1], ret[1][1], coeffs[n]); mul_fp2(ret[1][2], ret[1][2], coeffs[n]); } /* * BLS12-381-specific Fp12 shortcuts. */ void blst_fp12_sqr(vec384fp12 ret, const vec384fp12 a) { sqr_fp12(ret, a); } void blst_fp12_cyclotomic_sqr(vec384fp12 ret, const vec384fp12 a) { cyclotomic_sqr_fp12(ret, a); } void blst_fp12_mul(vec384fp12 ret, const vec384fp12 a, const vec384fp12 b) { mul_fp12(ret, a, b); } void blst_fp12_mul_by_xy00z0(vec384fp12 ret, const vec384fp12 a, const vec384fp6 xy00z0) { mul_by_xy00z0_fp12(ret, a, xy00z0); } void blst_fp12_conjugate(vec384fp12 a) { conjugate_fp12(a); } void blst_fp12_inverse(vec384fp12 ret, const vec384fp12 a) { inverse_fp12(ret, a); } /* caveat lector! |n| has to be non-zero and not more than 3! */ void blst_fp12_frobenius_map(vec384fp12 ret, const vec384fp12 a, size_t n) { frobenius_map_fp12(ret, a, n); } int blst_fp12_is_equal(const vec384fp12 a, const vec384fp12 b) { return (int)vec_is_equal(a, b, sizeof(vec384fp12)); } int blst_fp12_is_one(const vec384fp12 a) { return (int)(vec_is_equal(a[0][0], BLS12_381_Rx.p2, sizeof(a[0][0])) & vec_is_zero(a[0][1], sizeof(vec384fp12) - sizeof(a[0][0]))); } const vec384fp12 *blst_fp12_one(void) { return (const vec384fp12 *)BLS12_381_Rx.p12; } void blst_bendian_from_fp12(unsigned char ret[48*12], const vec384fp12 a) { size_t i, j; vec384 out; for (i = 0; i < 3; i++) { for (j = 0; j < 2; j++) { from_fp(out, a[j][i][0]); be_bytes_from_limbs(ret, out, sizeof(vec384)); ret += 48; from_fp(out, a[j][i][1]); be_bytes_from_limbs(ret, out, sizeof(vec384)); ret += 48; } } } size_t blst_fp12_sizeof(void) { return sizeof(vec384fp12); } ================================================ FILE: src/hash_to_field.c ================================================ /* * Copyright Supranational LLC * Licensed under the Apache License, Version 2.0, see LICENSE for details. 
* SPDX-License-Identifier: Apache-2.0 */ #include "consts.h" #include "sha256.h" static const vec384 BLS12_381_RRRR = { /* RR^2 */ TO_LIMB_T(0xed48ac6bd94ca1e0), TO_LIMB_T(0x315f831e03a7adf8), TO_LIMB_T(0x9a53352a615e29dd), TO_LIMB_T(0x34c04e5e921e1761), TO_LIMB_T(0x2512d43565724728), TO_LIMB_T(0x0aa6346091755d4d) }; #ifdef expand_message_xmd void expand_message_xmd(unsigned char *bytes, size_t len_in_bytes, const unsigned char *aug, size_t aug_len, const unsigned char *msg, size_t msg_len, const unsigned char *DST, size_t DST_len); #else static void sha256_init_Zpad(SHA256_CTX *ctx) { ctx->h[0] = 0xda5698beU; ctx->h[1] = 0x17b9b469U; ctx->h[2] = 0x62335799U; ctx->h[3] = 0x779fbecaU; ctx->h[4] = 0x8ce5d491U; ctx->h[5] = 0xc0d26243U; ctx->h[6] = 0xbafef9eaU; ctx->h[7] = 0x1837a9d8U; ctx->N = 64; vec_zero(ctx->buf, sizeof(ctx->buf)); ctx->off = 0; } static void vec_xor(void *restrict ret, const void *restrict a, const void *restrict b, size_t num) { limb_t *rp = (limb_t *)ret; const limb_t *ap = (const limb_t *)a; const limb_t *bp = (const limb_t *)b; size_t i; num /= sizeof(limb_t); for (i = 0; i < num; i++) rp[i] = ap[i] ^ bp[i]; } static void expand_message_xmd(unsigned char *bytes, size_t len_in_bytes, const unsigned char *aug, size_t aug_len, const unsigned char *msg, size_t msg_len, const unsigned char *DST, size_t DST_len) { union { limb_t align; unsigned char c[32]; } b_0; union { limb_t align; unsigned char c[33+256+31]; } b_i; unsigned char *p; size_t i, b_i_bits, b_i_blocks; SHA256_CTX ctx; /* * compose template for 'strxor(b_0, b_(i-1)) || I2OSP(i, 1) || DST_prime' */ if (DST_len > 255) { sha256_init(&ctx); sha256_update(&ctx, "H2C-OVERSIZE-DST-", 17); sha256_update(&ctx, DST, DST_len); sha256_final(b_0.c, &ctx); DST = b_0.c, DST_len = 32; } b_i_blocks = ((33 + DST_len + 1 + 9) + 63) & -64; vec_zero(b_i.c + b_i_blocks - 64, 64); p = b_i.c + 33; for (i = 0; i < DST_len; i++) p[i] = DST[i]; p[i++] = (unsigned char)DST_len; p[i++] = 0x80; p[i+6] = p[i+5] = p[i+4] = p[i+3] = p[i+2] = p[i+1] = p[i+0] = 0; b_i_bits = (33 + DST_len + 1) * 8; p = b_i.c + b_i_blocks; p[-2] = (unsigned char)(b_i_bits >> 8); p[-1] = (unsigned char)(b_i_bits); sha256_init_Zpad(&ctx); /* Z_pad | */ sha256_update(&ctx, aug, aug_len); /* | aug | */ sha256_update(&ctx, msg, msg_len); /* | msg | */ /* | I2OSP(len_in_bytes, 2) || I2OSP(0, 1) || DST_prime */ b_i.c[30] = (unsigned char)(len_in_bytes >> 8); b_i.c[31] = (unsigned char)(len_in_bytes); b_i.c[32] = 0; sha256_update(&ctx, b_i.c + 30, 3 + DST_len + 1); sha256_final(b_0.c, &ctx); sha256_init_h(ctx.h); vec_copy(b_i.c, b_0.c, 32); ++b_i.c[32]; sha256_block_data_order(ctx.h, b_i.c, b_i_blocks / 64); sha256_emit(bytes, ctx.h); len_in_bytes += 31; /* ell = ceil(len_in_bytes / b_in_bytes), with */ len_in_bytes /= 32; /* caller being responsible for accordingly large * buffer. hash_to_field passes one with length * divisible by 64, remember? which works... */ while (--len_in_bytes) { sha256_init_h(ctx.h); vec_xor(b_i.c, b_0.c, bytes, 32); bytes += 32; ++b_i.c[32]; sha256_block_data_order(ctx.h, b_i.c, b_i_blocks / 64); sha256_emit(bytes, ctx.h); } } #endif /* * |nelems| is 'count * m' from spec */ static void hash_to_field(vec384 elems[], size_t nelems, const unsigned char *aug, size_t aug_len, const unsigned char *msg, size_t msg_len, const unsigned char *DST, size_t DST_len) { size_t L = sizeof(vec384) + 128/8; /* ceil((ceil(log2(p)) + k) / 8) */ size_t len_in_bytes = L * nelems; /* divisible by 64, hurray! 
*/ #if !defined(__STDC_VERSION__) || __STDC_VERSION__<199901 \ || defined(__STDC_NO_VLA__) limb_t *pseudo_random = alloca(len_in_bytes); #else limb_t pseudo_random[len_in_bytes/sizeof(limb_t)]; #endif unsigned char *bytes; vec768 elem; aug_len = aug!=NULL ? aug_len : 0; DST_len = DST!=NULL ? DST_len : 0; expand_message_xmd((unsigned char *)pseudo_random, len_in_bytes, aug, aug_len, msg, msg_len, DST, DST_len); vec_zero(elem, sizeof(elem)); bytes = (unsigned char *)pseudo_random; while (nelems--) { limbs_from_be_bytes(elem, bytes, L); bytes += L; /* * L-bytes block % P, output is in Montgomery domain... */ redc_mont_384(elems[0], elem, BLS12_381_P, p0); mul_mont_384(elems[0], elems[0], BLS12_381_RRRR, BLS12_381_P, p0); elems++; } } void blst_expand_message_xmd(unsigned char *bytes, size_t len_in_bytes, const unsigned char *msg, size_t msg_len, const unsigned char *DST, size_t DST_len) { size_t buf_len = (len_in_bytes+31) & ((size_t)0-32); unsigned char *buf_ptr = bytes; if (buf_len > 255*32) return; if (buf_len != len_in_bytes) buf_ptr = alloca(buf_len); expand_message_xmd(buf_ptr, len_in_bytes, NULL, 0, msg, msg_len, DST, DST_len); if (buf_ptr != bytes) { unsigned char *ptr = buf_ptr; while (len_in_bytes--) *bytes++ = *ptr++; vec_zero(buf_ptr, buf_len); } } ================================================ FILE: src/keygen.c ================================================ /* * Copyright Supranational LLC * Licensed under the Apache License, Version 2.0, see LICENSE for details. * SPDX-License-Identifier: Apache-2.0 */ #include "consts.h" #include "bytes.h" #include "sha256.h" typedef struct { SHA256_CTX ctx; unsigned int h_ipad[8]; unsigned int h_opad[8]; union { limb_t l[64/sizeof(limb_t)]; unsigned char c[64]; } tail; } HMAC_SHA256_CTX; static void HMAC_init(HMAC_SHA256_CTX *ctx, const void *K, size_t K_len) { size_t i; if (K == NULL) { /* reuse h_ipad and h_opad */ sha256_hcopy(ctx->ctx.h, ctx->h_ipad); ctx->ctx.N = 64; vec_zero(ctx->ctx.buf, sizeof(ctx->ctx.buf)); ctx->ctx.off = 0; return; } vec_zero(ctx->tail.c, sizeof(ctx->tail)); if (K_len > 64) { sha256_init(&ctx->ctx); sha256_update(&ctx->ctx, K, K_len); sha256_final(ctx->tail.c, &ctx->ctx); } else if (K_len != 0) { sha256_bcopy(ctx->tail.c, K, K_len); } for (i = 0; i < 64/sizeof(limb_t); i++) ctx->tail.l[i] ^= (limb_t)0x3636363636363636; sha256_init(&ctx->ctx); sha256_update(&ctx->ctx, ctx->tail.c, 64); sha256_hcopy(ctx->h_ipad, ctx->ctx.h); for (i = 0; i < 64/sizeof(limb_t); i++) ctx->tail.l[i] ^= (limb_t)(0x3636363636363636 ^ 0x5c5c5c5c5c5c5c5c); sha256_init_h(ctx->h_opad); sha256_block_data_order(ctx->h_opad, ctx->tail.c, 1); vec_zero(ctx->tail.c, sizeof(ctx->tail)); ctx->tail.c[32] = 0x80; ctx->tail.c[62] = 3; /* (64+32)*8 in big endian */ ctx->tail.c[63] = 0; } static void HMAC_update(HMAC_SHA256_CTX *ctx, const unsigned char *inp, size_t len) { sha256_update(&ctx->ctx, inp, len); } static void HMAC_final(unsigned char md[32], HMAC_SHA256_CTX *ctx) { sha256_final(ctx->tail.c, &ctx->ctx); sha256_hcopy(ctx->ctx.h, ctx->h_opad); sha256_block_data_order(ctx->ctx.h, ctx->tail.c, 1); sha256_emit(md, ctx->ctx.h); } static void HKDF_Extract(unsigned char PRK[32], const void *salt, size_t salt_len, const void *IKM, size_t IKM_len, #ifndef __BLST_HKDF_TESTMODE__ int IKM_fixup, #endif HMAC_SHA256_CTX *ctx) { unsigned char zero[1] = { 0 }; HMAC_init(ctx, salt != NULL ? 
salt : zero, salt_len); HMAC_update(ctx, IKM, IKM_len); #ifndef __BLST_HKDF_TESTMODE__ if (IKM_fixup) { /* Section 2.3 KeyGen in BLS-signature draft */ HMAC_update(ctx, zero, 1); } #endif HMAC_final(PRK, ctx); } static void HKDF_Expand(unsigned char *OKM, size_t L, const unsigned char PRK[32], const void *info, size_t info_len, #ifndef __BLST_HKDF_TESTMODE__ int info_fixup, #endif HMAC_SHA256_CTX *ctx) { #if !defined(__STDC_VERSION__) || __STDC_VERSION__<199901 \ || defined(__STDC_NO_VLA__) unsigned char *info_prime = alloca(info_len + 2 + 1); #else unsigned char info_prime[info_len + 2 + 1]; #endif HMAC_init(ctx, PRK, 32); if (info_len != 0) sha256_bcopy(info_prime, info, info_len); #ifndef __BLST_HKDF_TESTMODE__ if (info_fixup) { /* Section 2.3 KeyGen in BLS-signature draft */ info_prime[info_len + 0] = (unsigned char)(L >> 8); info_prime[info_len + 1] = (unsigned char)(L); info_len += 2; } #endif info_prime[info_len] = 1; /* counter */ HMAC_update(ctx, info_prime, info_len + 1); HMAC_final(ctx->tail.c, ctx); while (L > 32) { sha256_hcopy((unsigned int *)OKM, (const unsigned int *)ctx->tail.c); OKM += 32; L -= 32; ++info_prime[info_len]; /* counter */ HMAC_init(ctx, NULL, 0); HMAC_update(ctx, ctx->tail.c, 32); HMAC_update(ctx, info_prime, info_len + 1); HMAC_final(ctx->tail.c, ctx); } sha256_bcopy(OKM, ctx->tail.c, L); } #ifndef __BLST_HKDF_TESTMODE__ static void keygen(pow256 SK, const void *IKM, size_t IKM_len, const void *salt, size_t salt_len, const void *info, size_t info_len, int version) { struct { HMAC_SHA256_CTX ctx; unsigned char PRK[32], OKM[48]; vec512 key; } scratch; unsigned char salt_prime[32] = "BLS-SIG-KEYGEN-SALT-"; if (IKM_len < 32 || (version > 4 && salt == NULL)) { vec_zero(SK, sizeof(pow256)); return; } /* * Vet |info| since some callers were caught to be sloppy, e.g. * SWIG-4.0-generated Python wrapper... */ info_len = info==NULL ? 0 : info_len; if (salt == NULL) { salt = salt_prime; salt_len = 20; } if (version == 4) { /* salt = H(salt) */ sha256_init(&scratch.ctx.ctx); sha256_update(&scratch.ctx.ctx, salt, salt_len); sha256_final(salt_prime, &scratch.ctx.ctx); salt = salt_prime; salt_len = sizeof(salt_prime); } while (1) { /* PRK = HKDF-Extract(salt, IKM || I2OSP(0, 1)) */ HKDF_Extract(scratch.PRK, salt, salt_len, IKM, IKM_len, 1, &scratch.ctx); /* OKM = HKDF-Expand(PRK, key_info || I2OSP(L, 2), L) */ HKDF_Expand(scratch.OKM, sizeof(scratch.OKM), scratch.PRK, info, info_len, 1, &scratch.ctx); /* SK = OS2IP(OKM) mod r */ vec_zero(scratch.key, sizeof(scratch.key)); limbs_from_be_bytes(scratch.key, scratch.OKM, sizeof(scratch.OKM)); redc_mont_256(scratch.key, scratch.key, BLS12_381_r, r0); /* * Given that mul_mont_sparse_256 has special boundary conditions * it's appropriate to mention that redc_mont_256 output is fully * reduced at this point. Because we started with 384-bit input, * one with most significant half smaller than the modulus. */ mul_mont_sparse_256(scratch.key, scratch.key, BLS12_381_rRR, BLS12_381_r, r0); if (version < 4 || !vec_is_zero(scratch.key, sizeof(vec256))) break; /* salt = H(salt) */ sha256_init(&scratch.ctx.ctx); sha256_update(&scratch.ctx.ctx, salt, salt_len); sha256_final(salt_prime, &scratch.ctx.ctx); salt = salt_prime; salt_len = sizeof(salt_prime); } le_bytes_from_limbs(SK, scratch.key, sizeof(pow256)); /* * scrub the stack just in case next callee inadvertently flashes * a fragment across application boundary... 
*/ vec_zero(&scratch, sizeof(scratch)); } void blst_keygen(pow256 SK, const void *IKM, size_t IKM_len, const void *info, size_t info_len) { keygen(SK, IKM, IKM_len, NULL, 0, info, info_len, 4); } void blst_keygen_v3(pow256 SK, const void *IKM, size_t IKM_len, const void *info, size_t info_len) { keygen(SK, IKM, IKM_len, NULL, 0, info, info_len, 3); } void blst_keygen_v4_5(pow256 SK, const void *IKM, size_t IKM_len, const void *salt, size_t salt_len, const void *info, size_t info_len) { keygen(SK, IKM, IKM_len, salt, salt_len, info, info_len, 4); } void blst_keygen_v5(pow256 SK, const void *IKM, size_t IKM_len, const void *salt, size_t salt_len, const void *info, size_t info_len) { keygen(SK, IKM, IKM_len, salt, salt_len, info, info_len, 5); } /* * https://eips.ethereum.org/EIPS/eip-2333 */ void blst_derive_master_eip2333(pow256 SK, const void *seed, size_t seed_len) { keygen(SK, seed, seed_len, NULL, 0, NULL, 0, 4); } static void parent_SK_to_lamport_PK(pow256 PK, const pow256 parent_SK, unsigned int index) { size_t i; struct { HMAC_SHA256_CTX ctx; SHA256_CTX ret; unsigned char PRK[32], IKM[32]; unsigned char lamport[255][32]; } scratch; /* salt = I2OSP(index, 4) */ unsigned char salt[4] = { (unsigned char)(index>>24), (unsigned char)(index>>16), (unsigned char)(index>>8), (unsigned char)(index) }; /* IKM = I2OSP(parent_SK, 32) */ for (i = 0; i < 32; i++) scratch.IKM[i] = parent_SK[31-i]; /* lamport_0 = IKM_to_lamport_SK(IKM, salt) */ HKDF_Extract(scratch.PRK, salt, sizeof(salt), scratch.IKM, 32, 0, &scratch.ctx); HKDF_Expand(scratch.lamport[0], sizeof(scratch.lamport), scratch.PRK, NULL, 0, 0, &scratch.ctx); vec_zero(scratch.ctx.ctx.buf, sizeof(scratch.ctx.ctx.buf)); scratch.ctx.ctx.buf[32] = 0x80; scratch.ctx.ctx.buf[62] = 1; /* 32*8 in big endian */ scratch.ctx.ctx.buf[63] = 0; for (i = 0; i < 255; i++) { /* lamport_PK = lamport_PK | SHA256(lamport_0[i]) */ sha256_init_h(scratch.ctx.ctx.h); sha256_bcopy(scratch.ctx.ctx.buf, scratch.lamport[i], 32); sha256_block_data_order(scratch.ctx.ctx.h, scratch.ctx.ctx.buf, 1); sha256_emit(scratch.lamport[i], scratch.ctx.ctx.h); } /* compressed_lamport_PK = SHA256(lamport_PK) */ sha256_init(&scratch.ret); sha256_update(&scratch.ret, scratch.lamport, sizeof(scratch.lamport)); /* not_IKM = flip_bits(IKM) */ for (i = 0; i< 32; i++) scratch.IKM[i] = ~scratch.IKM[i]; /* lamport_1 = IKM_to_lamport_SK(not_IKM, salt) */ HKDF_Extract(scratch.PRK, salt, sizeof(salt), scratch.IKM, 32, 0, &scratch.ctx); HKDF_Expand(scratch.lamport[0], sizeof(scratch.lamport), scratch.PRK, NULL, 0, 0, &scratch.ctx); vec_zero(scratch.ctx.ctx.buf, sizeof(scratch.ctx.ctx.buf)); scratch.ctx.ctx.buf[32] = 0x80; scratch.ctx.ctx.buf[62] = 1; for (i = 0; i < 255; i++) { /* lamport_PK = lamport_PK | SHA256(lamport_1[i]) */ sha256_init_h(scratch.ctx.ctx.h); sha256_bcopy(scratch.ctx.ctx.buf, scratch.lamport[i], 32); sha256_block_data_order(scratch.ctx.ctx.h, scratch.ctx.ctx.buf, 1); sha256_emit(scratch.lamport[i], scratch.ctx.ctx.h); } /* compressed_lamport_PK = SHA256(lamport_PK) */ sha256_update(&scratch.ret, scratch.lamport, sizeof(scratch.lamport)); sha256_final(PK, &scratch.ret); /* * scrub the stack just in case next callee inadvertently flashes * a fragment across application boundary... 
*/ vec_zero(&scratch, sizeof(scratch)); } void blst_derive_child_eip2333(pow256 SK, const pow256 parent_SK, unsigned int child_index) { parent_SK_to_lamport_PK(SK, parent_SK, child_index); keygen(SK, SK, sizeof(pow256), NULL, 0, NULL, 0, 4); } #endif ================================================ FILE: src/map_to_g1.c ================================================ /* * Copyright Supranational LLC * Licensed under the Apache License, Version 2.0, see LICENSE for details. * SPDX-License-Identifier: Apache-2.0 */ #include "point.h" #include "fields.h" /* * y^2 = x^3 + A'*x + B', isogenous one */ static const vec384 Aprime_E1 = { /* (0x00144698a3b8e9433d693a02c96d4982b0ea985383ee66a8 d8e8981aefd881ac98936f8da0e0f97f5cf428082d584c1d << 384) % P */ TO_LIMB_T(0x2f65aa0e9af5aa51), TO_LIMB_T(0x86464c2d1e8416c3), TO_LIMB_T(0xb85ce591b7bd31e2), TO_LIMB_T(0x27e11c91b5f24e7c), TO_LIMB_T(0x28376eda6bfc1835), TO_LIMB_T(0x155455c3e5071d85) }; static const vec384 Bprime_E1 = { /* (0x12e2908d11688030018b12e8753eee3b2016c1f0f24f4070 a0b9c14fcef35ef55a23215a316ceaa5d1cc48e98e172be0 << 384) % P */ TO_LIMB_T(0xfb996971fe22a1e0), TO_LIMB_T(0x9aa93eb35b742d6f), TO_LIMB_T(0x8c476013de99c5c4), TO_LIMB_T(0x873e27c3a221e571), TO_LIMB_T(0xca72b5e45a52d888), TO_LIMB_T(0x06824061418a386b) }; static void map_fp_times_Zz(vec384 map[], const vec384 isogeny_map[], const vec384 Zz_powers[], size_t n) { while (n--) mul_fp(map[n], isogeny_map[n], Zz_powers[n]); } static void map_fp(vec384 acc, const vec384 x, const vec384 map[], size_t n) { while (n--) { mul_fp(acc, acc, x); add_fp(acc, acc, map[n]); } } static void isogeny_map_to_E1(POINTonE1 *out, const POINTonE1 *p) { /* * x = x_num / x_den, where * x_num = k_(1,11) * x'^11 + k_(1,10) * x'^10 + k_(1,9) * x'^9 + * ... + k_(1,0) * ... 
*/ static const vec384 isogeny_map_x_num[] = { /* (k_(1,*)<<384) % P */ { TO_LIMB_T(0x4d18b6f3af00131c), TO_LIMB_T(0x19fa219793fee28c), TO_LIMB_T(0x3f2885f1467f19ae), TO_LIMB_T(0x23dcea34f2ffb304), TO_LIMB_T(0xd15b58d2ffc00054), TO_LIMB_T(0x0913be200a20bef4) }, { TO_LIMB_T(0x898985385cdbbd8b), TO_LIMB_T(0x3c79e43cc7d966aa), TO_LIMB_T(0x1597e193f4cd233a), TO_LIMB_T(0x8637ef1e4d6623ad), TO_LIMB_T(0x11b22deed20d827b), TO_LIMB_T(0x07097bc5998784ad) }, { TO_LIMB_T(0xa542583a480b664b), TO_LIMB_T(0xfc7169c026e568c6), TO_LIMB_T(0x5ba2ef314ed8b5a6), TO_LIMB_T(0x5b5491c05102f0e7), TO_LIMB_T(0xdf6e99707d2a0079), TO_LIMB_T(0x0784151ed7605524) }, { TO_LIMB_T(0x494e212870f72741), TO_LIMB_T(0xab9be52fbda43021), TO_LIMB_T(0x26f5577994e34c3d), TO_LIMB_T(0x049dfee82aefbd60), TO_LIMB_T(0x65dadd7828505289), TO_LIMB_T(0x0e93d431ea011aeb) }, { TO_LIMB_T(0x90ee774bd6a74d45), TO_LIMB_T(0x7ada1c8a41bfb185), TO_LIMB_T(0x0f1a8953b325f464), TO_LIMB_T(0x104c24211be4805c), TO_LIMB_T(0x169139d319ea7a8f), TO_LIMB_T(0x09f20ead8e532bf6) }, { TO_LIMB_T(0x6ddd93e2f43626b7), TO_LIMB_T(0xa5482c9aa1ccd7bd), TO_LIMB_T(0x143245631883f4bd), TO_LIMB_T(0x2e0a94ccf77ec0db), TO_LIMB_T(0xb0282d480e56489f), TO_LIMB_T(0x18f4bfcbb4368929) }, { TO_LIMB_T(0x23c5f0c953402dfd), TO_LIMB_T(0x7a43ff6958ce4fe9), TO_LIMB_T(0x2c390d3d2da5df63), TO_LIMB_T(0xd0df5c98e1f9d70f), TO_LIMB_T(0xffd89869a572b297), TO_LIMB_T(0x1277ffc72f25e8fe) }, { TO_LIMB_T(0x79f4f0490f06a8a6), TO_LIMB_T(0x85f894a88030fd81), TO_LIMB_T(0x12da3054b18b6410), TO_LIMB_T(0xe2a57f6505880d65), TO_LIMB_T(0xbba074f260e400f1), TO_LIMB_T(0x08b76279f621d028) }, { TO_LIMB_T(0xe67245ba78d5b00b), TO_LIMB_T(0x8456ba9a1f186475), TO_LIMB_T(0x7888bff6e6b33bb4), TO_LIMB_T(0xe21585b9a30f86cb), TO_LIMB_T(0x05a69cdcef55feee), TO_LIMB_T(0x09e699dd9adfa5ac) }, { TO_LIMB_T(0x0de5c357bff57107), TO_LIMB_T(0x0a0db4ae6b1a10b2), TO_LIMB_T(0xe256bb67b3b3cd8d), TO_LIMB_T(0x8ad456574e9db24f), TO_LIMB_T(0x0443915f50fd4179), TO_LIMB_T(0x098c4bf7de8b6375) }, { TO_LIMB_T(0xe6b0617e7dd929c7), TO_LIMB_T(0xfe6e37d442537375), TO_LIMB_T(0x1dafdeda137a489e), TO_LIMB_T(0xe4efd1ad3f767ceb), TO_LIMB_T(0x4a51d8667f0fe1cf), TO_LIMB_T(0x054fdf4bbf1d821c) }, { TO_LIMB_T(0x72db2a50658d767b), TO_LIMB_T(0x8abf91faa257b3d5), TO_LIMB_T(0xe969d6833764ab47), TO_LIMB_T(0x464170142a1009eb), TO_LIMB_T(0xb14f01aadb30be2f), TO_LIMB_T(0x18ae6a856f40715d) } }; /* ... * x_den = x'^10 + k_(2,9) * x'^9 + k_(2,8) * x'^8 + ... 
+ k_(2,0) */ static const vec384 isogeny_map_x_den[] = { /* (k_(2,*)<<384) % P */ { TO_LIMB_T(0xb962a077fdb0f945), TO_LIMB_T(0xa6a9740fefda13a0), TO_LIMB_T(0xc14d568c3ed6c544), TO_LIMB_T(0xb43fc37b908b133e), TO_LIMB_T(0x9c0b3ac929599016), TO_LIMB_T(0x0165aa6c93ad115f) }, { TO_LIMB_T(0x23279a3ba506c1d9), TO_LIMB_T(0x92cfca0a9465176a), TO_LIMB_T(0x3b294ab13755f0ff), TO_LIMB_T(0x116dda1c5070ae93), TO_LIMB_T(0xed4530924cec2045), TO_LIMB_T(0x083383d6ed81f1ce) }, { TO_LIMB_T(0x9885c2a6449fecfc), TO_LIMB_T(0x4a2b54ccd37733f0), TO_LIMB_T(0x17da9ffd8738c142), TO_LIMB_T(0xa0fba72732b3fafd), TO_LIMB_T(0xff364f36e54b6812), TO_LIMB_T(0x0f29c13c660523e2) }, { TO_LIMB_T(0xe349cc118278f041), TO_LIMB_T(0xd487228f2f3204fb), TO_LIMB_T(0xc9d325849ade5150), TO_LIMB_T(0x43a92bd69c15c2df), TO_LIMB_T(0x1c2c7844bc417be4), TO_LIMB_T(0x12025184f407440c) }, { TO_LIMB_T(0x587f65ae6acb057b), TO_LIMB_T(0x1444ef325140201f), TO_LIMB_T(0xfbf995e71270da49), TO_LIMB_T(0xccda066072436a42), TO_LIMB_T(0x7408904f0f186bb2), TO_LIMB_T(0x13b93c63edf6c015) }, { TO_LIMB_T(0xfb918622cd141920), TO_LIMB_T(0x4a4c64423ecaddb4), TO_LIMB_T(0x0beb232927f7fb26), TO_LIMB_T(0x30f94df6f83a3dc2), TO_LIMB_T(0xaeedd424d780f388), TO_LIMB_T(0x06cc402dd594bbeb) }, { TO_LIMB_T(0xd41f761151b23f8f), TO_LIMB_T(0x32a92465435719b3), TO_LIMB_T(0x64f436e888c62cb9), TO_LIMB_T(0xdf70a9a1f757c6e4), TO_LIMB_T(0x6933a38d5b594c81), TO_LIMB_T(0x0c6f7f7237b46606) }, { TO_LIMB_T(0x693c08747876c8f7), TO_LIMB_T(0x22c9850bf9cf80f0), TO_LIMB_T(0x8e9071dab950c124), TO_LIMB_T(0x89bc62d61c7baf23), TO_LIMB_T(0xbc6be2d8dad57c23), TO_LIMB_T(0x17916987aa14a122) }, { TO_LIMB_T(0x1be3ff439c1316fd), TO_LIMB_T(0x9965243a7571dfa7), TO_LIMB_T(0xc7f7f62962f5cd81), TO_LIMB_T(0x32c6aa9af394361c), TO_LIMB_T(0xbbc2ee18e1c227f4), TO_LIMB_T(0x0c102cbac531bb34) }, { TO_LIMB_T(0x997614c97bacbf07), TO_LIMB_T(0x61f86372b99192c0), TO_LIMB_T(0x5b8c95fc14353fc3), TO_LIMB_T(0xca2b066c2a87492f), TO_LIMB_T(0x16178f5bbf698711), TO_LIMB_T(0x12a6dcd7f0f4e0e8) } }; /* * y = y' * y_num / y_den, where * y_num = k_(3,15) * x'^15 + k_(3,14) * x'^14 + k_(3,13) * x'^13 + * ... + k_(3,0) * ... 
*/ static const vec384 isogeny_map_y_num[] = { /* (k_(3,*)<<384) % P */ { TO_LIMB_T(0x2b567ff3e2837267), TO_LIMB_T(0x1d4d9e57b958a767), TO_LIMB_T(0xce028fea04bd7373), TO_LIMB_T(0xcc31a30a0b6cd3df), TO_LIMB_T(0x7d7b18a682692693), TO_LIMB_T(0x0d300744d42a0310) }, { TO_LIMB_T(0x99c2555fa542493f), TO_LIMB_T(0xfe7f53cc4874f878), TO_LIMB_T(0x5df0608b8f97608a), TO_LIMB_T(0x14e03832052b49c8), TO_LIMB_T(0x706326a6957dd5a4), TO_LIMB_T(0x0a8dadd9c2414555) }, { TO_LIMB_T(0x13d942922a5cf63a), TO_LIMB_T(0x357e33e36e261e7d), TO_LIMB_T(0xcf05a27c8456088d), TO_LIMB_T(0x0000bd1de7ba50f0), TO_LIMB_T(0x83d0c7532f8c1fde), TO_LIMB_T(0x13f70bf38bbf2905) }, { TO_LIMB_T(0x5c57fd95bfafbdbb), TO_LIMB_T(0x28a359a65e541707), TO_LIMB_T(0x3983ceb4f6360b6d), TO_LIMB_T(0xafe19ff6f97e6d53), TO_LIMB_T(0xb3468f4550192bf7), TO_LIMB_T(0x0bb6cde49d8ba257) }, { TO_LIMB_T(0x590b62c7ff8a513f), TO_LIMB_T(0x314b4ce372cacefd), TO_LIMB_T(0x6bef32ce94b8a800), TO_LIMB_T(0x6ddf84a095713d5f), TO_LIMB_T(0x64eace4cb0982191), TO_LIMB_T(0x0386213c651b888d) }, { TO_LIMB_T(0xa5310a31111bbcdd), TO_LIMB_T(0xa14ac0f5da148982), TO_LIMB_T(0xf9ad9cc95423d2e9), TO_LIMB_T(0xaa6ec095283ee4a7), TO_LIMB_T(0xcf5b1f022e1c9107), TO_LIMB_T(0x01fddf5aed881793) }, { TO_LIMB_T(0x65a572b0d7a7d950), TO_LIMB_T(0xe25c2d8183473a19), TO_LIMB_T(0xc2fcebe7cb877dbd), TO_LIMB_T(0x05b2d36c769a89b0), TO_LIMB_T(0xba12961be86e9efb), TO_LIMB_T(0x07eb1b29c1dfde1f) }, { TO_LIMB_T(0x93e09572f7c4cd24), TO_LIMB_T(0x364e929076795091), TO_LIMB_T(0x8569467e68af51b5), TO_LIMB_T(0xa47da89439f5340f), TO_LIMB_T(0xf4fa918082e44d64), TO_LIMB_T(0x0ad52ba3e6695a79) }, { TO_LIMB_T(0x911429844e0d5f54), TO_LIMB_T(0xd03f51a3516bb233), TO_LIMB_T(0x3d587e5640536e66), TO_LIMB_T(0xfa86d2a3a9a73482), TO_LIMB_T(0xa90ed5adf1ed5537), TO_LIMB_T(0x149c9c326a5e7393) }, { TO_LIMB_T(0x462bbeb03c12921a), TO_LIMB_T(0xdc9af5fa0a274a17), TO_LIMB_T(0x9a558ebde836ebed), TO_LIMB_T(0x649ef8f11a4fae46), TO_LIMB_T(0x8100e1652b3cdc62), TO_LIMB_T(0x1862bd62c291dacb) }, { TO_LIMB_T(0x05c9b8ca89f12c26), TO_LIMB_T(0x0194160fa9b9ac4f), TO_LIMB_T(0x6a643d5a6879fa2c), TO_LIMB_T(0x14665bdd8846e19d), TO_LIMB_T(0xbb1d0d53af3ff6bf), TO_LIMB_T(0x12c7e1c3b28962e5) }, { TO_LIMB_T(0xb55ebf900b8a3e17), TO_LIMB_T(0xfedc77ec1a9201c4), TO_LIMB_T(0x1f07db10ea1a4df4), TO_LIMB_T(0x0dfbd15dc41a594d), TO_LIMB_T(0x389547f2334a5391), TO_LIMB_T(0x02419f98165871a4) }, { TO_LIMB_T(0xb416af000745fc20), TO_LIMB_T(0x8e563e9d1ea6d0f5), TO_LIMB_T(0x7c763e17763a0652), TO_LIMB_T(0x01458ef0159ebbef), TO_LIMB_T(0x8346fe421f96bb13), TO_LIMB_T(0x0d2d7b829ce324d2) }, { TO_LIMB_T(0x93096bb538d64615), TO_LIMB_T(0x6f2a2619951d823a), TO_LIMB_T(0x8f66b3ea59514fa4), TO_LIMB_T(0xf563e63704f7092f), TO_LIMB_T(0x724b136c4cf2d9fa), TO_LIMB_T(0x046959cfcfd0bf49) }, { TO_LIMB_T(0xea748d4b6e405346), TO_LIMB_T(0x91e9079c2c02d58f), TO_LIMB_T(0x41064965946d9b59), TO_LIMB_T(0xa06731f1d2bbe1ee), TO_LIMB_T(0x07f897e267a33f1b), TO_LIMB_T(0x1017290919210e5f) }, { TO_LIMB_T(0x872aa6c17d985097), TO_LIMB_T(0xeecc53161264562a), TO_LIMB_T(0x07afe37afff55002), TO_LIMB_T(0x54759078e5be6838), TO_LIMB_T(0xc4b92d15db8acca8), TO_LIMB_T(0x106d87d1b51d13b9) } }; /* ... * y_den = x'^15 + k_(4,14) * x'^14 + k_(4,13) * x'^13 + ... 
+ k_(4,0) */ static const vec384 isogeny_map_y_den[] = { /* (k_(4,*)<<384) % P */ { TO_LIMB_T(0xeb6c359d47e52b1c), TO_LIMB_T(0x18ef5f8a10634d60), TO_LIMB_T(0xddfa71a0889d5b7e), TO_LIMB_T(0x723e71dcc5fc1323), TO_LIMB_T(0x52f45700b70d5c69), TO_LIMB_T(0x0a8b981ee47691f1) }, { TO_LIMB_T(0x616a3c4f5535b9fb), TO_LIMB_T(0x6f5f037395dbd911), TO_LIMB_T(0xf25f4cc5e35c65da), TO_LIMB_T(0x3e50dffea3c62658), TO_LIMB_T(0x6a33dca523560776), TO_LIMB_T(0x0fadeff77b6bfe3e) }, { TO_LIMB_T(0x2be9b66df470059c), TO_LIMB_T(0x24a2c159a3d36742), TO_LIMB_T(0x115dbe7ad10c2a37), TO_LIMB_T(0xb6634a652ee5884d), TO_LIMB_T(0x04fe8bb2b8d81af4), TO_LIMB_T(0x01c2a7a256fe9c41) }, { TO_LIMB_T(0xf27bf8ef3b75a386), TO_LIMB_T(0x898b367476c9073f), TO_LIMB_T(0x24482e6b8c2f4e5f), TO_LIMB_T(0xc8e0bbd6fe110806), TO_LIMB_T(0x59b0c17f7631448a), TO_LIMB_T(0x11037cd58b3dbfbd) }, { TO_LIMB_T(0x31c7912ea267eec6), TO_LIMB_T(0x1dbf6f1c5fcdb700), TO_LIMB_T(0xd30d4fe3ba86fdb1), TO_LIMB_T(0x3cae528fbee9a2a4), TO_LIMB_T(0xb1cce69b6aa9ad9a), TO_LIMB_T(0x044393bb632d94fb) }, { TO_LIMB_T(0xc66ef6efeeb5c7e8), TO_LIMB_T(0x9824c289dd72bb55), TO_LIMB_T(0x71b1a4d2f119981d), TO_LIMB_T(0x104fc1aafb0919cc), TO_LIMB_T(0x0e49df01d942a628), TO_LIMB_T(0x096c3a09773272d4) }, { TO_LIMB_T(0x9abc11eb5fadeff4), TO_LIMB_T(0x32dca50a885728f0), TO_LIMB_T(0xfb1fa3721569734c), TO_LIMB_T(0xc4b76271ea6506b3), TO_LIMB_T(0xd466a75599ce728e), TO_LIMB_T(0x0c81d4645f4cb6ed) }, { TO_LIMB_T(0x4199f10e5b8be45b), TO_LIMB_T(0xda64e495b1e87930), TO_LIMB_T(0xcb353efe9b33e4ff), TO_LIMB_T(0x9e9efb24aa6424c6), TO_LIMB_T(0xf08d33680a237465), TO_LIMB_T(0x0d3378023e4c7406) }, { TO_LIMB_T(0x7eb4ae92ec74d3a5), TO_LIMB_T(0xc341b4aa9fac3497), TO_LIMB_T(0x5be603899e907687), TO_LIMB_T(0x03bfd9cca75cbdeb), TO_LIMB_T(0x564c2935a96bfa93), TO_LIMB_T(0x0ef3c33371e2fdb5) }, { TO_LIMB_T(0x7ee91fd449f6ac2e), TO_LIMB_T(0xe5d5bd5cb9357a30), TO_LIMB_T(0x773a8ca5196b1380), TO_LIMB_T(0xd0fda172174ed023), TO_LIMB_T(0x6cb95e0fa776aead), TO_LIMB_T(0x0d22d5a40cec7cff) }, { TO_LIMB_T(0xf727e09285fd8519), TO_LIMB_T(0xdc9d55a83017897b), TO_LIMB_T(0x7549d8bd057894ae), TO_LIMB_T(0x178419613d90d8f8), TO_LIMB_T(0xfce95ebdeb5b490a), TO_LIMB_T(0x0467ffaef23fc49e) }, { TO_LIMB_T(0xc1769e6a7c385f1b), TO_LIMB_T(0x79bc930deac01c03), TO_LIMB_T(0x5461c75a23ede3b5), TO_LIMB_T(0x6e20829e5c230c45), TO_LIMB_T(0x828e0f1e772a53cd), TO_LIMB_T(0x116aefa749127bff) }, { TO_LIMB_T(0x101c10bf2744c10a), TO_LIMB_T(0xbbf18d053a6a3154), TO_LIMB_T(0xa0ecf39ef026f602), TO_LIMB_T(0xfc009d4996dc5153), TO_LIMB_T(0xb9000209d5bd08d3), TO_LIMB_T(0x189e5fe4470cd73c) }, { TO_LIMB_T(0x7ebd546ca1575ed2), TO_LIMB_T(0xe47d5a981d081b55), TO_LIMB_T(0x57b2b625b6d4ca21), TO_LIMB_T(0xb0a1ba04228520cc), TO_LIMB_T(0x98738983c2107ff3), TO_LIMB_T(0x13dddbc4799d81d6) }, { TO_LIMB_T(0x09319f2e39834935), TO_LIMB_T(0x039e952cbdb05c21), TO_LIMB_T(0x55ba77a9a2f76493), TO_LIMB_T(0xfd04e3dfc6086467), TO_LIMB_T(0xfb95832e7d78742e), TO_LIMB_T(0x0ef9c24eccaf5e0e) } }; vec384 Zz_powers[15], map[15], xn, xd, yn, yd; /* lay down Z^2 powers in descending order */ sqr_fp(Zz_powers[14], p->Z); /* ZZ^1 */ #ifdef __OPTIMIZE_SIZE__ for (size_t i = 14; i > 0; i--) mul_fp(Zz_powers[i-1], Zz_powers[i], Zz_powers[14]); #else sqr_fp(Zz_powers[13], Zz_powers[14]); /* ZZ^2 1+1 */ mul_fp(Zz_powers[12], Zz_powers[14], Zz_powers[13]);/* ZZ^3 2+1 */ sqr_fp(Zz_powers[11], Zz_powers[13]); /* ZZ^4 2+2 */ mul_fp(Zz_powers[10], Zz_powers[13], Zz_powers[12]);/* ZZ^5 2+3 */ sqr_fp(Zz_powers[9], Zz_powers[12]); /* ZZ^6 3+3 */ mul_fp(Zz_powers[8], Zz_powers[12], Zz_powers[11]);/* ZZ^7 3+4 */ 
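/* in the "i+j" tallies ZZ^(i+j) is formed as ZZ^i * ZZ^j, or by squaring when i == j */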
sqr_fp(Zz_powers[7], Zz_powers[11]); /* ZZ^8 4+4 */ mul_fp(Zz_powers[6], Zz_powers[11], Zz_powers[10]);/* ZZ^9 4+5 */ sqr_fp(Zz_powers[5], Zz_powers[10]); /* ZZ^10 5+5 */ mul_fp(Zz_powers[4], Zz_powers[10], Zz_powers[9]); /* ZZ^11 5+6 */ sqr_fp(Zz_powers[3], Zz_powers[9]); /* ZZ^12 6+6 */ mul_fp(Zz_powers[2], Zz_powers[9], Zz_powers[8]); /* ZZ^13 6+7 */ sqr_fp(Zz_powers[1], Zz_powers[8]); /* ZZ^14 7+7 */ mul_fp(Zz_powers[0], Zz_powers[8], Zz_powers[7]); /* ZZ^15 7+8 */ #endif map_fp_times_Zz(map, isogeny_map_x_num, Zz_powers + 4, 11); mul_fp(xn, p->X, isogeny_map_x_num[11]); add_fp(xn, xn, map[10]); map_fp(xn, p->X, map, 10); map_fp_times_Zz(map, isogeny_map_x_den, Zz_powers + 5, 10); add_fp(xd, p->X, map[9]); map_fp(xd, p->X, map, 9); mul_fp(xd, xd, Zz_powers[14]); /* xd *= Z^2 */ map_fp_times_Zz(map, isogeny_map_y_num, Zz_powers, 15); mul_fp(yn, p->X, isogeny_map_y_num[15]); add_fp(yn, yn, map[14]); map_fp(yn, p->X, map, 14); mul_fp(yn, yn, p->Y); /* yn *= Y */ map_fp_times_Zz(map, isogeny_map_y_den, Zz_powers, 15); add_fp(yd, p->X, map[14]); map_fp(yd, p->X, map, 14); mul_fp(Zz_powers[14], Zz_powers[14], p->Z); mul_fp(yd, yd, Zz_powers[14]); /* yd *= Z^3 */ /* convert (xn, xd, yn, yd) to Jacobian coordinates */ mul_fp(out->Z, xd, yd); /* Z = xd * yd */ mul_fp(out->X, xn, yd); mul_fp(out->X, out->X, out->Z); /* X = xn * xd * yd^2 */ sqr_fp(out->Y, out->Z); mul_fp(out->Y, out->Y, xd); mul_fp(out->Y, out->Y, yn); /* Y = yn * xd^3 * yd^2 */ } static void map_to_isogenous_E1(POINTonE1 *p, const vec384 u) { static const vec384 minus_A = { /* P - A */ TO_LIMB_T(0x8a9955f1650a005a), TO_LIMB_T(0x9865b3d192cfe93c), TO_LIMB_T(0xaed3ed0f3ef3c441), TO_LIMB_T(0x3c962ef33d92c442), TO_LIMB_T(0x22e438dbd74f94a2), TO_LIMB_T(0x04acbc265478c915) }; static const vec384 Z = { /* (11<<384) % P */ TO_LIMB_T(0x886c00000023ffdc), TO_LIMB_T(0x0f70008d3090001d), TO_LIMB_T(0x77672417ed5828c3), TO_LIMB_T(0x9dac23e943dc1740), TO_LIMB_T(0x50553f1b9c131521), TO_LIMB_T(0x078c712fbe0ab6e8) }; static const vec384 sqrt_minus_ZZZ = { TO_LIMB_T(0x43b571cad3215f1f), TO_LIMB_T(0xccb460ef1c702dc2), TO_LIMB_T(0x742d884f4f97100b), TO_LIMB_T(0xdb2c3e3238a3382b), TO_LIMB_T(0xe40f3fa13fce8f88), TO_LIMB_T(0x0073a2af9892a2ff) }; static const vec384 ZxA = { TO_LIMB_T(0x7f674ea0a8915178), TO_LIMB_T(0xb0f945fc13b8fa65), TO_LIMB_T(0x4b46759a38e87d76), TO_LIMB_T(0x2e7a929641bbb6a1), TO_LIMB_T(0x1668ddfa462bf6b6), TO_LIMB_T(0x00960e2ed1cf294c) }; vec384 uu, tv2, x2n, gx1, gxd, y2; #if 0 vec384 xn, x1n, xd, y, y1, Zuu, tv4; #else # define xn p->X # define y p->Y # define xd p->Z # define x1n xn # define y1 y # define Zuu x2n # define tv4 y1 #endif #define sgn0_fp(a) (sgn0_pty_mont_384((a), BLS12_381_P, p0) & 1) bool_t e1, e2; /* * as per map_to_curve() from poc/sswu_opt.sage at * https://github.com/cfrg/draft-irtf-cfrg-hash-to-curve */ /* x numerator variants */ sqr_fp(uu, u); /* uu = u^2 */ mul_fp(Zuu, Z, uu); /* Zuu = Z * uu */ sqr_fp(tv2, Zuu); /* tv2 = Zuu^2 */ add_fp(tv2, tv2, Zuu); /* tv2 = tv2 + Zuu */ add_fp(x1n, tv2, BLS12_381_Rx.p); /* x1n = tv2 + 1 */ mul_fp(x1n, x1n, Bprime_E1); /* x1n = x1n * B */ mul_fp(x2n, Zuu, x1n); /* x2n = Zuu * x1n */ /* x denominator */ mul_fp(xd, minus_A, tv2); /* xd = -A * tv2 */ e1 = vec_is_zero(xd, sizeof(xd)); /* e1 = xd == 0 */ vec_select(xd, ZxA, xd, sizeof(xd), e1); /* # If xd == 0, set xd = Z*A */ /* y numerator variants */ sqr_fp(tv2, xd); /* tv2 = xd^2 */ mul_fp(gxd, xd, tv2); /* gxd = xd^3 */ mul_fp(tv2, Aprime_E1, tv2); /* tv2 = A * tv2 */ sqr_fp(gx1, x1n); /* gx1 = x1n^2 */ add_fp(gx1, 
gx1, tv2); /* gx1 = gx1 + tv2 # x1n^2 + A*xd^2 */ mul_fp(gx1, gx1, x1n); /* gx1 = gx1 * x1n # x1n^3 + A*x1n*xd^2 */ mul_fp(tv2, Bprime_E1, gxd); /* tv2 = B * gxd */ add_fp(gx1, gx1, tv2); /* gx1 = gx1 + tv2 # x1^3 + A*x1*xd^2 + B*xd^3 */ sqr_fp(tv4, gxd); /* tv4 = gxd^2 */ mul_fp(tv2, gx1, gxd); /* tv2 = gx1 * gxd */ mul_fp(tv4, tv4, tv2); /* tv4 = tv4 * tv2 # gx1*gxd^3 */ e2 = recip_sqrt_fp(y1, tv4); /* y1 = tv4^c1 # (gx1*gxd^3)^((p-3)/4) */ mul_fp(y1, y1, tv2); /* y1 = y1 * tv2 # gx1*gxd*y1 */ mul_fp(y2, y1, sqrt_minus_ZZZ); /* y2 = y1 * c2 # y2 = y1*sqrt(-Z^3) */ mul_fp(y2, y2, uu); /* y2 = y2 * uu */ mul_fp(y2, y2, u); /* y2 = y2 * u */ /* choose numerators */ vec_select(xn, x1n, x2n, sizeof(xn), e2); /* xn = e2 ? x1n : x2n */ vec_select(y, y1, y2, sizeof(y), e2); /* y = e2 ? y1 : y2 */ e1 = sgn0_fp(u); e2 = sgn0_fp(y); cneg_fp(y, y, e1^e2); /* fix sign of y */ /* return (xn, xd, y, 1) */ /* convert (xn, xd, y, 1) to Jacobian projective coordinates */ mul_fp(p->X, xn, xd); /* X = xn * xd */ mul_fp(p->Y, y, gxd); /* Y = y * xd^3 */ #ifndef xd vec_copy(p->Z, xd, sizeof(xd)); /* Z = xd */ #else # undef xn # undef y # undef xd # undef x1n # undef y1 # undef Zuu # undef tv4 #endif #undef sgn0_fp } static void POINTonE1_add_n_dbl(POINTonE1 *out, const POINTonE1 *p, size_t n) { POINTonE1_dadd(out, out, p, NULL); while(n--) POINTonE1_double(out, out); } static void POINTonE1_times_minus_z(POINTonE1 *out, const POINTonE1 *in) { POINTonE1_double(out, in); /* 1: 0x2 */ POINTonE1_add_n_dbl(out, in, 2); /* 2..4: 0x3..0xc */ POINTonE1_add_n_dbl(out, in, 3); /* 5..8: 0xd..0x68 */ POINTonE1_add_n_dbl(out, in, 9); /* 9..18: 0x69..0xd200 */ POINTonE1_add_n_dbl(out, in, 32); /* 19..51: ..0xd20100000000 */ POINTonE1_add_n_dbl(out, in, 16); /* 52..68: ..0xd201000000010000 */ } /* * |u|, |v| are expected to be in Montgomery representation */ static void map_to_g1(POINTonE1 *out, const vec384 u, const vec384 v) { POINTonE1 p; map_to_isogenous_E1(&p, u); if (v != NULL) { map_to_isogenous_E1(out, v); /* borrow |out| */ POINTonE1_dadd(&p, &p, out, Aprime_E1); } isogeny_map_to_E1(&p, &p); /* sprinkle isogenous powder */ /* clear the cofactor by multiplying |p| by 1-z, 0xd201000000010001 */ POINTonE1_times_minus_z(out, &p); POINTonE1_dadd(out, out, &p, NULL); } void blst_map_to_g1(POINTonE1 *out, const vec384 u, const vec384 v) { map_to_g1(out, u, v); } static void Encode_to_G1(POINTonE1 *p, const unsigned char *msg, size_t msg_len, const unsigned char *DST, size_t DST_len, const unsigned char *aug, size_t aug_len) { vec384 u[1]; hash_to_field(u, 1, aug, aug_len, msg, msg_len, DST, DST_len); map_to_g1(p, u[0], NULL); } void blst_encode_to_g1(POINTonE1 *p, const unsigned char *msg, size_t msg_len, const unsigned char *DST, size_t DST_len, const unsigned char *aug, size_t aug_len) { Encode_to_G1(p, msg, msg_len, DST, DST_len, aug, aug_len); } static void Hash_to_G1(POINTonE1 *p, const unsigned char *msg, size_t msg_len, const unsigned char *DST, size_t DST_len, const unsigned char *aug, size_t aug_len) { vec384 u[2]; hash_to_field(u, 2, aug, aug_len, msg, msg_len, DST, DST_len); map_to_g1(p, u[0], u[1]); } void blst_hash_to_g1(POINTonE1 *p, const unsigned char *msg, size_t msg_len, const unsigned char *DST, size_t DST_len, const unsigned char *aug, size_t aug_len) { Hash_to_G1(p, msg, msg_len, DST, DST_len, aug, aug_len); } static void sigma(POINTonE1 *out, const POINTonE1 *in); #if 0 #ifdef __OPTIMIZE_SIZE__ static void POINTonE1_times_zz_minus_1_div_by_3(POINTonE1 *out, const POINTonE1 *in) { static const byte 
zz_minus_1_div_by_3[] = { TO_BYTES(0x0000000055555555ULL), TO_BYTES(0x396c8c005555e156) }; size_t n = 126-1; const POINTonE1 *dblin = in; while(n--) { POINTonE1_double(out, dblin); dblin = out; if (is_bit_set(zz_minus_1_div_by_3, n)) POINTonE1_dadd(out, out, in, NULL); } } #else static void POINTonE1_dbl_n_add(POINTonE1 *out, size_t n, const POINTonE1 *p) { while(n--) POINTonE1_double(out, out); POINTonE1_dadd(out, out, p, NULL); } static void POINTonE1_times_zz_minus_1_div_by_3(POINTonE1 *out, const POINTonE1 *in) { POINTonE1 t3, t5, t7, t11, t85; POINTonE1_double(&t7, in); /* 2P */ POINTonE1_dadd(&t3, &t7, in, NULL); /* 3P */ POINTonE1_dadd(&t5, &t3, &t7, NULL); /* 5P */ POINTonE1_dadd(&t7, &t5, &t7, NULL); /* 7P */ POINTonE1_double(&t85, &t5); /* 10P */ POINTonE1_dadd(&t11, &t85, in, NULL); /* 11P */ POINTonE1_dbl_n_add(&t85, 3, &t5); /* 0x55P */ /* (-0xd201000000010000^2 - 1) / 3 */ POINTonE1_double(out, &t7); /* 0xe */ POINTonE1_dbl_n_add(out, 5, &t11); /* 0x1cb */ POINTonE1_dbl_n_add(out, 3, &t3); /* 0xe5b */ POINTonE1_dbl_n_add(out, 3, in); /* 0x72d9 */ POINTonE1_dbl_n_add(out, 5, &t3); /* 0xe5b23 */ POINTonE1_dbl_n_add(out, 18, &t85); /* 0x396c8c0055 */ POINTonE1_dbl_n_add(out, 8, &t85); /* 0x396c8c005555 */ POINTonE1_dbl_n_add(out, 3, &t7); /* 0x1cb646002aaaf */ POINTonE1_dbl_n_add(out, 7, &t5); /* 0xe5b23001555785 */ POINTonE1_dbl_n_add(out, 5, &t11); /* 0x1cb646002aaaf0ab */ POINTonE1_dbl_n_add(out, 41, &t85); /* 0x396c8c005555e1560000000055 */ POINTonE1_dbl_n_add(out, 8, &t85); /* 0x396c8c005555e156000000005555 */ POINTonE1_dbl_n_add(out, 8, &t85); /* 0x396c8c005555e15600000000555555 */ POINTonE1_dbl_n_add(out, 8, &t85); /* 0x396c8c005555e1560000000055555555 */ } #endif static bool_t POINTonE1_in_G1(const POINTonE1 *P) { POINTonE1 t0, t1, t2; /* Bowe, S., "Faster subgroup checks for BLS12-381" */ sigma(&t0, P); /* σ(P) */ sigma(&t1, &t0); /* σ²(P) */ POINTonE1_double(&t0, &t0); /* 2σ(P) */ POINTonE1_dadd(&t2, &t1, P, NULL); /* P + σ²(P) */ POINTonE1_cneg(&t2, 1); /* - P - σ²(P) */ POINTonE1_dadd(&t2, &t2, &t0, NULL); /* 2σ(P) - P - σ²(P) */ POINTonE1_times_zz_minus_1_div_by_3( &t0, &t2); POINTonE1_cneg(&t1, 1); POINTonE1_dadd(&t0, &t0, &t1, NULL); /* [(z²-1)/3](2σ(P) - P - σ²(P)) */ /* - σ²(P) */ return vec_is_zero(t0.Z, sizeof(t0.Z)); } #else static bool_t POINTonE1_in_G1(const POINTonE1 *P) { POINTonE1 t0, t1; /* Scott, M., https://eprint.iacr.org/2021/1130 */ POINTonE1_times_minus_z(&t0, P); POINTonE1_times_minus_z(&t1, &t0); POINTonE1_cneg(&t1, 1); /* [-z²]P */ sigma(&t0, P); /* σ(P) */ sigma(&t0, &t0); /* σ²(P) */ return POINTonE1_is_equal(&t0, &t1); } #endif int blst_p1_in_g1(const POINTonE1 *p) { return (int)POINTonE1_in_G1(p); } int blst_p1_affine_in_g1(const POINTonE1_affine *p) { POINTonE1 P; vec_copy(P.X, p->X, 2*sizeof(P.X)); vec_select(P.Z, p->X, BLS12_381_Rx.p, sizeof(P.Z), vec_is_zero(p, sizeof(*p))); return (int)POINTonE1_in_G1(&P); } ================================================ FILE: src/map_to_g2.c ================================================ /* * Copyright Supranational LLC * Licensed under the Apache License, Version 2.0, see LICENSE for details. 
* SPDX-License-Identifier: Apache-2.0 */ #include "point.h" #include "fields.h" /* * y^2 = x^3 + A'*x + B', isogenous one */ static const vec384x Aprime_E2 = { /* 240*i */ { 0 }, { TO_LIMB_T(0xe53a000003135242), TO_LIMB_T(0x01080c0fdef80285), TO_LIMB_T(0xe7889edbe340f6bd), TO_LIMB_T(0x0b51375126310601), TO_LIMB_T(0x02d6985717c744ab), TO_LIMB_T(0x1220b4e979ea5467) } }; static const vec384x Bprime_E2 = { /* 1012 + 1012*i */ { TO_LIMB_T(0x22ea00000cf89db2), TO_LIMB_T(0x6ec832df71380aa4), TO_LIMB_T(0x6e1b94403db5a66e), TO_LIMB_T(0x75bf3c53a79473ba), TO_LIMB_T(0x3dd3a569412c0a34), TO_LIMB_T(0x125cdb5e74dc4fd1) }, { TO_LIMB_T(0x22ea00000cf89db2), TO_LIMB_T(0x6ec832df71380aa4), TO_LIMB_T(0x6e1b94403db5a66e), TO_LIMB_T(0x75bf3c53a79473ba), TO_LIMB_T(0x3dd3a569412c0a34), TO_LIMB_T(0x125cdb5e74dc4fd1) } }; static void map_fp2_times_Zz(vec384x map[], const vec384x isogeny_map[], const vec384x Zz_powers[], size_t n) { while (n--) mul_fp2(map[n], isogeny_map[n], Zz_powers[n]); } static void map_fp2(vec384x acc, const vec384x x, const vec384x map[], size_t n) { while (n--) { mul_fp2(acc, acc, x); add_fp2(acc, acc, map[n]); } } static void isogeny_map_to_E2(POINTonE2 *out, const POINTonE2 *p) { /* * x = x_num / x_den, where * x_num = k_(1,3) * x'^3 + k_(1,2) * x'^2 + k_(1,1) * x' + k_(1,0) * ... */ static const vec384x isogeny_map_x_num[] = { /* (k_(1,*)<<384) % P */ {{ TO_LIMB_T(0x47f671c71ce05e62), TO_LIMB_T(0x06dd57071206393e), TO_LIMB_T(0x7c80cd2af3fd71a2), TO_LIMB_T(0x048103ea9e6cd062), TO_LIMB_T(0xc54516acc8d037f6), TO_LIMB_T(0x13808f550920ea41) }, { TO_LIMB_T(0x47f671c71ce05e62), TO_LIMB_T(0x06dd57071206393e), TO_LIMB_T(0x7c80cd2af3fd71a2), TO_LIMB_T(0x048103ea9e6cd062), TO_LIMB_T(0xc54516acc8d037f6), TO_LIMB_T(0x13808f550920ea41) }}, {{ 0 }, { TO_LIMB_T(0x5fe55555554c71d0), TO_LIMB_T(0x873fffdd236aaaa3), TO_LIMB_T(0x6a6b4619b26ef918), TO_LIMB_T(0x21c2888408874945), TO_LIMB_T(0x2836cda7028cabc5), TO_LIMB_T(0x0ac73310a7fd5abd) }}, {{ TO_LIMB_T(0x0a0c5555555971c3), TO_LIMB_T(0xdb0c00101f9eaaae), TO_LIMB_T(0xb1fb2f941d797997), TO_LIMB_T(0xd3960742ef416e1c), TO_LIMB_T(0xb70040e2c20556f4), TO_LIMB_T(0x149d7861e581393b) }, { TO_LIMB_T(0xaff2aaaaaaa638e8), TO_LIMB_T(0x439fffee91b55551), TO_LIMB_T(0xb535a30cd9377c8c), TO_LIMB_T(0x90e144420443a4a2), TO_LIMB_T(0x941b66d3814655e2), TO_LIMB_T(0x0563998853fead5e) }}, {{ TO_LIMB_T(0x40aac71c71c725ed), TO_LIMB_T(0x190955557a84e38e), TO_LIMB_T(0xd817050a8f41abc3), TO_LIMB_T(0xd86485d4c87f6fb1), TO_LIMB_T(0x696eb479f885d059), TO_LIMB_T(0x198e1a74328002d2) }, { 0 }} }; /* ... * x_den = x'^2 + k_(2,1) * x' + k_(2,0) */ static const vec384x isogeny_map_x_den[] = { /* (k_(2,*)<<384) % P */ {{ 0 }, { TO_LIMB_T(0x1f3affffff13ab97), TO_LIMB_T(0xf25bfc611da3ff3e), TO_LIMB_T(0xca3757cb3819b208), TO_LIMB_T(0x3e6427366f8cec18), TO_LIMB_T(0x03977bc86095b089), TO_LIMB_T(0x04f69db13f39a952) }}, {{ TO_LIMB_T(0x447600000027552e), TO_LIMB_T(0xdcb8009a43480020), TO_LIMB_T(0x6f7ee9ce4a6e8b59), TO_LIMB_T(0xb10330b7c0a95bc6), TO_LIMB_T(0x6140b1fcfb1e54b7), TO_LIMB_T(0x0381be097f0bb4e1) }, { TO_LIMB_T(0x7588ffffffd8557d), TO_LIMB_T(0x41f3ff646e0bffdf), TO_LIMB_T(0xf7b1e8d2ac426aca), TO_LIMB_T(0xb3741acd32dbb6f8), TO_LIMB_T(0xe9daf5b9482d581f), TO_LIMB_T(0x167f53e0ba7431b8) }} }; /* * y = y' * y_num / y_den, where * y_num = k_(3,3) * x'^3 + k_(3,2) * x'^2 + k_(3,1) * x' + k_(3,0) * ... 
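 * (the 3-isogeny used for E2 is much smaller than the 11-isogeny on E1:
 * its numerators and denominators have degree at most 3, so only three
 * Z^2 powers need to be laid down below)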
*/ static const vec384x isogeny_map_y_num[] = { /* (k_(3,*)<<384) % P */ {{ TO_LIMB_T(0x96d8f684bdfc77be), TO_LIMB_T(0xb530e4f43b66d0e2), TO_LIMB_T(0x184a88ff379652fd), TO_LIMB_T(0x57cb23ecfae804e1), TO_LIMB_T(0x0fd2e39eada3eba9), TO_LIMB_T(0x08c8055e31c5d5c3) }, { TO_LIMB_T(0x96d8f684bdfc77be), TO_LIMB_T(0xb530e4f43b66d0e2), TO_LIMB_T(0x184a88ff379652fd), TO_LIMB_T(0x57cb23ecfae804e1), TO_LIMB_T(0x0fd2e39eada3eba9), TO_LIMB_T(0x08c8055e31c5d5c3) }}, {{ 0 }, { TO_LIMB_T(0xbf0a71c71c91b406), TO_LIMB_T(0x4d6d55d28b7638fd), TO_LIMB_T(0x9d82f98e5f205aee), TO_LIMB_T(0xa27aa27b1d1a18d5), TO_LIMB_T(0x02c3b2b2d2938e86), TO_LIMB_T(0x0c7d13420b09807f) }}, {{ TO_LIMB_T(0xd7f9555555531c74), TO_LIMB_T(0x21cffff748daaaa8), TO_LIMB_T(0x5a9ad1866c9bbe46), TO_LIMB_T(0x4870a2210221d251), TO_LIMB_T(0x4a0db369c0a32af1), TO_LIMB_T(0x02b1ccc429ff56af) }, { TO_LIMB_T(0xe205aaaaaaac8e37), TO_LIMB_T(0xfcdc000768795556), TO_LIMB_T(0x0c96011a8a1537dd), TO_LIMB_T(0x1c06a963f163406e), TO_LIMB_T(0x010df44c82a881e6), TO_LIMB_T(0x174f45260f808feb) }}, {{ TO_LIMB_T(0xa470bda12f67f35c), TO_LIMB_T(0xc0fe38e23327b425), TO_LIMB_T(0xc9d3d0f2c6f0678d), TO_LIMB_T(0x1c55c9935b5a982e), TO_LIMB_T(0x27f6c0e2f0746764), TO_LIMB_T(0x117c5e6e28aa9054) }, { 0 }} }; /* ... * y_den = x'^3 + k_(4,2) * x'^2 + k_(4,1) * x' + k_(4,0) */ static const vec384x isogeny_map_y_den[] = { /* (k_(4,*)<<384) % P */ {{ TO_LIMB_T(0x0162fffffa765adf), TO_LIMB_T(0x8f7bea480083fb75), TO_LIMB_T(0x561b3c2259e93611), TO_LIMB_T(0x11e19fc1a9c875d5), TO_LIMB_T(0xca713efc00367660), TO_LIMB_T(0x03c6a03d41da1151) }, { TO_LIMB_T(0x0162fffffa765adf), TO_LIMB_T(0x8f7bea480083fb75), TO_LIMB_T(0x561b3c2259e93611), TO_LIMB_T(0x11e19fc1a9c875d5), TO_LIMB_T(0xca713efc00367660), TO_LIMB_T(0x03c6a03d41da1151) }}, {{ 0 }, { TO_LIMB_T(0x5db0fffffd3b02c5), TO_LIMB_T(0xd713f52358ebfdba), TO_LIMB_T(0x5ea60761a84d161a), TO_LIMB_T(0xbb2c75a34ea6c44a), TO_LIMB_T(0x0ac6735921c1119b), TO_LIMB_T(0x0ee3d913bdacfbf6) }}, {{ TO_LIMB_T(0x66b10000003affc5), TO_LIMB_T(0xcb1400e764ec0030), TO_LIMB_T(0xa73e5eb56fa5d106), TO_LIMB_T(0x8984c913a0fe09a9), TO_LIMB_T(0x11e10afb78ad7f13), TO_LIMB_T(0x05429d0e3e918f52) }, { TO_LIMB_T(0x534dffffffc4aae6), TO_LIMB_T(0x5397ff174c67ffcf), TO_LIMB_T(0xbff273eb870b251d), TO_LIMB_T(0xdaf2827152870915), TO_LIMB_T(0x393a9cbaca9e2dc3), TO_LIMB_T(0x14be74dbfaee5748) }} }; vec384x Zz_powers[3], map[3], xn, xd, yn, yd; /* lay down Z^2 powers in descending order */ sqr_fp2(Zz_powers[2], p->Z); /* ZZ^1 */ sqr_fp2(Zz_powers[1], Zz_powers[2]); /* ZZ^2 1+1 */ mul_fp2(Zz_powers[0], Zz_powers[2], Zz_powers[1]); /* ZZ^3 2+1 */ map_fp2_times_Zz(map, isogeny_map_x_num, Zz_powers, 3); mul_fp2(xn, p->X, isogeny_map_x_num[3]); add_fp2(xn, xn, map[2]); map_fp2(xn, p->X, map, 2); map_fp2_times_Zz(map, isogeny_map_x_den, Zz_powers + 1, 2); add_fp2(xd, p->X, map[1]); map_fp2(xd, p->X, map, 1); mul_fp2(xd, xd, Zz_powers[2]); /* xd *= Z^2 */ map_fp2_times_Zz(map, isogeny_map_y_num, Zz_powers, 3); mul_fp2(yn, p->X, isogeny_map_y_num[3]); add_fp2(yn, yn, map[2]); map_fp2(yn, p->X, map, 2); mul_fp2(yn, yn, p->Y); /* yn *= Y */ map_fp2_times_Zz(map, isogeny_map_y_den, Zz_powers, 3); add_fp2(yd, p->X, map[2]); map_fp2(yd, p->X, map, 2); mul_fp2(Zz_powers[2], Zz_powers[2], p->Z); mul_fp2(yd, yd, Zz_powers[2]); /* yd *= Z^3 */ /* convert (xn, xd, yn, yd) to Jacobian coordinates */ mul_fp2(out->Z, xd, yd); /* Z = xd * yd */ mul_fp2(out->X, xn, yd); mul_fp2(out->X, out->X, out->Z); /* X = xn * xd * yd^2 */ sqr_fp2(out->Y, out->Z); mul_fp2(out->Y, out->Y, xd); mul_fp2(out->Y, out->Y, yn); /* Y 
= yn * xd^3 * yd^2 */ } static void map_to_isogenous_E2(POINTonE2 *p, const vec384x u) { static const vec384x minus_A = { { 0 }, { TO_LIMB_T(0xd4c4fffffcec5869), TO_LIMB_T(0x1da3f3eed25bfd79), TO_LIMB_T(0x7fa833c5136fff67), TO_LIMB_T(0x59261433cd540cbd), TO_LIMB_T(0x48450f5f2b84682c), TO_LIMB_T(0x07e05d00bf959233) } }; static const vec384x Z = { /* -2 - i */ { TO_LIMB_T(0x87ebfffffff9555c), TO_LIMB_T(0x656fffe5da8ffffa), TO_LIMB_T(0x0fd0749345d33ad2), TO_LIMB_T(0xd951e663066576f4), TO_LIMB_T(0xde291a3d41e980d3), TO_LIMB_T(0x0815664c7dfe040d) }, { TO_LIMB_T(0x43f5fffffffcaaae), TO_LIMB_T(0x32b7fff2ed47fffd), TO_LIMB_T(0x07e83a49a2e99d69), TO_LIMB_T(0xeca8f3318332bb7a), TO_LIMB_T(0xef148d1ea0f4c069), TO_LIMB_T(0x040ab3263eff0206) } }; static const vec384x recip_ZZZ = { /* 1/(Z^3) */ { TO_LIMB_T(0x65018f5c28f598eb), TO_LIMB_T(0xe6020417f022d916), TO_LIMB_T(0xd6327313288369c7), TO_LIMB_T(0x622ded8eb447156f), TO_LIMB_T(0xe52a2aee72c2a01f), TO_LIMB_T(0x089812fb8481ffe4) }, { TO_LIMB_T(0x2574eb851eb8619f), TO_LIMB_T(0xdba2e97912925604), TO_LIMB_T(0x67e495a909e7a18e), TO_LIMB_T(0xdf2da23b8145b8f7), TO_LIMB_T(0xcf5d3728310ebf6d), TO_LIMB_T(0x11be446236f4c116) } }; static const vec384x magic_ZZZ = { /* 1/Z^3 = a + b*i */ /* a^2 + b^2 */ { TO_LIMB_T(0xaa7eb851eb8508e0), TO_LIMB_T(0x1c54fdf360989374), TO_LIMB_T(0xc87f2fc6e716c62e), TO_LIMB_T(0x0124aefb1f9efea7), TO_LIMB_T(0xb2f8be63e844865c), TO_LIMB_T(0x08b47f775a7ef35a) }, /* (a^2 + b^2)^((P-3)/4) */ { TO_LIMB_T(0xe4132bbd838cf70a), TO_LIMB_T(0x01d769ac83772c19), TO_LIMB_T(0xa83dd6e974c22e45), TO_LIMB_T(0xbc8ec3e777b08dff), TO_LIMB_T(0xc035c2042ecf5da3), TO_LIMB_T(0x073929e97f0850bf) } }; static const vec384x ZxA = { /* 240 - 480*i */ { TO_LIMB_T(0xe53a000003135242), TO_LIMB_T(0x01080c0fdef80285), TO_LIMB_T(0xe7889edbe340f6bd), TO_LIMB_T(0x0b51375126310601), TO_LIMB_T(0x02d6985717c744ab), TO_LIMB_T(0x1220b4e979ea5467) }, { TO_LIMB_T(0xa989fffff9d8b0d2), TO_LIMB_T(0x3b47e7dda4b7faf3), TO_LIMB_T(0xff50678a26dffece), TO_LIMB_T(0xb24c28679aa8197a), TO_LIMB_T(0x908a1ebe5708d058), TO_LIMB_T(0x0fc0ba017f2b2466) } }; vec384x uu, tv2, tv4, x2n, gx1, gxd, y2; #if 0 vec384x xn, x1n, xd, y, y1, Zuu; #else # define xn p->X # define y p->Y # define xd p->Z # define x1n xn # define y1 y # define Zuu x2n #endif #define sgn0_fp2(a) (sgn0_pty_mont_384x((a), BLS12_381_P, p0) & 1) bool_t e1, e2; /* * as per map_to_curve() from poc/sswu_opt.sage at * https://github.com/cfrg/draft-irtf-cfrg-hash-to-curve * with 9mod16 twists... 
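 * ("9mod16" refers to p^2 ≡ 9 (mod 16) for BLS12-381: the square-root
 * candidate in recip_sqrt_fp2 comes from a single exponentiation by
 * (p^2-9)/16, with the recip_ZZZ/magic_ZZZ constants above used to
 * adjust the result toward the right root)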
 */
    /* x numerator variants */
    sqr_fp2(uu, u);                     /* uu = u^2 */
    mul_fp2(Zuu, Z, uu);                /* Zuu = Z * uu */
    sqr_fp2(tv2, Zuu);                  /* tv2 = Zuu^2 */
    add_fp2(tv2, tv2, Zuu);             /* tv2 = tv2 + Zuu */
    add_fp2(x1n, tv2, BLS12_381_Rx.p2); /* x1n = tv2 + 1 */
    mul_fp2(x1n, x1n, Bprime_E2);       /* x1n = x1n * B */
    mul_fp2(x2n, Zuu, x1n);             /* x2n = Zuu * x1n */

    /* x denominator */
    mul_fp2(xd, minus_A, tv2);          /* xd = -A * tv2 */
    e1 = vec_is_zero(xd, sizeof(xd));   /* e1 = xd == 0 */
    vec_select(xd, ZxA, xd, sizeof(xd), e1);    /* # If xd == 0, set xd = Z*A */

    /* y numerator variants */
    sqr_fp2(tv2, xd);                   /* tv2 = xd^2 */
    mul_fp2(gxd, xd, tv2);              /* gxd = xd^3 */
    mul_fp2(tv2, Aprime_E2, tv2);       /* tv2 = A * tv2 */
    sqr_fp2(gx1, x1n);                  /* gx1 = x1n^2 */
    add_fp2(gx1, gx1, tv2);             /* gx1 = gx1 + tv2 # x1n^2 + A*xd^2 */
    mul_fp2(gx1, gx1, x1n);             /* gx1 = gx1 * x1n # x1n^3 + A*x1n*xd^2 */
    mul_fp2(tv2, Bprime_E2, gxd);       /* tv2 = B * gxd */
    add_fp2(gx1, gx1, tv2);             /* gx1 = gx1 + tv2 # x1^3 + A*x1*xd^2 + B*xd^3 */
    sqr_fp2(tv4, gxd);                  /* tv4 = gxd^2 */
    mul_fp2(tv2, gx1, gxd);             /* tv2 = gx1 * gxd */
    mul_fp2(tv4, tv4, tv2);             /* tv4 = tv4 * tv2 # gx1*gxd^3 */
    e2 = recip_sqrt_fp2(y1, tv4,        /* y1 = tv4^c1 # (gx1*gxd^3)^((p^2-9)/16) */
                        recip_ZZZ, magic_ZZZ);
    mul_fp2(y1, y1, tv2);               /* y1 = y1 * tv2 # gx1*gxd*y1 */
    mul_fp2(y2, y1, uu);                /* y2 = y1 * uu */
    mul_fp2(y2, y2, u);                 /* y2 = y2 * u */

    /* choose numerators */
    vec_select(xn, x1n, x2n, sizeof(xn), e2);   /* xn = e2 ? x1n : x2n */
    vec_select(y, y1, y2, sizeof(y), e2);       /* y = e2 ? y1 : y2 */

    e1 = sgn0_fp2(u);
    e2 = sgn0_fp2(y);
    cneg_fp2(y, y, e1^e2);              /* fix sign of y */
                                        /* return (xn, xd, y, 1) */

    /* convert (xn, xd, y, 1) to Jacobian projective coordinates */
    mul_fp2(p->X, xn, xd);              /* X = xn * xd */
    mul_fp2(p->Y, y, gxd);              /* Y = y * xd^3 */
#ifndef xd
    vec_copy(p->Z, xd, sizeof(xd));     /* Z = xd */
#else
# undef xn
# undef y
# undef xd
# undef x1n
# undef y1
# undef Zuu
# undef tv4
#endif
#undef sgn0_fp2
}

#if 0
static const byte h_eff[] = {
    TO_BYTES(0xe8020005aaa95551), TO_BYTES(0x59894c0adebbf6b4),
    TO_BYTES(0xe954cbc06689f6a3), TO_BYTES(0x2ec0ec69d7477c1a),
    TO_BYTES(0x6d82bf015d1212b0), TO_BYTES(0x329c2f178731db95),
    TO_BYTES(0x9986ff031508ffe1), TO_BYTES(0x88e2a8e9145ad768),
    TO_BYTES(0x584c6a0ea91b3528), TO_BYTES(0x0bc69f08f2ee75b3)
};

static void clear_cofactor(POINTonE2 *out, const POINTonE2 *p)
{   POINTonE2_mult_w5(out, p, h_eff, 636);   }
#else
/*
 * As per suggestions in "7.
Clearing the cofactor" at * https://tools.ietf.org/html/draft-irtf-cfrg-hash-to-curve-06 */ static void POINTonE2_add_n_dbl(POINTonE2 *out, const POINTonE2 *p, size_t n) { POINTonE2_dadd(out, out, p, NULL); while(n--) POINTonE2_double(out, out); } static void POINTonE2_times_minus_z(POINTonE2 *out, const POINTonE2 *in) { POINTonE2_double(out, in); /* 1: 0x2 */ POINTonE2_add_n_dbl(out, in, 2); /* 2..4: 0x3..0xc */ POINTonE2_add_n_dbl(out, in, 3); /* 5..8: 0xd..0x68 */ POINTonE2_add_n_dbl(out, in, 9); /* 9..18: 0x69..0xd200 */ POINTonE2_add_n_dbl(out, in, 32); /* 19..51: ..0xd20100000000 */ POINTonE2_add_n_dbl(out, in, 16); /* 52..68: ..0xd201000000010000 */ } static void psi(POINTonE2 *out, const POINTonE2 *in); static void clear_cofactor(POINTonE2 *out, const POINTonE2 *p) { POINTonE2 t0, t1; /* A.Budroni, F.Pintore, "Efficient hash maps to G2 on BLS curves" */ POINTonE2_double(out, p); /* out = 2P */ psi(out, out); /* out = Ψ(2P) */ psi(out, out); /* out = Ψ²(2P) */ vec_copy(&t0, p, sizeof(t0)); POINTonE2_cneg(&t0, 1); /* t0 = -P */ psi(&t1, &t0); /* t1 = -Ψ(P) */ POINTonE2_dadd(out, out, &t0, NULL);/* out = Ψ²(2P) - P */ POINTonE2_dadd(out, out, &t1, NULL);/* out = Ψ²(2P) - P - Ψ(P) */ POINTonE2_times_minus_z(&t0, p); /* t0 = [-z]P */ POINTonE2_dadd(&t0, &t0, p, NULL); /* t0 = [-z + 1]P */ POINTonE2_dadd(&t0, &t0, &t1, NULL);/* t0 = [-z + 1]P - Ψ(P) */ POINTonE2_times_minus_z(&t1, &t0); /* t1 = [z² - z]P + [z]Ψ(P) */ POINTonE2_dadd(out, out, &t1, NULL);/* out = [z² - z - 1]P */ /* + [z - 1]Ψ(P) */ /* + Ψ²(2P) */ } #endif /* * |u|, |v| are expected to be in Montgomery representation */ static void map_to_g2(POINTonE2 *out, const vec384x u, const vec384x v) { POINTonE2 p; map_to_isogenous_E2(&p, u); if (v != NULL) { map_to_isogenous_E2(out, v); /* borrow |out| */ POINTonE2_dadd(&p, &p, out, Aprime_E2); } isogeny_map_to_E2(&p, &p); /* sprinkle isogenous powder */ clear_cofactor(out, &p); } void blst_map_to_g2(POINTonE2 *out, const vec384x u, const vec384x v) { map_to_g2(out, u, v); } static void Encode_to_G2(POINTonE2 *p, const unsigned char *msg, size_t msg_len, const unsigned char *DST, size_t DST_len, const unsigned char *aug, size_t aug_len) { vec384x u[1]; hash_to_field(u[0], 2, aug, aug_len, msg, msg_len, DST, DST_len); map_to_g2(p, u[0], NULL); } void blst_encode_to_g2(POINTonE2 *p, const unsigned char *msg, size_t msg_len, const unsigned char *DST, size_t DST_len, const unsigned char *aug, size_t aug_len) { Encode_to_G2(p, msg, msg_len, DST, DST_len, aug, aug_len); } static void Hash_to_G2(POINTonE2 *p, const unsigned char *msg, size_t msg_len, const unsigned char *DST, size_t DST_len, const unsigned char *aug, size_t aug_len) { vec384x u[2]; hash_to_field(u[0], 4, aug, aug_len, msg, msg_len, DST, DST_len); map_to_g2(p, u[0], u[1]); } void blst_hash_to_g2(POINTonE2 *p, const unsigned char *msg, size_t msg_len, const unsigned char *DST, size_t DST_len, const unsigned char *aug, size_t aug_len) { Hash_to_G2(p, msg, msg_len, DST, DST_len, aug, aug_len); } static bool_t POINTonE2_in_G2(const POINTonE2 *P) { #if 0 POINTonE2 t0, t1, t2; /* Bowe, S., "Faster subgroup checks for BLS12-381" */ psi(&t0, P); /* Ψ(P) */ psi(&t0, &t0); /* Ψ²(P) */ psi(&t1, &t0); /* Ψ³(P) */ POINTonE2_times_minus_z(&t2, &t1); POINTonE2_dadd(&t0, &t0, &t2, NULL); POINTonE2_cneg(&t0, 1); POINTonE2_dadd(&t0, &t0, P, NULL); /* [z]Ψ³(P) - Ψ²(P) + P */ return vec_is_zero(t0.Z, sizeof(t0.Z)); #else POINTonE2 t0, t1; /* Scott, M., https://eprint.iacr.org/2021/1130 */ psi(&t0, P); /* Ψ(P) */ POINTonE2_times_minus_z(&t1, 
P);
    POINTonE2_cneg(&t1, 1);             /* [z]P */

    return POINTonE2_is_equal(&t0, &t1);
#endif
}

int blst_p2_in_g2(const POINTonE2 *p)
{   return (int)POINTonE2_in_G2(p);   }

int blst_p2_affine_in_g2(const POINTonE2_affine *p)
{
    POINTonE2 P;

    vec_copy(P.X, p->X, 2*sizeof(P.X));
    vec_select(P.Z, p->X, BLS12_381_Rx.p, sizeof(P.Z),
                    vec_is_zero(p, sizeof(*p)));

    return (int)POINTonE2_in_G2(&P);
}

================================================
FILE: src/multi_scalar.c
================================================
/*
 * Copyright Supranational LLC
 * Licensed under the Apache License, Version 2.0, see LICENSE for details.
 * SPDX-License-Identifier: Apache-2.0
 */

#include "fields.h"
#include "point.h"

#define POINTS_TO_AFFINE_IMPL(prefix, ptype, bits, field) \
static void ptype##s_to_affine(ptype##_affine dst[], \
                               const ptype *const points[], size_t npoints) \
{ \
    size_t i; \
    vec##bits *acc, ZZ, ZZZ; \
    const ptype *point = NULL; \
    const size_t stride = sizeof(ptype)==sizeof(POINTonE1) ? 1536 : 768; \
\
    while (npoints) { \
        const ptype *p, *const *walkback; \
        size_t delta = stride<npoints ? stride : npoints; \
\
        point = *points ? *points++ : point+1; \
        acc = (vec##bits *)dst; \
        vec_select(acc++[0], BLS12_381_Rx.p, point->Z, sizeof(vec##bits), \
                             vec_is_zero(point->Z, sizeof(point->Z))); \
        for (i = 1; i < delta; i++, acc++) { \
            point = *points ? *points++ : point+1; \
            vec_select(acc[0], BLS12_381_Rx.p, point->Z, sizeof(vec##bits), \
                               vec_is_zero(point->Z, sizeof(point->Z))); \
            mul_##field(acc[0], acc[0], acc[-1]); \
        } \
\
        --acc; reciprocal_##field(acc[0], acc[0]); \
\
        walkback = points-1, p = point, --delta, dst += delta; \
        for (i = 0; i < delta; i++, acc--, dst--) { \
            bool_t is_inf = vec_is_zero(p->Z, sizeof(p->Z)); \
            mul_##field(acc[-1], acc[-1], acc[0]);      /* 1/Z   */\
            sqr_##field(ZZ, acc[-1]);                   /* 1/Z^2 */\
            mul_##field(ZZZ, ZZ, acc[-1]);              /* 1/Z^3 */\
            vec_select(acc[-1], BLS12_381_Rx.p, p->Z, sizeof(vec##bits), \
                                is_inf); \
            mul_##field(acc[-1], acc[-1], acc[0]); \
            mul_##field(dst->X, p->X, ZZ);              /* X = X'/Z^2 */\
            mul_##field(dst->Y, p->Y, ZZZ);             /* Y = Y'/Z^3 */\
            vec_czero(dst, sizeof(*dst), is_inf); \
            p = (p == *walkback) ? *--walkback : p-1; \
        } \
        sqr_##field(ZZ, acc[0]);                        /* 1/Z^2 */\
        mul_##field(ZZZ, ZZ, acc[0]);                   /* 1/Z^3 */\
        mul_##field(dst->X, p->X, ZZ);                  /* X = X'/Z^2 */\
        mul_##field(dst->Y, p->Y, ZZZ);                 /* Y = Y'/Z^3 */\
        vec_czero(dst, sizeof(*dst), vec_is_zero(p->Z, sizeof(p->Z))); \
        ++delta, dst += delta, npoints -= delta; \
    } \
} \
\
void prefix##s_to_affine(ptype##_affine dst[], const ptype *const points[], \
                         size_t npoints) \
{   ptype##s_to_affine(dst, points, npoints);   }

POINTS_TO_AFFINE_IMPL(blst_p1, POINTonE1, 384, fp)
POINTS_TO_AFFINE_IMPL(blst_p2, POINTonE2, 384x, fp2)

/*
 * This is a two-step multi-scalar multiplication procedure. First, given
 * a set of points you pre-compute a table for a chosen windowing factor
 * [expressed in bits with a value between 2 and 14], and then you pass
 * this table to the actual multiplication procedure along with scalars.
 * The idea is that the pre-computed table will be reused multiple times.
 * In that case multiplication runs faster than the Pippenger algorithm
 * implementation below for up to ~16K points for wbits=8, naturally at
 * the expense of a multi-megabyte table. One can trade even more memory
 * for performance, but each wbits increment doubles the memory
 * requirement, so at some point it gets prohibitively large... For
 * reference, without reusing the table it's faster than the Pippenger
 * algorithm for up to ~32 points [with wbits=5]...
 */

#define SCRATCH_SZ(ptype) (sizeof(ptype)==sizeof(POINTonE1) ? 8192 : 4096)

/* The intermediate infinity points are encoded as [0, 0, 1]. */
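/*
 * That is, Z is kept at one (BLS12_381_Rx.p in Montgomery form) while X
 * and Y stay zero: the running products in the batch inversion of
 * ptype##s_to_affine_row_wbits never vanish, yet such points still
 * convert to the all-zero affine encoding of infinity.
 */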
*/ #define PRECOMPUTE_WBITS_IMPL(prefix, ptype, bits, field, one) \ static void ptype##_precompute_row(ptype row[], size_t n, \ const ptype##_affine *point) \ { \ size_t i, j; \ bool_t inf = vec_is_zero(point, sizeof(*point)); \ /* row[-1] is implicit infinity */\ vec_copy(&row[0], point, sizeof(*point)); /* row[0]=p*1 */\ vec_copy(&row[0].Z, one, sizeof(row[0].Z)); \ ptype##_double(&row[1], &row[0]); /* row[1]=p*(1+1) */\ vec_select(&row[1].Z, one, &row[1].Z, sizeof(row[1].Z), inf); \ for (i = 2, j = 1; i < n; i += 2, j++) \ ptype##_add_affine(&row[i], &row[i-1], point), /* row[2]=p*(2+1) */\ ptype##_double(&row[i+1], &row[j]), /* row[3]=p*(2+2) */\ vec_select(&row[i+1].Z, one, &row[i+1].Z, sizeof(row[i+1].Z), inf); \ } /* row[4] ... */\ \ static void ptype##s_to_affine_row_wbits(ptype##_affine dst[], ptype src[], \ size_t wbits, size_t npoints) \ { \ size_t total = npoints << (wbits-1); \ size_t nwin = (size_t)1 << (wbits-1); \ size_t i, j; \ vec##bits *acc, ZZ, ZZZ; \ \ src += total; \ acc = (vec##bits *)src; \ vec_copy(acc++, one, sizeof(vec##bits)); \ for (i = 0; i < npoints; i++) \ for (j = nwin; --src, --j; acc++) \ mul_##field(acc[0], acc[-1], src->Z); \ \ --acc; reciprocal_##field(acc[0], acc[0]); \ \ for (i = 0; i < npoints; i++) { \ vec_copy(dst++, src++, sizeof(ptype##_affine)); \ for (j = 1; j < nwin; j++, acc--, src++, dst++) { \ mul_##field(acc[-1], acc[-1], acc[0]); /* 1/Z */\ sqr_##field(ZZ, acc[-1]); /* 1/Z^2 */\ mul_##field(ZZZ, ZZ, acc[-1]); /* 1/Z^3 */\ mul_##field(acc[-1], src->Z, acc[0]); \ mul_##field(dst->X, src->X, ZZ); /* X = X'/Z^2 */\ mul_##field(dst->Y, src->Y, ZZZ); /* Y = Y'/Z^3 */\ } \ } \ } \ \ /* flat |points[n]| can be placed at the end of |table[n<<(wbits-1)]| */\ static void ptype##s_precompute_wbits(ptype##_affine table[], size_t wbits, \ const ptype##_affine *const points[], \ size_t npoints) \ { \ size_t total = npoints << (wbits-1); \ size_t nwin = (size_t)1 << (wbits-1); \ size_t nmin = wbits>9 ? (size_t)1: (size_t)1 << (9-wbits); \ size_t i, top = 0; \ ptype *rows, *row; \ const ptype##_affine *point = NULL; \ size_t stride = ((512*1024)/sizeof(ptype##_affine)) >> wbits; \ if (stride == 0) stride = 1; \ \ while (npoints >= nmin) { \ size_t limit = total - npoints; \ \ if (top + (stride << wbits) > limit) { \ stride = (limit - top) >> wbits; \ if (stride == 0) break; \ } \ rows = row = (ptype *)(&table[top]); \ for (i = 0; i < stride; i++, row += nwin) \ point = *points ? *points++ : point+1, \ ptype##_precompute_row(row, nwin, point); \ ptype##s_to_affine_row_wbits(&table[top], rows, wbits, stride); \ top += stride << (wbits-1); \ npoints -= stride; \ } \ if ((i = 2*sizeof(ptype##_affine)*npoints*nwin) <= SCRATCH_LIMIT) { \ rows = row = alloca(i); \ for (i = 0; i < npoints; i++, row += nwin) \ point = *points ? *points++ : point+1, \ ptype##_precompute_row(row, nwin, point); \ ptype##s_to_affine_row_wbits(&table[top], rows, wbits, npoints); \ } else { \ const ptype *pp[2]; \ \ stride = SCRATCH_LIMIT / sizeof(ptype); \ stride -= stride % 2; \ if (stride > nwin) stride = nwin; \ \ pp[0] = row = alloca(stride * sizeof(ptype)); \ pp[1] = NULL; \ for (i = 0; i < npoints; i++, top += nwin) { \ size_t j, k, n; \ \ point = *points ? *points++ : point+1; \ ptype##_precompute_row(row, stride, point); \ ptype##s_to_affine(&table[top], pp, stride); \ for (j = stride; j < nwin; j += stride) { \ n = (j+stride) <= nwin ? 
stride : nwin-j; \ for (k = 0; k < n-1; k++) \ ptype##_add_affine(&row[k], &row[stride-1], &table[top+k]); \ if (j == stride) \ ptype##_double(&row[k], &row[stride-1]); \ else \ ptype##_add_affine(&row[k], &row[stride-1], &table[top+k]); \ ptype##s_to_affine(&table[top+j], pp, n); \ } \ } \ } \ } \ \ size_t prefix##s_mult_wbits_precompute_sizeof(size_t wbits, size_t npoints) \ { return (sizeof(ptype##_affine)*npoints) << (wbits-1); } \ void prefix##s_mult_wbits_precompute(ptype##_affine table[], size_t wbits, \ const ptype##_affine *const points[], \ size_t npoints) \ { ptype##s_precompute_wbits(table, wbits, points, npoints); } #define POINTS_MULT_WBITS_IMPL(prefix, ptype, bits, field, one) \ static void ptype##_gather_booth_wbits(ptype *p, const ptype##_affine row[], \ size_t wbits, limb_t booth_idx) \ { \ bool_t booth_sign = (booth_idx >> wbits) & 1; \ bool_t idx_is_zero; \ static const ptype##_affine infinity = { 0 }; \ \ booth_idx &= ((limb_t)1 << wbits) - 1; \ idx_is_zero = is_zero(booth_idx); \ booth_idx -= 1 ^ idx_is_zero; \ vec_select(p, &infinity, &row[booth_idx], sizeof(row[0]), idx_is_zero); \ ptype##_cneg(p, booth_sign); \ } \ \ static void ptype##s_mult_wbits(ptype *ret, const ptype##_affine table[], \ size_t wbits, size_t npoints, \ const byte *const scalars[], size_t nbits, \ ptype scratch[]) \ { \ limb_t wmask, wval; \ size_t i, j, z, nbytes, window, nwin = (size_t)1 << (wbits-1); \ const byte *scalar, *const *scalar_s = scalars; \ const ptype##_affine *row = table; \ \ size_t scratch_sz = SCRATCH_SZ(ptype); \ if (scratch == NULL) { \ scratch_sz /= 4; /* limit to 288K */ \ scratch_sz = scratch_sz < npoints ? scratch_sz : npoints; \ scratch = alloca(sizeof(ptype) * scratch_sz); \ } \ \ nbytes = (nbits + 7)/8; /* convert |nbits| to bytes */ \ scalar = *scalar_s++; \ \ /* top excess bits modulo target window size */ \ window = nbits % wbits; /* yes, it may be zero */ \ wmask = ((limb_t)1 << (window + 1)) - 1; \ \ nbits -= window; \ z = is_zero(nbits); \ wval = (get_wval_limb(scalar, nbits - (z^1), window + (z^1)) << z) & wmask; \ wval = booth_encode(wval, wbits); \ ptype##_gather_booth_wbits(&scratch[0], row, wbits, wval); \ row += nwin; \ \ i = 1; vec_zero(ret, sizeof(*ret)); \ while (nbits > 0) { \ for (j = i; i < npoints; i++, j++, row += nwin) { \ if (j == scratch_sz) \ ptype##s_accumulate(ret, scratch, j), j = 0; \ scalar = *scalar_s ? *scalar_s++ : scalar+nbytes; \ wval = get_wval_limb(scalar, nbits - 1, window + 1) & wmask; \ wval = booth_encode(wval, wbits); \ ptype##_gather_booth_wbits(&scratch[j], row, wbits, wval); \ } \ ptype##s_accumulate(ret, scratch, j); \ \ for (j = 0; j < wbits; j++) \ ptype##_double(ret, ret); \ \ window = wbits; \ wmask = ((limb_t)1 << (window + 1)) - 1; \ nbits -= window; \ i = 0; row = table; scalar_s = scalars; \ } \ \ for (j = i; i < npoints; i++, j++, row += nwin) { \ if (j == scratch_sz) \ ptype##s_accumulate(ret, scratch, j), j = 0; \ scalar = *scalar_s ? *scalar_s++ : scalar+nbytes; \ wval = (get_wval_limb(scalar, 0, window) << 1) & wmask; \ wval = booth_encode(wval, wbits); \ ptype##_gather_booth_wbits(&scratch[j], row, wbits, wval); \ } \ ptype##s_accumulate(ret, scratch, j); \ } \ \ size_t prefix##s_mult_wbits_scratch_sizeof(size_t npoints) \ { \ const size_t scratch_sz = SCRATCH_SZ(ptype); \ return sizeof(ptype) * (npoints < scratch_sz ? 
npoints : scratch_sz); \ } \ void prefix##s_mult_wbits(ptype *ret, const ptype##_affine table[], \ size_t wbits, size_t npoints, \ const byte *const scalars[], size_t nbits, \ ptype scratch[]) \ { ptype##s_mult_wbits(ret, table, wbits, npoints, scalars, nbits, scratch); } PRECOMPUTE_WBITS_IMPL(blst_p1, POINTonE1, 384, fp, BLS12_381_Rx.p) POINTS_MULT_WBITS_IMPL(blst_p1, POINTonE1, 384, fp, BLS12_381_Rx.p) PRECOMPUTE_WBITS_IMPL(blst_p2, POINTonE2, 384x, fp2, BLS12_381_Rx.p2) POINTS_MULT_WBITS_IMPL(blst_p2, POINTonE2, 384x, fp2, BLS12_381_Rx.p2) /* * Pippenger algorithm implementation, fastest option for larger amount * of points... */ static size_t pippenger_window_size(size_t npoints) { size_t wbits; for (wbits=0; npoints>>=1; wbits++) ; if (wbits > 12) return wbits - 3; else if (wbits > 8) return wbits - 2; else if (wbits > 4) return wbits - 1; return wbits ? 2 : 1; } #define DECLARE_PRIVATE_POINTXYZZ(ptype, bits) \ typedef struct { vec##bits X,Y,ZZZ,ZZ; } ptype##xyzz; #define POINTS_MULT_PIPPENGER_IMPL(prefix, ptype) \ static void ptype##_integrate_buckets(ptype *out, ptype##xyzz buckets[], \ size_t wbits) \ { \ ptype##xyzz ret[1], acc[1]; \ size_t n = (size_t)1 << wbits; \ \ /* Calculate sum of x[i-1]*i for i=1 through 1<<|wbits|. */\ vec_copy(acc, &buckets[--n], sizeof(acc)); \ vec_copy(ret, &buckets[n], sizeof(ret)); \ vec_zero(&buckets[n], sizeof(buckets[n])); \ while (n--) { \ ptype##xyzz_dadd(acc, acc, &buckets[n]); \ ptype##xyzz_dadd(ret, ret, acc); \ vec_zero(&buckets[n], sizeof(buckets[n])); \ } \ ptype##xyzz_to_Jacobian(out, ret); \ } \ \ static void ptype##_bucket(ptype##xyzz buckets[], limb_t booth_idx, \ size_t wbits, const ptype##_affine *p) \ { \ bool_t booth_sign = (booth_idx >> wbits) & 1; \ \ booth_idx &= (1< nbits) wbits = nbits - bit0, cbits = wbits + 1; \ else wbits = cbits = window; \ ptype##s_tile_pippenger(ret, points, npoints, scalars, nbits, scratch, \ bit0, wbits, cbits); \ } \ void prefix##s_mult_pippenger(ptype *ret, \ const ptype##_affine *const points[], \ size_t npoints, \ const byte *const scalars[], size_t nbits, \ ptype##xyzz scratch[]) \ { \ if (npoints == 1) { \ prefix##_from_affine(ret, points[0]); \ ptype##_mult_w5(ret, ret, scalars[0], nbits); \ return; \ } \ if ((npoints * sizeof(ptype##_affine) * 8 * 3) <= SCRATCH_LIMIT && \ npoints < 32) { \ ptype##_affine *table = alloca(npoints * sizeof(ptype##_affine) * 8); \ ptype##s_precompute_wbits(table, 4, points, npoints); \ ptype##s_mult_wbits(ret, table, 4, npoints, scalars, nbits, NULL); \ return; \ } \ ptype##s_mult_pippenger(ret, points, npoints, scalars, nbits, scratch, 0); \ } DECLARE_PRIVATE_POINTXYZZ(POINTonE1, 384) POINTXYZZ_TO_JACOBIAN_IMPL(POINTonE1, 384, fp) POINTXYZZ_DADD_IMPL(POINTonE1, 384, fp) POINTXYZZ_DADD_AFFINE_IMPL(POINTonE1, 384, fp, BLS12_381_Rx.p) POINTS_MULT_PIPPENGER_IMPL(blst_p1, POINTonE1) DECLARE_PRIVATE_POINTXYZZ(POINTonE2, 384x) POINTXYZZ_TO_JACOBIAN_IMPL(POINTonE2, 384x, fp2) POINTXYZZ_DADD_IMPL(POINTonE2, 384x, fp2) POINTXYZZ_DADD_AFFINE_IMPL(POINTonE2, 384x, fp2, BLS12_381_Rx.p2) POINTS_MULT_PIPPENGER_IMPL(blst_p2, POINTonE2) ================================================ FILE: src/no_asm.h ================================================ /* * Copyright Supranational LLC * Licensed under the Apache License, Version 2.0, see LICENSE for details. 
* SPDX-License-Identifier: Apache-2.0 */ #if LIMB_T_BITS==32 typedef unsigned long long llimb_t; #endif #if !defined(__STDC_VERSION__) || __STDC_VERSION__<199901 || defined(__STDC_NO_VLA__) # error "unsupported compiler" #endif #if defined(__clang__) # pragma GCC diagnostic ignored "-Wstatic-in-inline" #endif #if !defined(__clang__) && !defined(__builtin_assume) # if defined(__GNUC__) && __GNUC__>=5 # define __builtin_assume(condition) if (!(condition)) __builtin_unreachable() # elif defined(_MSC_VER) # define __builtin_assume(condition) __assume(condition) # else # define __builtin_assume(condition) (void)(condition) # endif #endif static void mul_mont_n(limb_t ret[], const limb_t a[], const limb_t b[], const limb_t p[], limb_t n0, size_t n) { __builtin_assume(n != 0 && n%2 == 0); llimb_t limbx; limb_t mask, borrow, mx, hi, tmp[n+1], carry; size_t i, j; for (mx=b[0], hi=0, i=0; i> LIMB_T_BITS); } mx = n0*tmp[0]; tmp[i] = hi; for (carry=0, j=0; ; ) { limbx = (mx * (llimb_t)p[0]) + tmp[0]; hi = (limb_t)(limbx >> LIMB_T_BITS); for (i=1; i> LIMB_T_BITS); } limbx = tmp[i] + (hi + (llimb_t)carry); tmp[i-1] = (limb_t)limbx; carry = (limb_t)(limbx >> LIMB_T_BITS); if (++j==n) break; for (mx=b[j], hi=0, i=0; i> LIMB_T_BITS); } mx = n0*tmp[0]; limbx = hi + (llimb_t)carry; tmp[i] = (limb_t)limbx; carry = (limb_t)(limbx >> LIMB_T_BITS); } for (borrow=0, i=0; i> LIMB_T_BITS) & 1; } mask = carry - borrow; launder(mask); for(i=0; i> LIMB_T_BITS); } for (borrow=0, i=0; i> LIMB_T_BITS) & 1; } mask = carry - borrow; launder(mask); for(i=0; i> LIMB_T_BITS) & 1; } mask = 0 - borrow; launder(mask); for (carry=0, i=0; i> LIMB_T_BITS); } } #define SUB_MOD_IMPL(bits) \ inline void sub_mod_##bits(vec##bits ret, const vec##bits a, \ const vec##bits b, const vec##bits p) \ { sub_mod_n(ret, a, b, p, NLIMBS(bits)); } SUB_MOD_IMPL(256) SUB_MOD_IMPL(384) static void mul_by_3_mod_n(limb_t ret[], const limb_t a[], const limb_t p[], size_t n) { __builtin_assume(n != 0); llimb_t limbx; limb_t mask, carry, borrow, tmp[n], two_a[n]; size_t i; for (carry=0, i=0; i>(LIMB_T_BITS-1); } for (borrow=0, i=0; i> LIMB_T_BITS) & 1; } mask = carry - borrow; launder(mask); for(i=0; i> LIMB_T_BITS); } for (borrow=0, i=0; i> LIMB_T_BITS) & 1; } mask = carry - borrow; launder(mask); for(i=0; i>(LIMB_T_BITS-1); } for (borrow=0, i=0; i> LIMB_T_BITS) & 1; } mask = carry - borrow; launder(mask); for(i=0; i> LIMB_T_BITS) & 1; } flag &= vec_is_zero(a, sizeof(tmp)) ^ 1; mask = (limb_t)0 - flag; for(i=0; i> LIMB_T_BITS) & 1; } return borrow & (is_zero(acc) ^ 1); } #define CHECK_MOD_IMPL(bits) \ inline limb_t check_mod_##bits(const pow##bits a, const vec##bits p) \ { return check_mod_n(a, p, NLIMBS(bits)); } CHECK_MOD_IMPL(256) static limb_t add_n_check_mod_n(byte ret[], const byte a[], const byte b[], const limb_t p[], size_t n) { __builtin_assume(n != 0); limb_t ret_[n], a_[n], b_[n], zero; limbs_from_le_bytes(a_, a, sizeof(a_)); limbs_from_le_bytes(b_, b, sizeof(b_)); add_mod_n(ret_, a_, b_, p, n); zero = vec_is_zero(ret_, sizeof(ret_)); le_bytes_from_limbs(ret, ret_, sizeof(ret_)); return zero^1; } #define ADD_N_CHECK_MOD_IMPL(bits) \ inline limb_t add_n_check_mod_##bits(pow##bits ret, const pow##bits a, \ const pow##bits b, const vec##bits p) \ { return add_n_check_mod_n(ret, a, b, p, NLIMBS(bits)); } ADD_N_CHECK_MOD_IMPL(256) static limb_t sub_n_check_mod_n(byte ret[], const byte a[], const byte b[], const limb_t p[], size_t n) { __builtin_assume(n != 0); limb_t ret_[n], a_[n], b_[n], zero; limbs_from_le_bytes(a_, a, sizeof(a_)); 
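    /* |a| and |b| arrive as little-endian byte strings (e.g. blst_scalar
     * images); once both are pulled into limb form, the result of the
     * modular subtraction below is serialized back and a non-zero flag
     * is returned */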
limbs_from_le_bytes(b_, b, sizeof(b_)); sub_mod_n(ret_, a_, b_, p, n); zero = vec_is_zero(ret_, sizeof(ret_)); le_bytes_from_limbs(ret, ret_, sizeof(ret_)); return zero^1; } #define SUB_N_CHECK_MOD_IMPL(bits) \ inline limb_t sub_n_check_mod_##bits(pow##bits ret, const pow##bits a, \ const pow##bits b, const vec##bits p) \ { return sub_n_check_mod_n(ret, a, b, p, NLIMBS(bits)); } SUB_N_CHECK_MOD_IMPL(256) static void from_mont_n(limb_t ret[], const limb_t a[], const limb_t p[], limb_t n0, size_t n) { __builtin_assume(n != 0 && n%2 == 0); llimb_t limbx; limb_t mask, borrow, mx, hi, tmp[n]; size_t i, j; for (j=0; j> LIMB_T_BITS); for (i=1; i> LIMB_T_BITS); } tmp[i-1] = hi; a = tmp; } /* this is needed only if input can be non-fully-reduced */ for (borrow=0, i=0; i> LIMB_T_BITS) & 1; } mask = 0 - borrow; launder(mask); for(i=0; i> LIMB_T_BITS); for (i=1; i> LIMB_T_BITS); } tmp[i-1] = hi; b = tmp; } for (carry=0, i=0; i> LIMB_T_BITS); } for (borrow=0, i=0; i> LIMB_T_BITS) & 1; } mask = carry - borrow; launder(mask); for(i=0; i> LIMB_T_BITS); } for (next=ret[0], i=0; i> 1; next = ret[i+1]; ret[i] = limb | next << (LIMB_T_BITS-1); } ret[i] = next >> 1 | carry << (LIMB_T_BITS-1); a = ret; } } #define RSHIFT_MOD_IMPL(bits) \ inline void rshift_mod_##bits(vec##bits ret, const vec##bits a, size_t count, \ const vec##bits p) \ { rshift_mod_n(ret, a, count, p, NLIMBS(bits)); } RSHIFT_MOD_IMPL(256) RSHIFT_MOD_IMPL(384) #define DIV_BY_2_MOD_IMPL(bits) \ inline void div_by_2_mod_##bits(vec##bits ret, const vec##bits a, \ const vec##bits p) \ { rshift_mod_n(ret, a, 1, p, NLIMBS(bits)); } DIV_BY_2_MOD_IMPL(384) static limb_t sgn0_pty_mod_n(const limb_t a[], const limb_t p[], size_t n) { __builtin_assume(n != 0); llimb_t limbx; limb_t carry, borrow, ret, tmp[n]; size_t i; ret = a[0] & 1; /* parity */ for (carry=0, i=0; i>(LIMB_T_BITS-1); } for (borrow=0, i=0; i> LIMB_T_BITS) & 1; } ret |= ((carry - borrow) & 2) ^ 2; return ret; } inline limb_t sgn0_pty_mod_384(const vec384 a, const vec384 p) { return sgn0_pty_mod_n(a, p, NLIMBS(384)); } inline limb_t sgn0_pty_mont_384(const vec384 a, const vec384 p, limb_t n0) { vec384 tmp; from_mont_n(tmp, a, p, n0, NLIMBS(384)); return sgn0_pty_mod_n(tmp, p, NLIMBS(384)); } inline limb_t sgn0_pty_mod_384x(const vec384x a, const vec384 p) { limb_t re, im, sign, prty; re = sgn0_pty_mod_n(a[0], p, NLIMBS(384)); im = sgn0_pty_mod_n(a[1], p, NLIMBS(384)); /* a->im!=0 ? sgn0(a->im) : sgn0(a->re) */ sign = (limb_t)0 - vec_is_zero(a[1], sizeof(vec384)); sign = (re & sign) | (im & ~sign); /* a->re==0 ? 
prty(a->im) : prty(a->re) */ prty = (limb_t)0 - vec_is_zero(a[0], sizeof(vec384)); prty = (im & prty) | (re & ~prty); return (sign & 2) | (prty & 1); } inline limb_t sgn0_pty_mont_384x(const vec384x a, const vec384 p, limb_t n0) { vec384x tmp; from_mont_n(tmp[0], a[0], p, n0, NLIMBS(384)); from_mont_n(tmp[1], a[1], p, n0, NLIMBS(384)); return sgn0_pty_mod_384x(tmp, p); } void mul_mont_384x(vec384x ret, const vec384x a, const vec384x b, const vec384 p, limb_t n0) { vec384 aa, bb, cc; add_mod_n(aa, a[0], a[1], p, NLIMBS(384)); add_mod_n(bb, b[0], b[1], p, NLIMBS(384)); mul_mont_n(bb, bb, aa, p, n0, NLIMBS(384)); mul_mont_n(aa, a[0], b[0], p, n0, NLIMBS(384)); mul_mont_n(cc, a[1], b[1], p, n0, NLIMBS(384)); sub_mod_n(ret[0], aa, cc, p, NLIMBS(384)); sub_mod_n(ret[1], bb, aa, p, NLIMBS(384)); sub_mod_n(ret[1], ret[1], cc, p, NLIMBS(384)); } /* * mul_mont_n without final conditional subtraction, which implies * that modulus is one bit short, which in turn means that there are * no carries to handle between iterations... */ static void mul_mont_nonred_n(limb_t ret[], const limb_t a[], const limb_t b[], const limb_t p[], limb_t n0, size_t n) { __builtin_assume(n != 0 && n%2 == 0); llimb_t limbx; limb_t mx, hi, tmp[n+1]; size_t i, j; for (mx=b[0], hi=0, i=0; i> LIMB_T_BITS); } mx = n0*tmp[0]; tmp[i] = hi; for (j=0; ; ) { limbx = (mx * (llimb_t)p[0]) + tmp[0]; hi = (limb_t)(limbx >> LIMB_T_BITS); for (i=1; i> LIMB_T_BITS); } tmp[i-1] = tmp[i] + hi; if (++j==n) break; for (mx=b[j], hi=0, i=0; i> LIMB_T_BITS); } mx = n0*tmp[0]; tmp[i] = hi; } vec_copy(ret, tmp, sizeof(tmp)-sizeof(limb_t)); } void sqr_n_mul_mont_383(vec384 ret, const vec384 a, size_t count, const vec384 p, limb_t n0, const vec384 b) { __builtin_assume(count != 0); while(count--) { mul_mont_nonred_n(ret, a, a, p, n0, NLIMBS(384)); a = ret; } mul_mont_n(ret, ret, b, p, n0, NLIMBS(384)); } void sqr_mont_382x(vec384x ret, const vec384x a, const vec384 p, limb_t n0) { llimb_t limbx; limb_t mask, carry, borrow; size_t i; vec384 t0, t1; /* "add_mod_n(t0, a[0], a[1], p, NLIMBS(384));" */ for (carry=0, i=0; i> LIMB_T_BITS); } /* "sub_mod_n(t1, a[0], a[1], p, NLIMBS(384));" */ for (borrow=0, i=0; i> LIMB_T_BITS) & 1; } mask = 0 - borrow; launder(mask); /* "mul_mont_n(ret[1], a[0], a[1], p, n0, NLIMBS(384));" */ mul_mont_nonred_n(ret[1], a[0], a[1], p, n0, NLIMBS(384)); /* "add_mod_n(ret[1], ret[1], ret[1], p, NLIMBS(384));" */ for (carry=0, i=0; i>(LIMB_T_BITS-1); } /* "mul_mont_n(ret[0], t0, t1, p, n0, NLIMBS(384));" */ mul_mont_nonred_n(ret[0], t0, t1, p, n0, NLIMBS(384)); /* account for t1's sign... 
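 * t1 = a[0]-a[1] above was computed without adding P back on borrow, so
 * when a[0] < a[1] the Montgomery product t0*t1 is off by exactly t0
 * (since 2^384 * R^(-1) == 1 mod P): subtract t0 under that mask and
 * add P back if the subtraction underflows.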
*/ for (borrow=0, i=0; i> LIMB_T_BITS) & 1; } mask = 0 - borrow; launder(mask); for (carry=0, i=0; i> LIMB_T_BITS); } } #if defined(__GNUC__) || defined(__clang__) # define MSB(x) ({ limb_t ret = (x) >> (LIMB_T_BITS-1); launder(ret); ret; }) #else # define MSB(x) ((x) >> (LIMB_T_BITS-1)) #endif static size_t num_bits(limb_t l) { limb_t x, mask; size_t bits = is_zero(l) ^ 1; if (sizeof(limb_t) == 8) { x = l >> (32 & (8*sizeof(limb_t)-1)); mask = 0 - MSB(0 - x); bits += 32 & mask; l ^= (x ^ l) & mask; } x = l >> 16; mask = 0 - MSB(0 - x); bits += 16 & mask; l ^= (x ^ l) & mask; x = l >> 8; mask = 0 - MSB(0 - x); bits += 8 & mask; l ^= (x ^ l) & mask; x = l >> 4; mask = 0 - MSB(0 - x); bits += 4 & mask; l ^= (x ^ l) & mask; x = l >> 2; mask = 0 - MSB(0 - x); bits += 2 & mask; l ^= (x ^ l) & mask; bits += l >> 1; return bits; } #if defined(__clang_major__) && __clang_major__>7 __attribute__((optnone)) #endif static limb_t lshift_2(limb_t hi, limb_t lo, size_t l) { size_t r = LIMB_T_BITS - l; limb_t mask = 0 - (is_zero(l)^1); return (hi << (l&(LIMB_T_BITS-1))) | ((lo & mask) >> (r&(LIMB_T_BITS-1))); } /* * https://eprint.iacr.org/2020/972 with 'k' being LIMB_T_BITS-1. */ static void ab_approximation_n(limb_t a_[2], const limb_t a[], limb_t b_[2], const limb_t b[], size_t n) { __builtin_assume(n != 0 && n%2 == 0); limb_t a_hi, a_lo, b_hi, b_lo, mask; size_t i; i = n-1; a_hi = a[i], a_lo = a[i-1]; b_hi = b[i], b_lo = b[i-1]; for (i--; --i;) { mask = 0 - is_zero(a_hi | b_hi); a_hi = ((a_lo ^ a_hi) & mask) ^ a_hi; b_hi = ((b_lo ^ b_hi) & mask) ^ b_hi; a_lo = ((a[i] ^ a_lo) & mask) ^ a_lo; b_lo = ((b[i] ^ b_lo) & mask) ^ b_lo; } i = LIMB_T_BITS - num_bits(a_hi | b_hi); /* |i| can be LIMB_T_BITS if all a[2..]|b[2..] were zeros */ a_[0] = a[0], a_[1] = lshift_2(a_hi, a_lo, i); b_[0] = b[0], b_[1] = lshift_2(b_hi, b_lo, i); } typedef struct { limb_t f0, g0, f1, g1; } factors; static void inner_loop_n(factors *fg, const limb_t a_[2], const limb_t b_[2], size_t n) { __builtin_assume(n != 0); llimb_t limbx; limb_t f0 = 1, g0 = 0, f1 = 0, g1 = 1; limb_t a_lo, a_hi, b_lo, b_hi, t_lo, t_hi, odd, borrow, xorm; a_lo = a_[0], a_hi = a_[1]; b_lo = b_[0], b_hi = b_[1]; while(n--) { odd = 0 - (a_lo&1); /* a_ -= b_ if a_ is odd */ t_lo = a_lo, t_hi = a_hi; limbx = a_lo - (llimb_t)(b_lo & odd); a_lo = (limb_t)limbx; borrow = (limb_t)(limbx >> LIMB_T_BITS) & 1; limbx = a_hi - ((llimb_t)(b_hi & odd) + borrow); a_hi = (limb_t)limbx; borrow = (limb_t)(limbx >> LIMB_T_BITS); /* negate a_-b_ if it borrowed */ a_lo ^= borrow; a_hi ^= borrow; limbx = a_lo + (llimb_t)(borrow & 1); a_lo = (limb_t)limbx; a_hi += (limb_t)(limbx >> LIMB_T_BITS) & 1; /* b_=a_ if a_-b_ borrowed */ b_lo = ((t_lo ^ b_lo) & borrow) ^ b_lo; b_hi = ((t_hi ^ b_hi) & borrow) ^ b_hi; /* exchange f0 and f1 if a_-b_ borrowed */ xorm = (f0 ^ f1) & borrow; f0 ^= xorm; f1 ^= xorm; /* exchange g0 and g1 if a_-b_ borrowed */ xorm = (g0 ^ g1) & borrow; g0 ^= xorm; g1 ^= xorm; /* subtract if a_ was odd */ f0 -= f1 & odd; g0 -= g1 & odd; f1 <<= 1; g1 <<= 1; a_lo >>= 1; a_lo |= a_hi << (LIMB_T_BITS-1); a_hi >>= 1; } fg->f0 = f0, fg->g0 = g0, fg->f1 = f1, fg->g1= g1; } static limb_t cneg_n(limb_t ret[], const limb_t a[], limb_t neg, size_t n) { __builtin_assume(n != 0); llimb_t limbx = 0; limb_t carry; size_t i; for (carry=neg&1, i=0; i> LIMB_T_BITS); } return 0 - MSB((limb_t)limbx); } static limb_t add_n(limb_t ret[], const limb_t a[], limb_t b[], size_t n) { __builtin_assume(n != 0); llimb_t limbx; limb_t carry; size_t i; for (carry=0, i=0; i> LIMB_T_BITS); } 
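    /* the carry out of the top limb is handed back to the caller, e.g.
     * smul_2n folds it into the high limb it accumulates separately */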
return carry; } static limb_t umul_n(limb_t ret[], const limb_t a[], limb_t b, size_t n) { __builtin_assume(n != 0); llimb_t limbx; limb_t hi; size_t i; for (hi=0, i=0; i> LIMB_T_BITS); } return hi; } static limb_t smul_n_shift_n(limb_t ret[], const limb_t a[], limb_t *f_, const limb_t b[], limb_t *g_, size_t n) { __builtin_assume(n != 0); limb_t a_[n+1], b_[n+1], f, g, neg, carry, hi; size_t i; /* |a|*|f_| */ f = *f_; neg = 0 - MSB(f); f = (f ^ neg) - neg; /* ensure |f| is positive */ (void)cneg_n(a_, a, neg, n); hi = umul_n(a_, a_, f, n); a_[n] = hi - (f & neg); /* |b|*|g_| */ g = *g_; neg = 0 - MSB(g); g = (g ^ neg) - neg; /* ensure |g| is positive */ (void)cneg_n(b_, b, neg, n); hi = umul_n(b_, b_, g, n); b_[n] = hi - (g & neg); /* |a|*|f_| + |b|*|g_| */ (void)add_n(a_, a_, b_, n+1); /* (|a|*|f_| + |b|*|g_|) >> k */ for (carry=a_[0], i=0; i> (LIMB_T_BITS-2); carry = a_[i+1]; ret[i] = hi | (carry << 2); } /* ensure result is non-negative, fix up |f_| and |g_| accordingly */ neg = 0 - MSB(carry); *f_ = (*f_ ^ neg) - neg; *g_ = (*g_ ^ neg) - neg; (void)cneg_n(ret, ret, neg, n); return neg; } static limb_t smul_2n(limb_t ret[], const limb_t u[], limb_t f, const limb_t v[], limb_t g, size_t n) { __builtin_assume(n != 0); limb_t u_[n], v_[n], neg, hi; /* |u|*|f_| */ neg = 0 - MSB(f); f = (f ^ neg) - neg; /* ensure |f| is positive */ neg = cneg_n(u_, u, neg, n); hi = umul_n(u_, u_, f, n) - (f&neg); /* |v|*|g_| */ neg = 0 - MSB(g); g = (g ^ neg) - neg; /* ensure |g| is positive */ neg = cneg_n(v_, v, neg, n); hi += umul_n(v_, v_, g, n) - (g&neg); /* |u|*|f_| + |v|*|g_| */ hi += add_n(ret, u_, v_, n); return hi; } static void ct_inverse_mod_n(limb_t ret[], const limb_t inp[], const limb_t mod[], const limb_t modx[], size_t n) { __builtin_assume(n != 0 && n%2 == 0); llimb_t limbx; limb_t a[n], b[n], u[2*n], v[2*n], t[2*n]; limb_t a_[2], b_[2], sign, carry, top; factors fg; size_t i; vec_copy(a, inp, sizeof(a)); vec_copy(b, mod, sizeof(b)); vec_zero(u, sizeof(u)); u[0] = 1; vec_zero(v, sizeof(v)); for (i=0; i<(2*n*LIMB_T_BITS)/(LIMB_T_BITS-2); i++) { ab_approximation_n(a_, a, b_, b, n); inner_loop_n(&fg, a_, b_, LIMB_T_BITS-2); (void)smul_n_shift_n(t, a, &fg.f0, b, &fg.g0, n); (void)smul_n_shift_n(b, a, &fg.f1, b, &fg.g1, n); vec_copy(a, t, sizeof(a)); smul_2n(t, u, fg.f0, v, fg.g0, 2*n); smul_2n(v, u, fg.f1, v, fg.g1, 2*n); vec_copy(u, t, sizeof(u)); } inner_loop_n(&fg, a, b, (2*n*LIMB_T_BITS)%(LIMB_T_BITS-2)); top = smul_2n(ret, u, fg.f1, v, fg.g1, 2*n); sign = 0 - MSB(top); /* top is 1, 0 or -1 */ for (carry=0, i=0; i> LIMB_T_BITS); } top += carry; sign = 0 - top; /* top is 1, 0 or -1 */ top |= sign; for (i=0; i> LIMB_T_BITS) & 1; limbx = a_hi - ((llimb_t)(b_hi & odd) + borrow); a_hi = (limb_t)limbx; borrow = (limb_t)(limbx >> LIMB_T_BITS); L += ((t_lo & b_lo) >> 1) & borrow; /* negate a_-b_ if it borrowed */ a_lo ^= borrow; a_hi ^= borrow; limbx = a_lo + (llimb_t)(borrow & 1); a_lo = (limb_t)limbx; a_hi += (limb_t)(limbx >> LIMB_T_BITS) & 1; /* b_=a_ if a_-b_ borrowed */ b_lo = ((t_lo ^ b_lo) & borrow) ^ b_lo; b_hi = ((t_hi ^ b_hi) & borrow) ^ b_hi; /* exchange f0 and f1 if a_-b_ borrowed */ xorm = (f0 ^ f1) & borrow; f0 ^= xorm; f1 ^= xorm; /* exchange g0 and g1 if a_-b_ borrowed */ xorm = (g0 ^ g1) & borrow; g0 ^= xorm; g1 ^= xorm; /* subtract if a_ was odd */ f0 -= f1 & odd; g0 -= g1 & odd; f1 <<= 1; g1 <<= 1; a_lo >>= 1; a_lo |= a_hi << (LIMB_T_BITS-1); a_hi >>= 1; L += (b_lo + 2) >> 2; } fg->f0 = f0, fg->g0 = g0, fg->f1 = f1, fg->g1 = g1; return L; } static bool_t ct_is_sqr_mod_n(const 
limb_t inp[], const limb_t mod[], size_t n)
{
    __builtin_assume(n != 0 && n%2 == 0);
    limb_t a[n], b[n], t[n];
    limb_t a_[2], b_[2], neg, L = 0;
    factors fg;
    size_t i;

    vec_copy(a, inp, sizeof(a));
    vec_copy(b, mod, sizeof(b));

    for (i=0; i<(2*n*LIMB_T_BITS)/(LIMB_T_BITS-2); i++) {
        ab_approximation_n(a_, a, b_, b, n);
        L = legendre_loop_n(L, &fg, a_, b_, LIMB_T_BITS-2);
        neg = smul_n_shift_n(t, a, &fg.f0, b, &fg.g0, n);
        (void)smul_n_shift_n(b, a, &fg.f1, b, &fg.g1, n);
        vec_copy(a, t, sizeof(a));
        L += (b[0] >> 1) & neg;
    }

    L = legendre_loop_n(L, &fg, a, b, (2*n*LIMB_T_BITS)%(LIMB_T_BITS-2));

    return (L & 1) ^ 1;
}

#define CT_IS_SQR_MOD_IMPL(bits) \
inline bool_t ct_is_square_mod_##bits(const vec##bits inp, \
                                      const vec##bits mod) \
{   return ct_is_sqr_mod_n(inp, mod, NLIMBS(bits));   }

CT_IS_SQR_MOD_IMPL(384)

/*
 * |div_top| points at two most significant limbs of the dividend, |d_hi|
 * and |d_lo| are two most significant limbs of the divisor. If divisor
 * is only one limb, it is to be passed in |d_hi| with zero in |d_lo|.
 * The divisor is required to be "bitwise left-aligned," and dividend's
 * top limbs to be not larger than the divisor's. The latter limitation
 * can be problematic in the first iteration of multi-precision division,
 * where in the most general case the condition would have to be "smaller."
 * The subroutine considers four limbs, two of which are "overlapping,"
 * hence the name... Another way to look at it is to think of the pair
 * of the dividend's limbs being suffixed with a zero:
 *         +-------+-------+-------+
 * R       |       |       |   0   |
 *         +-------+-------+-------+
 *         +-------+-------+
 * D       |       |       |
 *         +-------+-------+
 */
limb_t div_3_limbs(const limb_t div_top[2], limb_t d_lo, limb_t d_hi)
{
    llimb_t Rx;
    limb_t r_lo = div_top[0], r_hi = div_top[1];
    limb_t Q = 0, mask, borrow, rx;
    size_t i;

    for (i = 0; i < LIMB_T_BITS; i++) {
        /* "borrow, Rx = R - D" */
        Rx = (llimb_t)r_lo - d_lo;
        rx = (limb_t)Rx;
        borrow = (limb_t)(Rx >> LIMB_T_BITS) & 1;
        Rx = r_hi - (d_hi + (llimb_t)borrow);
        borrow = (limb_t)(Rx >> LIMB_T_BITS);

        /* "if (R >= D) R -= D" */
        r_lo = ((r_lo ^ rx) & borrow) ^ rx;
        rx = (limb_t)Rx;
        r_hi = ((r_hi ^ rx) & borrow) ^ rx;

        Q <<= 1;
        Q |= ~borrow & 1;

        /* "D >>= 1" */
        d_lo >>= 1;
        d_lo |= d_hi << (LIMB_T_BITS - 1);
        d_hi >>= 1;
    }

    mask = 0 - MSB(Q); /* does it overflow?
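 * If the top bit of Q is already set, the quotient estimate cannot
 * absorb one more doubling and is saturated instead: the final
 * "Q | mask" returns all ones.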
*/ /* "borrow, Rx = R - D" */ Rx = (llimb_t)r_lo - d_lo; rx = (limb_t)Rx; borrow = (limb_t)(Rx >> LIMB_T_BITS) & 1; Rx = r_hi - (d_hi + (llimb_t)borrow); borrow = (limb_t)(Rx >> LIMB_T_BITS) & 1; Q <<= 1; Q |= borrow ^ 1; return (Q | mask); } static limb_t quot_rem_n(limb_t *div_rem, const limb_t *divisor, limb_t quotient, size_t n) { __builtin_assume(n != 0 && n%2 == 0); llimb_t limbx; limb_t tmp[n+1], carry, mask, borrow; size_t i; /* divisor*quotient */ for (carry=0, i=0; i> LIMB_T_BITS); } tmp[i] = carry; /* remainder = dividend - divisor*quotient */ for (borrow=0, i=0; i<=n; i++) { limbx = div_rem[i] - (tmp[i] + (llimb_t)borrow); tmp[i] = (limb_t)limbx; borrow = (limb_t)(limbx >> LIMB_T_BITS) & 1; } mask = 0 - borrow; launder(mask); /* if quotient was off by one, add divisor to the remainder */ for (carry=0, i=0; i> LIMB_T_BITS) & 1; } return (div_rem[i] = quotient + mask); } inline limb_t quot_rem_128(limb_t *div_rem, const limb_t *divisor, limb_t quotient) { return quot_rem_n(div_rem, divisor, quotient, NLIMBS(128)); } inline limb_t quot_rem_64(limb_t *div_rem, const limb_t *divisor, limb_t quotient) { return quot_rem_n(div_rem, divisor, quotient, NLIMBS(64)); } /* * Unlock reference implementations in vect.c */ #define mul_by_8_mod_384 mul_by_8_mod_384 #define mul_by_8_mod_384x mul_by_8_mod_384x #define mul_by_3_mod_384x mul_by_3_mod_384x #define mul_by_1_plus_i_mod_384x mul_by_1_plus_i_mod_384x #define add_mod_384x add_mod_384x #define sub_mod_384x sub_mod_384x #define lshift_mod_384x lshift_mod_384x #define sqr_mont_384x sqr_mont_384x inline void vec_prefetch(const void *ptr, size_t len) { (void)ptr; (void)len; } /* * SHA-256 */ #define ROTR(x,n) ((x)>>n | (x)<<(32-n)) #define Sigma0(x) (ROTR((x),2) ^ ROTR((x),13) ^ ROTR((x),22)) #define Sigma1(x) (ROTR((x),6) ^ ROTR((x),11) ^ ROTR((x),25)) #define sigma0(x) (ROTR((x),7) ^ ROTR((x),18) ^ ((x)>>3)) #define sigma1(x) (ROTR((x),17) ^ ROTR((x),19) ^ ((x)>>10)) #define Ch(x,y,z) (((x) & (y)) ^ ((~(x)) & (z))) #define Maj(x,y,z) (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z))) void blst_sha256_block_data_order(unsigned int *v, const void *inp, size_t blocks) { static const unsigned int K256[64] = { 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967, 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070, 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3, 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2 }; unsigned int X[16], l, a, b, c, d, e, f, g, h, s0, s1, T1, T2; const unsigned char *data = inp; size_t round; a = v[0]; b = v[1]; c = v[2]; d = v[3]; e = v[4]; f = v[5]; g = v[6]; h = v[7]; while (blocks--) { for (round = 0; round < 16; round++) { l = (unsigned int)data[0] << 24; l |= (unsigned int)data[1] << 16; l |= (unsigned int)data[2] << 8; l |= (unsigned int)data[3]; data += 4; T1 = X[round] = l; T1 += h + Sigma1(e) + Ch(e, f, g) + K256[round]; T2 = Sigma0(a) + Maj(a, b, c); h = g; g = f; f = e; e = d + T1; d = c; c = b; b = a; a = T1 + T2; } for (; round < 64; round++) { s0 = X[(round + 
1) & 0x0f]; s0 = sigma0(s0); s1 = X[(round + 14) & 0x0f]; s1 = sigma1(s1); T1 = X[round & 0xf] += s0 + s1 + X[(round + 9) & 0xf]; T1 += h + Sigma1(e) + Ch(e, f, g) + K256[round]; T2 = Sigma0(a) + Maj(a, b, c); h = g; g = f; f = e; e = d + T1; d = c; c = b; b = a; a = T1 + T2; } a += v[0]; v[0] = a; b += v[1]; v[1] = b; c += v[2]; v[2] = c; d += v[3]; v[3] = d; e += v[4]; v[4] = e; f += v[5]; v[5] = f; g += v[6]; v[6] = g; h += v[7]; v[7] = h; } } #undef ROTR #undef Sigma0 #undef Sigma1 #undef sigma0 #undef sigma1 #undef Ch #undef Maj void blst_sha256_hcopy(unsigned int dst[8], const unsigned int src[8]) { size_t i; for (i=0; i<8; i++) dst[i] = src[i]; } void blst_sha256_emit(unsigned char md[32], const unsigned int h[8]) { size_t i; for (i=0; i<8; i++, md+=4) { unsigned int h_i = h[i]; md[0] = (unsigned char)(h_i >> 24); md[1] = (unsigned char)(h_i >> 16); md[2] = (unsigned char)(h_i >> 8); md[3] = (unsigned char)h_i; } } void blst_sha256_bcopy(void *dst_, const void *src_, size_t len) { unsigned char *dst = dst_; const unsigned char *src = src_; size_t i; for (i=0; iZ); /* Z1Z1 = Z1^2 */ mul_fp2(U2, Q->X, Z1Z1); /* U2 = X2*Z1Z1 */ mul_fp2(S2, Q->Y, R->Z); mul_fp2(S2, S2, Z1Z1); /* S2 = Y2*Z1*Z1Z1 */ sub_fp2(H, U2, R->X); /* H = U2-X1 */ sqr_fp2(HH, H); /* HH = H^2 */ add_fp2(I, HH, HH); add_fp2(I, I, I); /* I = 4*HH */ mul_fp2(J, H, I); /* J = H*I */ sub_fp2(r, S2, R->Y); add_fp2(r, r, r); /* r = 2*(S2-Y1) */ mul_fp2(V, R->X, I); /* V = X1*I */ sqr_fp2(T->X, r); sub_fp2(T->X, T->X, J); sub_fp2(T->X, T->X, V); sub_fp2(T->X, T->X, V); /* X3 = r^2-J-2*V */ mul_fp2(J, J, R->Y); sub_fp2(T->Y, V, T->X); mul_fp2(T->Y, T->Y, r); sub_fp2(T->Y, T->Y, J); sub_fp2(T->Y, T->Y, J); /* Y3 = r*(V-X3)-2*Y1*J */ add_fp2(T->Z, R->Z, H); sqr_fp2(T->Z, T->Z); sub_fp2(T->Z, T->Z, Z1Z1); sub_fp2(T->Z, T->Z, HH); /* Z3 = (Z1+H)^2-Z1Z1-HH */ /* * line evaluation */ mul_fp2(I, r, Q->X); mul_fp2(J, Q->Y, T->Z); sub_fp2(I, I, J); add_fp2(line[0], I, I); /* 2*(r*X2 - Y2*Z3) */ #ifdef r # undef r #else vec_copy(line[1], r, sizeof(r)); #endif vec_copy(line[2], T->Z, sizeof(T->Z)); } static void line_dbl(vec384fp6 line, POINTonE2 *T, const POINTonE2 *Q) { vec384x ZZ, A, B, C, D, E, F; /* * https://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-0.html#doubling-dbl-2009-alnr */ sqr_fp2(A, Q->X); /* A = X1^2 */ sqr_fp2(B, Q->Y); /* B = Y1^2 */ sqr_fp2(ZZ, Q->Z); /* ZZ = Z1^2 */ sqr_fp2(C, B); /* C = B^2 */ add_fp2(D, Q->X, B); /* X1+B */ sqr_fp2(D, D); /* (X1+B)^2 */ sub_fp2(D, D, A); /* (X1+B)^2-A */ sub_fp2(D, D, C); /* (X1+B)^2-A-C */ add_fp2(D, D, D); /* D = 2*((X1+B)^2-A-C) */ mul_by_3_fp2(E, A); /* E = 3*A */ sqr_fp2(F, E); /* F = E^2 */ add_fp2(line[0], E, Q->X); /* 3*A+X1 for line evaluation */ sub_fp2(T->X, F, D); sub_fp2(T->X, T->X, D); /* X3 = F-2*D */ add_fp2(T->Z, Q->Y, Q->Z); sqr_fp2(T->Z, T->Z); sub_fp2(T->Z, T->Z, B); sub_fp2(T->Z, T->Z, ZZ); /* Z3 = (Y1+Z1)^2-B-ZZ */ mul_by_8_fp2(C, C); /* 8*C */ sub_fp2(T->Y, D, T->X); /* D-X3 */ mul_fp2(T->Y, T->Y, E); /* E*(D-X3) */ sub_fp2(T->Y, T->Y, C); /* Y3 = E*(D-X3)-8*C */ /* * line evaluation */ sqr_fp2(line[0], line[0]); sub_fp2(line[0], line[0], A); sub_fp2(line[0], line[0], F); /* (3*A+X1)^2 - X1^2 - 9*A^2 */ lshift_fp2(B, B, 2); sub_fp2(line[0], line[0], B); /* 6*X1^3 - 4*Y1^2 */ mul_fp2(line[1], E, ZZ); /* 3*X1^2 * Z1^2 */ mul_fp2(line[2], T->Z, ZZ); /* Z3 * Z1^2 */ } static void line_by_Px2(vec384fp6 line, const POINTonE1_affine *Px2) { mul_fp(line[1][0], line[1][0], Px2->X); /* "b01" *= -2*P->X */ mul_fp(line[1][1], line[1][1], Px2->X); 
#if 0
static void add_n_dbl(vec384fp12 ret, POINTonE2 *T, const POINTonE2_affine *Q,
                      const POINTonE1_affine *Px2, vec384fp6 line, size_t n)
{
    line_add(line, T, T, Q);    line_by_Px2(line, Px2);
    mul_by_xy00z0_fp12(ret, ret, line);
    while (n--) {
        sqr_fp12(ret, ret);
        line_dbl(line, T, T);   line_by_Px2(line, Px2);
        mul_by_xy00z0_fp12(ret, ret, line);
    }
}

static void miller_loop(vec384fp12 ret, const POINTonE2 *Q, const POINTonE1 *P)
{
#define Q ((const POINTonE2_affine *)Q)
    POINTonE2 T[1];
    POINTonE1_affine Px2[1];
    vec384fp6 line;     /* it's not actual fp6, but 3 packed fp2, "xy00z0" */

    /* Move common expression from line evaluation to line_by_Px2. */
    add_fp(Px2->X, P->X, P->X);
    neg_fp(Px2->X, Px2->X);
    add_fp(Px2->Y, P->Y, P->Y);

    vec_copy(T->X, Q->X, 2*sizeof(T->X));
    vec_copy(T->Z, BLS12_381_Rx.p2, sizeof(T->Z));

    /* first step is ret = 1^2*line, which is replaced with ret = line */
    line_dbl(line, T, T);                       /* 0x2 */
    line_by_Px2(line, Px2);
    vec_zero(ret, sizeof(vec384fp12));
    vec_copy(ret[0][0], line[0], 2*sizeof(vec384fp2));
    vec_copy(ret[1][1], line[2], sizeof(vec384fp2));
    add_n_dbl(ret, T, Q, Px2, line, 2);         /* ..0xc */
    add_n_dbl(ret, T, Q, Px2, line, 3);         /* ..0x68 */
    add_n_dbl(ret, T, Q, Px2, line, 9);         /* ..0xd200 */
    add_n_dbl(ret, T, Q, Px2, line, 32);        /* ..0xd20100000000 */
    add_n_dbl(ret, T, Q, Px2, line, 16);        /* ..0xd201000000010000 */
    conjugate_fp12(ret);                /* account for z being negative */
#undef Q
}
#endif

static void start_dbl_n(vec384fp12 ret, POINTonE2 T[],
                        const POINTonE1_affine Px2[], size_t n)
{
    size_t i;
    vec384fp6 line;     /* it's not actual fp6, but 3 packed fp2, "xy00z0" */

    /* first step is ret = 1^2*line, which is replaced with ret = line */
    line_dbl(line, T+0, T+0);   line_by_Px2(line, Px2+0);
    vec_zero(ret, sizeof(vec384fp12));
    vec_copy(ret[0][0], line[0], 2*sizeof(vec384fp2));
    vec_copy(ret[1][1], line[2], sizeof(vec384fp2));

    for (i = 1; i < n; i++) {
        line_dbl(line, T+i, T+i);   line_by_Px2(line, Px2+i);
        mul_by_xy00z0_fp12(ret, ret, line);
    }
}

static void add_n_dbl_n(vec384fp12 ret, POINTonE2 T[],
                        const POINTonE2_affine Q[],
                        const POINTonE1_affine Px2[],
                        size_t n, size_t k)
{
    size_t i;
    vec384fp6 line;     /* it's not actual fp6, but 3 packed fp2, "xy00z0" */

    for (i = 0; i < n; i++) {
        line_add(line, T+i, T+i, Q+i);  line_by_Px2(line, Px2+i);
        mul_by_xy00z0_fp12(ret, ret, line);
    }
    while (k--) {
        sqr_fp12(ret, ret);
        for (i = 0; i < n; i++) {
            line_dbl(line, T+i, T+i);   line_by_Px2(line, Px2+i);
            mul_by_xy00z0_fp12(ret, ret, line);
        }
    }
}

static void miller_loop_n(vec384fp12 ret, const POINTonE2_affine Q[],
                          const POINTonE1_affine P[], size_t n)
{
#if !defined(__STDC_VERSION__) || __STDC_VERSION__<199901 \
                               || defined(__STDC_NO_VLA__)
    POINTonE2 *T = alloca(n*sizeof(POINTonE2));
    POINTonE1_affine *Px2 = alloca(n*sizeof(POINTonE1_affine));
#else
    POINTonE2 T[n];
    POINTonE1_affine Px2[n];
#endif
    size_t i;

    if ((n == 1) && (vec_is_zero(&Q[0], sizeof(Q[0])) |
                     vec_is_zero(&P[0], sizeof(P[0]))) ) {
        /*
         * Special case of infinite aggregated signature, pair the additive
         * group's identity with the multiplicative group's identity.
         */
        vec_copy(ret, BLS12_381_Rx.p12, sizeof(vec384fp12));
        return;
    }

    for (i = 0; i < n; i++) {
        /* Move common expression from line evaluation to line_by_Px2. */
        add_fp(Px2[i].X, P[i].X, P[i].X);
        neg_fp(Px2[i].X, Px2[i].X);
        add_fp(Px2[i].Y, P[i].Y, P[i].Y);

        vec_copy(T[i].X, Q[i].X, 2*sizeof(T[i].X));
        vec_copy(T[i].Z, BLS12_381_Rx.p2, sizeof(T[i].Z));
    }

    /* first step is ret = 1^2*line, which is replaced with ret = line */
    start_dbl_n(ret, T, Px2, n);                /* 0x2 */
    add_n_dbl_n(ret, T, Q, Px2, n, 2);          /* ..0xc */
    add_n_dbl_n(ret, T, Q, Px2, n, 3);          /* ..0x68 */
    add_n_dbl_n(ret, T, Q, Px2, n, 9);          /* ..0xd200 */
    add_n_dbl_n(ret, T, Q, Px2, n, 32);         /* ..0xd20100000000 */
    add_n_dbl_n(ret, T, Q, Px2, n, 16);         /* ..0xd201000000010000 */
    conjugate_fp12(ret);                /* account for z being negative */
}
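/*
 * Schedule sanity check (annotation only): with e the exponent accumulated
 * so far, each add_n_dbl_n(..., k) step maps e to (e+1)*2^k, so the chain
 * above evaluates to
 *
 *      0x2 -> 0xc -> 0x68 -> 0xd200 -> 0xd20100000000
 *          -> 0xd201000000010000,
 *
 * i.e. |z| for the BLS12-381 parameter z = -0xd201000000010000; the final
 * conjugation accounts for the sign.
 */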
static void pre_add_n_dbl(vec384fp6 lines[], POINTonE2 *T,
                          const POINTonE2_affine *Q, size_t n)
{
    line_add(lines++[0], T, T, Q);
    while (n--)
        line_dbl(lines++[0], T, T);
}

static void precompute_lines(vec384fp6 Qlines[68], const POINTonE2_affine *Q)
{
    POINTonE2 T[1];

    vec_copy(T->X, Q->X, 2*sizeof(T->X));
    vec_copy(T->Z, BLS12_381_Rx.p2, sizeof(T->Z));

    line_dbl(Qlines[0], T, T);                  /* 0x2 */
    pre_add_n_dbl(&Qlines[1], T, Q, 2);         /* ..0xc */
    pre_add_n_dbl(&Qlines[4], T, Q, 3);         /* ..0x68 */
    pre_add_n_dbl(&Qlines[8], T, Q, 9);         /* ..0xd200 */
    pre_add_n_dbl(&Qlines[18], T, Q, 32);       /* ..0xd20100000000 */
    pre_add_n_dbl(&Qlines[51], T, Q, 16);       /* ..0xd201000000010000 */
}
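/*
 * Count check (annotation only): precompute_lines stores 1 line for the
 * initial doubling plus 1+n lines per pre_add_n_dbl(&Qlines[i], T, Q, n)
 * call, 1 + 3 + 4 + 10 + 33 + 17 = 68, which is the Qlines[68] bound, and
 * each offset i is the running total of lines already stored.
 */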
*/ add_fp(Px2->X, P->X, P->X); neg_fp(Px2->X, Px2->X); add_fp(Px2->Y, P->Y, P->Y); /* first step is ret = 1^2*line, which is replaced with ret = line */ post_line_by_Px2(line, Qlines[0], Px2); /* 0x2 */ vec_zero(ret, sizeof(vec384fp12)); vec_copy(ret[0][0], line[0], 2*sizeof(vec384fp2)); vec_copy(ret[1][1], line[2], sizeof(vec384fp2)); post_add_n_dbl(ret, &Qlines[1], Px2, 2); /* ..0xc */ post_add_n_dbl(ret, &Qlines[4], Px2, 3); /* ..0x68 */ post_add_n_dbl(ret, &Qlines[8], Px2, 9); /* ..0xd200 */ post_add_n_dbl(ret, &Qlines[18], Px2, 32); /* ..0xd20100000000 */ post_add_n_dbl(ret, &Qlines[51], Px2, 16); /* ..0xd201000000010000 */ conjugate_fp12(ret); /* account for z being negative */ } #ifdef INTERNAL_TESTMODE static void miller_loop_alt(vec384fp12 ret, const POINTonE2_affine *Q, const POINTonE1_affine *P) { vec384fp6 lines[68]; precompute_lines(lines, Q); miller_loop_lines(ret, lines, P); } #endif static void mul_n_sqr(vec384fp12 ret, const vec384fp12 a, size_t n) { mul_fp12(ret, ret, a); while (n--) cyclotomic_sqr_fp12(ret, ret); } static void raise_to_z_div_by_2(vec384fp12 ret, const vec384fp12 a) { cyclotomic_sqr_fp12(ret, a); /* 0x2 */ mul_n_sqr(ret, a, 2); /* ..0xc */ mul_n_sqr(ret, a, 3); /* ..0x68 */ mul_n_sqr(ret, a, 9); /* ..0xd200 */ mul_n_sqr(ret, a, 32); /* ..0xd20100000000 */ mul_n_sqr(ret, a, 16-1); /* ..0x6900800000008000 */ conjugate_fp12(ret); /* account for z being negative */ } #define raise_to_z(a, b) (raise_to_z_div_by_2(a, b), cyclotomic_sqr_fp12(a, a)) /* * Adaptation from /pairing/src/bls12_381/mod.rs */ static void final_exp(vec384fp12 ret, const vec384fp12 f) { vec384fp12 y0, y1, y2, y3; vec_copy(y1, f, sizeof(y1)); conjugate_fp12(y1); inverse_fp12(y2, f); mul_fp12(ret, y1, y2); frobenius_map_fp12(y2, ret, 2); mul_fp12(ret, ret, y2); cyclotomic_sqr_fp12(y0, ret); raise_to_z(y1, y0); raise_to_z_div_by_2(y2, y1); vec_copy(y3, ret, sizeof(y3)); conjugate_fp12(y3); mul_fp12(y1, y1, y3); conjugate_fp12(y1); mul_fp12(y1, y1, y2); raise_to_z(y2, y1); raise_to_z(y3, y2); conjugate_fp12(y1); mul_fp12(y3, y3, y1); conjugate_fp12(y1); frobenius_map_fp12(y1, y1, 3); frobenius_map_fp12(y2, y2, 2); mul_fp12(y1, y1, y2); raise_to_z(y2, y3); mul_fp12(y2, y2, y0); mul_fp12(y2, y2, ret); mul_fp12(y1, y1, y2); frobenius_map_fp12(y2, y3, 1); mul_fp12(ret, y1, y2); } void blst_miller_loop(vec384fp12 ret, const POINTonE2_affine *Q, const POINTonE1_affine *P) { miller_loop_n(ret, Q ? Q : (const POINTonE2_affine *)&BLS12_381_G2, P ? P : (const POINTonE1_affine *)&BLS12_381_G1, 1); } #ifndef MILLER_LOOP_N_MAX # define MILLER_LOOP_N_MAX 16 #endif void blst_miller_loop_n(vec384fp12 out, const POINTonE2_affine *const Qs[], const POINTonE1_affine *const Ps[], size_t n) { /* ~10KB of stack storage */ POINTonE2 T[MILLER_LOOP_N_MAX]; POINTonE2_affine Q[MILLER_LOOP_N_MAX]; POINTonE1_affine Px2[MILLER_LOOP_N_MAX]; const POINTonE2_affine *Qptr = NULL; const POINTonE1_affine *Pptr = NULL; size_t i, j; for (i = 0, j = 0; j < n; j++) { Qptr = *Qs ? *Qs++ : Qptr+1; Pptr = *Ps ? *Ps++ : Pptr+1; /* Move common expression from line evaluation to line_by_Px2. */ add_fp(Px2[i].X, Pptr->X, Pptr->X); neg_fp(Px2[i].X, Px2[i].X); add_fp(Px2[i].Y, Pptr->Y, Pptr->Y); vec_copy(Q[i].X, Qptr->X, 2*sizeof(Q[i].X)); vec_copy(T[i].X, Qptr->X, 2*sizeof(T[i].X)); vec_copy(T[i].Z, BLS12_381_Rx.p2, sizeof(T[i].Z)); if (++i == MILLER_LOOP_N_MAX || j == n-1) { vec384fp12 tmp; vec384fp6 *ret = j < MILLER_LOOP_N_MAX ? 
================================================
FILE: src/pentaroot-addchain.h
================================================
/*
 * Copyright Supranational LLC
 * Licensed under the Apache License, Version 2.0, see LICENSE for details.
 * SPDX-License-Identifier: Apache-2.0
 */
/*
 * The "magic" number is 1/5 modulo BLS12_381_r-1. Exponentiation to which
 * yields 5th root of the base.
 *
 * Generated with 'addchain 20974350070050476191779096203274386335076221000211055129041463479975432473805'
 * https://github.com/kwantam/addchain
 * # Bos-Coster (win=4) : 307 (15)
 * # Bos-Coster (win=10) : 307 (18)
 * # Yacobi : 319 (16)
 * # Bos-Coster (win=2) : 319 ( 5)
 * # Bos-Coster (win=5) : 306 (19) <<<
 * # Bos-Coster (win=7) : 311 (22)
 * # Bos-Coster (win=9) : 313 (20)
 * # Bos-Coster (win=3) : 314 ( 9)
 * # Bos-Coster (win=6) : 309 (21)
 * # Bos-Coster (win=8) : 309 (23)
 * # Bergeron-Berstel-Brlek-Duboc : 334 ( 5)
 */

#define PENTAROOT_MOD_BLS12_381_r(out, inp, ptype) do { \
ptype t[19]; \
vec_copy(t[1], inp, sizeof(ptype));     /* 0: 1 */\
sqr(t[7], t[1]);                        /* 1: 2 */\
sqr(t[0], t[7]);                        /* 2: 4 */\
sqr(t[2], t[0]);                        /* 3: 8 */\
mul(t[10], t[2], t[1]);                 /* 4: 9 */\
mul(t[3], t[10], t[7]);                 /* 5: b */\
mul(t[1], t[10], t[0]);                 /* 6: d */\
mul(t[5], t[3], t[0]);                  /* 7: f */\
mul(t[9], t[10], t[2]);                 /* 8: 11 */\
mul(t[4], t[3], t[2]);                  /* 9: 13 */\
mul(t[15], t[5], t[2]);                 /* 10: 17 */\
mul(t[8], t[15], t[2]);                 /* 11: 1f */\
mul(t[13], t[8], t[7]);                 /* 12: 21 */\
mul(t[14], t[8], t[0]);                 /* 13: 23 */\
mul(t[12], t[13], t[0]);                /* 14: 25 */\
mul(t[6], t[8], t[2]);                  /* 15: 27 */\
mul(t[11], t[14], t[2]);                /* 16: 2b */\
sqr(t[0], t[15]);                       /* 17: 2e */\
mul(t[18], t[6], t[2]);                 /* 18: 2f */\
mul(t[2], t[11], t[2]);                 /* 19: 33 */\
mul(t[16], t[2], t[7]);                 /* 20: 35 */\
mul(t[7], t[0], t[3]);                  /* 21: 39 */\
mul(t[17], t[0], t[5]);                 /* 22: 3d */\
/* sqr(t[0], t[0]); */                  /* 23: 5c */\
/* sqr(t[0], t[0]); */                  /* 24: b8 */\
/* sqr(t[0], t[0]); */                  /* 25: 170 */\
/* sqr(t[0], t[0]); */                  /* 26: 2e0 */\
/* sqr(t[0], t[0]); */                  /* 27: 5c0 */\
/* sqr(t[0], t[0]); */                  /* 28: b80 */\
/* sqr(t[0], t[0]); */                  /* 29: 1700 */\
sqr_n_mul(t[0], t[0], 7, t[18]);        /* 30: 172f */\
/* sqr(t[0], t[0]); */                  /* 31: 2e5e */\
/* sqr(t[0], t[0]); */                  /* 32: 5cbc */\
/* sqr(t[0], t[0]); */                  /* 33: b978 */\
/* sqr(t[0], t[0]); */                  /* 34: 172f0 */\
/* sqr(t[0], t[0]); */
/* 35: 2e5e0 */\ /* sqr(t[0], t[0]); */ /* 36: 5cbc0 */\ sqr_n_mul(t[0], t[0], 6, t[13]); /* 37: 5cbe1 */\ /* sqr(t[0], t[0]); */ /* 38: b97c2 */\ /* sqr(t[0], t[0]); */ /* 39: 172f84 */\ /* sqr(t[0], t[0]); */ /* 40: 2e5f08 */\ /* sqr(t[0], t[0]); */ /* 41: 5cbe10 */\ /* sqr(t[0], t[0]); */ /* 42: b97c20 */\ /* sqr(t[0], t[0]); */ /* 43: 172f840 */\ sqr_n_mul(t[0], t[0], 6, t[17]); /* 44: 172f87d */\ /* sqr(t[0], t[0]); */ /* 45: 2e5f0fa */\ /* sqr(t[0], t[0]); */ /* 46: 5cbe1f4 */\ /* sqr(t[0], t[0]); */ /* 47: b97c3e8 */\ /* sqr(t[0], t[0]); */ /* 48: 172f87d0 */\ /* sqr(t[0], t[0]); */ /* 49: 2e5f0fa0 */\ /* sqr(t[0], t[0]); */ /* 50: 5cbe1f40 */\ sqr_n_mul(t[0], t[0], 6, t[16]); /* 51: 5cbe1f75 */\ /* sqr(t[0], t[0]); */ /* 52: b97c3eea */\ /* sqr(t[0], t[0]); */ /* 53: 172f87dd4 */\ /* sqr(t[0], t[0]); */ /* 54: 2e5f0fba8 */\ /* sqr(t[0], t[0]); */ /* 55: 5cbe1f750 */\ /* sqr(t[0], t[0]); */ /* 56: b97c3eea0 */\ sqr_n_mul(t[0], t[0], 5, t[15]); /* 57: b97c3eeb7 */\ /* sqr(t[0], t[0]); */ /* 58: 172f87dd6e */\ /* sqr(t[0], t[0]); */ /* 59: 2e5f0fbadc */\ /* sqr(t[0], t[0]); */ /* 60: 5cbe1f75b8 */\ /* sqr(t[0], t[0]); */ /* 61: b97c3eeb70 */\ /* sqr(t[0], t[0]); */ /* 62: 172f87dd6e0 */\ /* sqr(t[0], t[0]); */ /* 63: 2e5f0fbadc0 */\ sqr_n_mul(t[0], t[0], 6, t[15]); /* 64: 2e5f0fbadd7 */\ /* sqr(t[0], t[0]); */ /* 65: 5cbe1f75bae */\ /* sqr(t[0], t[0]); */ /* 66: b97c3eeb75c */\ /* sqr(t[0], t[0]); */ /* 67: 172f87dd6eb8 */\ /* sqr(t[0], t[0]); */ /* 68: 2e5f0fbadd70 */\ /* sqr(t[0], t[0]); */ /* 69: 5cbe1f75bae0 */\ /* sqr(t[0], t[0]); */ /* 70: b97c3eeb75c0 */\ /* sqr(t[0], t[0]); */ /* 71: 172f87dd6eb80 */\ /* sqr(t[0], t[0]); */ /* 72: 2e5f0fbadd700 */\ sqr_n_mul(t[0], t[0], 8, t[14]); /* 73: 2e5f0fbadd723 */\ /* sqr(t[0], t[0]); */ /* 74: 5cbe1f75bae46 */\ /* sqr(t[0], t[0]); */ /* 75: b97c3eeb75c8c */\ /* sqr(t[0], t[0]); */ /* 76: 172f87dd6eb918 */\ /* sqr(t[0], t[0]); */ /* 77: 2e5f0fbadd7230 */\ /* sqr(t[0], t[0]); */ /* 78: 5cbe1f75bae460 */\ /* sqr(t[0], t[0]); */ /* 79: b97c3eeb75c8c0 */\ /* sqr(t[0], t[0]); */ /* 80: 172f87dd6eb9180 */\ /* sqr(t[0], t[0]); */ /* 81: 2e5f0fbadd72300 */\ sqr_n_mul(t[0], t[0], 8, t[13]); /* 82: 2e5f0fbadd72321 */\ /* sqr(t[0], t[0]); */ /* 83: 5cbe1f75bae4642 */\ /* sqr(t[0], t[0]); */ /* 84: b97c3eeb75c8c84 */\ /* sqr(t[0], t[0]); */ /* 85: 172f87dd6eb91908 */\ /* sqr(t[0], t[0]); */ /* 86: 2e5f0fbadd723210 */\ /* sqr(t[0], t[0]); */ /* 87: 5cbe1f75bae46420 */\ /* sqr(t[0], t[0]); */ /* 88: b97c3eeb75c8c840 */\ sqr_n_mul(t[0], t[0], 6, t[2]); /* 89: b97c3eeb75c8c873 */\ /* sqr(t[0], t[0]); */ /* 90: 172f87dd6eb9190e6 */\ /* sqr(t[0], t[0]); */ /* 91: 2e5f0fbadd72321cc */\ /* sqr(t[0], t[0]); */ /* 92: 5cbe1f75bae464398 */\ /* sqr(t[0], t[0]); */ /* 93: b97c3eeb75c8c8730 */\ /* sqr(t[0], t[0]); */ /* 94: 172f87dd6eb9190e60 */\ /* sqr(t[0], t[0]); */ /* 95: 2e5f0fbadd72321cc0 */\ sqr_n_mul(t[0], t[0], 6, t[13]); /* 96: 2e5f0fbadd72321ce1 */\ /* sqr(t[0], t[0]); */ /* 97: 5cbe1f75bae46439c2 */\ /* sqr(t[0], t[0]); */ /* 98: b97c3eeb75c8c87384 */\ /* sqr(t[0], t[0]); */ /* 99: 172f87dd6eb9190e708 */\ /* sqr(t[0], t[0]); */ /* 100: 2e5f0fbadd72321ce10 */\ /* sqr(t[0], t[0]); */ /* 101: 5cbe1f75bae46439c20 */\ /* sqr(t[0], t[0]); */ /* 102: b97c3eeb75c8c873840 */\ /* sqr(t[0], t[0]); */ /* 103: 172f87dd6eb9190e7080 */\ sqr_n_mul(t[0], t[0], 7, t[12]); /* 104: 172f87dd6eb9190e70a5 */\ /* sqr(t[0], t[0]); */ /* 105: 2e5f0fbadd72321ce14a */\ /* sqr(t[0], t[0]); */ /* 106: 5cbe1f75bae46439c294 */\ /* sqr(t[0], t[0]); */ /* 107: b97c3eeb75c8c8738528 
*/\ /* sqr(t[0], t[0]); */ /* 108: 172f87dd6eb9190e70a50 */\ /* sqr(t[0], t[0]); */ /* 109: 2e5f0fbadd72321ce14a0 */\ /* sqr(t[0], t[0]); */ /* 110: 5cbe1f75bae46439c2940 */\ /* sqr(t[0], t[0]); */ /* 111: b97c3eeb75c8c87385280 */\ /* sqr(t[0], t[0]); */ /* 112: 172f87dd6eb9190e70a500 */\ sqr_n_mul(t[0], t[0], 8, t[11]); /* 113: 172f87dd6eb9190e70a52b */\ /* sqr(t[0], t[0]); */ /* 114: 2e5f0fbadd72321ce14a56 */\ /* sqr(t[0], t[0]); */ /* 115: 5cbe1f75bae46439c294ac */\ /* sqr(t[0], t[0]); */ /* 116: b97c3eeb75c8c873852958 */\ /* sqr(t[0], t[0]); */ /* 117: 172f87dd6eb9190e70a52b0 */\ /* sqr(t[0], t[0]); */ /* 118: 2e5f0fbadd72321ce14a560 */\ /* sqr(t[0], t[0]); */ /* 119: 5cbe1f75bae46439c294ac0 */\ sqr_n_mul(t[0], t[0], 6, t[1]); /* 120: 5cbe1f75bae46439c294acd */\ /* sqr(t[0], t[0]); */ /* 121: b97c3eeb75c8c873852959a */\ /* sqr(t[0], t[0]); */ /* 122: 172f87dd6eb9190e70a52b34 */\ /* sqr(t[0], t[0]); */ /* 123: 2e5f0fbadd72321ce14a5668 */\ /* sqr(t[0], t[0]); */ /* 124: 5cbe1f75bae46439c294acd0 */\ /* sqr(t[0], t[0]); */ /* 125: b97c3eeb75c8c873852959a0 */\ /* sqr(t[0], t[0]); */ /* 126: 172f87dd6eb9190e70a52b340 */\ /* sqr(t[0], t[0]); */ /* 127: 2e5f0fbadd72321ce14a56680 */\ /* sqr(t[0], t[0]); */ /* 128: 5cbe1f75bae46439c294acd00 */\ sqr_n_mul(t[0], t[0], 8, t[2]); /* 129: 5cbe1f75bae46439c294acd33 */\ /* sqr(t[0], t[0]); */ /* 130: b97c3eeb75c8c873852959a66 */\ /* sqr(t[0], t[0]); */ /* 131: 172f87dd6eb9190e70a52b34cc */\ /* sqr(t[0], t[0]); */ /* 132: 2e5f0fbadd72321ce14a566998 */\ /* sqr(t[0], t[0]); */ /* 133: 5cbe1f75bae46439c294acd330 */\ /* sqr(t[0], t[0]); */ /* 134: b97c3eeb75c8c873852959a660 */\ /* sqr(t[0], t[0]); */ /* 135: 172f87dd6eb9190e70a52b34cc0 */\ sqr_n_mul(t[0], t[0], 6, t[11]); /* 136: 172f87dd6eb9190e70a52b34ceb */\ /* sqr(t[0], t[0]); */ /* 137: 2e5f0fbadd72321ce14a56699d6 */\ /* sqr(t[0], t[0]); */ /* 138: 5cbe1f75bae46439c294acd33ac */\ /* sqr(t[0], t[0]); */ /* 139: b97c3eeb75c8c873852959a6758 */\ /* sqr(t[0], t[0]); */ /* 140: 172f87dd6eb9190e70a52b34ceb0 */\ sqr_n_mul(t[0], t[0], 4, t[10]); /* 141: 172f87dd6eb9190e70a52b34ceb9 */\ /* sqr(t[0], t[0]); */ /* 142: 2e5f0fbadd72321ce14a56699d72 */\ /* sqr(t[0], t[0]); */ /* 143: 5cbe1f75bae46439c294acd33ae4 */\ /* sqr(t[0], t[0]); */ /* 144: b97c3eeb75c8c873852959a675c8 */\ /* sqr(t[0], t[0]); */ /* 145: 172f87dd6eb9190e70a52b34ceb90 */\ /* sqr(t[0], t[0]); */ /* 146: 2e5f0fbadd72321ce14a56699d720 */\ sqr_n_mul(t[0], t[0], 5, t[8]); /* 147: 2e5f0fbadd72321ce14a56699d73f */\ /* sqr(t[0], t[0]); */ /* 148: 5cbe1f75bae46439c294acd33ae7e */\ /* sqr(t[0], t[0]); */ /* 149: b97c3eeb75c8c873852959a675cfc */\ /* sqr(t[0], t[0]); */ /* 150: 172f87dd6eb9190e70a52b34ceb9f8 */\ /* sqr(t[0], t[0]); */ /* 151: 2e5f0fbadd72321ce14a56699d73f0 */\ /* sqr(t[0], t[0]); */ /* 152: 5cbe1f75bae46439c294acd33ae7e0 */\ /* sqr(t[0], t[0]); */ /* 153: b97c3eeb75c8c873852959a675cfc0 */\ /* sqr(t[0], t[0]); */ /* 154: 172f87dd6eb9190e70a52b34ceb9f80 */\ /* sqr(t[0], t[0]); */ /* 155: 2e5f0fbadd72321ce14a56699d73f00 */\ /* sqr(t[0], t[0]); */ /* 156: 5cbe1f75bae46439c294acd33ae7e00 */\ /* sqr(t[0], t[0]); */ /* 157: b97c3eeb75c8c873852959a675cfc00 */\ /* sqr(t[0], t[0]); */ /* 158: 172f87dd6eb9190e70a52b34ceb9f800 */\ /* sqr(t[0], t[0]); */ /* 159: 2e5f0fbadd72321ce14a56699d73f000 */\ /* sqr(t[0], t[0]); */ /* 160: 5cbe1f75bae46439c294acd33ae7e000 */\ /* sqr(t[0], t[0]); */ /* 161: b97c3eeb75c8c873852959a675cfc000 */\ /* sqr(t[0], t[0]); */ /* 162: 172f87dd6eb9190e70a52b34ceb9f8000 */\ sqr_n_mul(t[0], t[0], 15, t[9]); /* 163: 
172f87dd6eb9190e70a52b34ceb9f8011 */\ /* sqr(t[0], t[0]); */ /* 164: 2e5f0fbadd72321ce14a56699d73f0022 */\ /* sqr(t[0], t[0]); */ /* 165: 5cbe1f75bae46439c294acd33ae7e0044 */\ /* sqr(t[0], t[0]); */ /* 166: b97c3eeb75c8c873852959a675cfc0088 */\ /* sqr(t[0], t[0]); */ /* 167: 172f87dd6eb9190e70a52b34ceb9f80110 */\ /* sqr(t[0], t[0]); */ /* 168: 2e5f0fbadd72321ce14a56699d73f00220 */\ /* sqr(t[0], t[0]); */ /* 169: 5cbe1f75bae46439c294acd33ae7e00440 */\ /* sqr(t[0], t[0]); */ /* 170: b97c3eeb75c8c873852959a675cfc00880 */\ /* sqr(t[0], t[0]); */ /* 171: 172f87dd6eb9190e70a52b34ceb9f801100 */\ sqr_n_mul(t[0], t[0], 8, t[3]); /* 172: 172f87dd6eb9190e70a52b34ceb9f80110b */\ /* sqr(t[0], t[0]); */ /* 173: 2e5f0fbadd72321ce14a56699d73f002216 */\ /* sqr(t[0], t[0]); */ /* 174: 5cbe1f75bae46439c294acd33ae7e00442c */\ /* sqr(t[0], t[0]); */ /* 175: b97c3eeb75c8c873852959a675cfc008858 */\ /* sqr(t[0], t[0]); */ /* 176: 172f87dd6eb9190e70a52b34ceb9f80110b0 */\ /* sqr(t[0], t[0]); */ /* 177: 2e5f0fbadd72321ce14a56699d73f0022160 */\ sqr_n_mul(t[0], t[0], 5, t[8]); /* 178: 2e5f0fbadd72321ce14a56699d73f002217f */\ /* sqr(t[0], t[0]); */ /* 179: 5cbe1f75bae46439c294acd33ae7e00442fe */\ /* sqr(t[0], t[0]); */ /* 180: b97c3eeb75c8c873852959a675cfc00885fc */\ /* sqr(t[0], t[0]); */ /* 181: 172f87dd6eb9190e70a52b34ceb9f80110bf8 */\ /* sqr(t[0], t[0]); */ /* 182: 2e5f0fbadd72321ce14a56699d73f002217f0 */\ /* sqr(t[0], t[0]); */ /* 183: 5cbe1f75bae46439c294acd33ae7e00442fe0 */\ /* sqr(t[0], t[0]); */ /* 184: b97c3eeb75c8c873852959a675cfc00885fc0 */\ /* sqr(t[0], t[0]); */ /* 185: 172f87dd6eb9190e70a52b34ceb9f80110bf80 */\ /* sqr(t[0], t[0]); */ /* 186: 2e5f0fbadd72321ce14a56699d73f002217f00 */\ /* sqr(t[0], t[0]); */ /* 187: 5cbe1f75bae46439c294acd33ae7e00442fe00 */\ /* sqr(t[0], t[0]); */ /* 188: b97c3eeb75c8c873852959a675cfc00885fc00 */\ sqr_n_mul(t[0], t[0], 10, t[7]); /* 189: b97c3eeb75c8c873852959a675cfc00885fc39 */\ /* sqr(t[0], t[0]); */ /* 190: 172f87dd6eb9190e70a52b34ceb9f80110bf872 */\ /* sqr(t[0], t[0]); */ /* 191: 2e5f0fbadd72321ce14a56699d73f002217f0e4 */\ /* sqr(t[0], t[0]); */ /* 192: 5cbe1f75bae46439c294acd33ae7e00442fe1c8 */\ /* sqr(t[0], t[0]); */ /* 193: b97c3eeb75c8c873852959a675cfc00885fc390 */\ /* sqr(t[0], t[0]); */ /* 194: 172f87dd6eb9190e70a52b34ceb9f80110bf8720 */\ /* sqr(t[0], t[0]); */ /* 195: 2e5f0fbadd72321ce14a56699d73f002217f0e40 */\ sqr_n_mul(t[0], t[0], 6, t[6]); /* 196: 2e5f0fbadd72321ce14a56699d73f002217f0e67 */\ /* sqr(t[0], t[0]); */ /* 197: 5cbe1f75bae46439c294acd33ae7e00442fe1cce */\ /* sqr(t[0], t[0]); */ /* 198: b97c3eeb75c8c873852959a675cfc00885fc399c */\ /* sqr(t[0], t[0]); */ /* 199: 172f87dd6eb9190e70a52b34ceb9f80110bf87338 */\ /* sqr(t[0], t[0]); */ /* 200: 2e5f0fbadd72321ce14a56699d73f002217f0e670 */\ /* sqr(t[0], t[0]); */ /* 201: 5cbe1f75bae46439c294acd33ae7e00442fe1cce0 */\ sqr_n_mul(t[0], t[0], 5, t[4]); /* 202: 5cbe1f75bae46439c294acd33ae7e00442fe1ccf3 */\ /* sqr(t[0], t[0]); */ /* 203: b97c3eeb75c8c873852959a675cfc00885fc399e6 */\ /* sqr(t[0], t[0]); */ /* 204: 172f87dd6eb9190e70a52b34ceb9f80110bf8733cc */\ /* sqr(t[0], t[0]); */ /* 205: 2e5f0fbadd72321ce14a56699d73f002217f0e6798 */\ /* sqr(t[0], t[0]); */ /* 206: 5cbe1f75bae46439c294acd33ae7e00442fe1ccf30 */\ /* sqr(t[0], t[0]); */ /* 207: b97c3eeb75c8c873852959a675cfc00885fc399e60 */\ /* sqr(t[0], t[0]); */ /* 208: 172f87dd6eb9190e70a52b34ceb9f80110bf8733cc0 */\ /* sqr(t[0], t[0]); */ /* 209: 2e5f0fbadd72321ce14a56699d73f002217f0e67980 */\ /* sqr(t[0], t[0]); */ /* 210: 
5cbe1f75bae46439c294acd33ae7e00442fe1ccf300 */\ sqr_n_mul(t[0], t[0], 8, t[2]); /* 211: 5cbe1f75bae46439c294acd33ae7e00442fe1ccf333 */\ /* sqr(t[0], t[0]); */ /* 212: b97c3eeb75c8c873852959a675cfc00885fc399e666 */\ /* sqr(t[0], t[0]); */ /* 213: 172f87dd6eb9190e70a52b34ceb9f80110bf8733cccc */\ /* sqr(t[0], t[0]); */ /* 214: 2e5f0fbadd72321ce14a56699d73f002217f0e679998 */\ /* sqr(t[0], t[0]); */ /* 215: 5cbe1f75bae46439c294acd33ae7e00442fe1ccf3330 */\ /* sqr(t[0], t[0]); */ /* 216: b97c3eeb75c8c873852959a675cfc00885fc399e6660 */\ /* sqr(t[0], t[0]); */ /* 217: 172f87dd6eb9190e70a52b34ceb9f80110bf8733cccc0 */\ /* sqr(t[0], t[0]); */ /* 218: 2e5f0fbadd72321ce14a56699d73f002217f0e6799980 */\ sqr_n_mul(t[0], t[0], 7, t[5]); /* 219: 2e5f0fbadd72321ce14a56699d73f002217f0e679998f */\ /* sqr(t[0], t[0]); */ /* 220: 5cbe1f75bae46439c294acd33ae7e00442fe1ccf3331e */\ /* sqr(t[0], t[0]); */ /* 221: b97c3eeb75c8c873852959a675cfc00885fc399e6663c */\ /* sqr(t[0], t[0]); */ /* 222: 172f87dd6eb9190e70a52b34ceb9f80110bf8733cccc78 */\ /* sqr(t[0], t[0]); */ /* 223: 2e5f0fbadd72321ce14a56699d73f002217f0e679998f0 */\ /* sqr(t[0], t[0]); */ /* 224: 5cbe1f75bae46439c294acd33ae7e00442fe1ccf3331e0 */\ /* sqr(t[0], t[0]); */ /* 225: b97c3eeb75c8c873852959a675cfc00885fc399e6663c0 */\ /* sqr(t[0], t[0]); */ /* 226: 172f87dd6eb9190e70a52b34ceb9f80110bf8733cccc780 */\ /* sqr(t[0], t[0]); */ /* 227: 2e5f0fbadd72321ce14a56699d73f002217f0e679998f00 */\ /* sqr(t[0], t[0]); */ /* 228: 5cbe1f75bae46439c294acd33ae7e00442fe1ccf3331e00 */\ sqr_n_mul(t[0], t[0], 9, t[2]); /* 229: 5cbe1f75bae46439c294acd33ae7e00442fe1ccf3331e33 */\ /* sqr(t[0], t[0]); */ /* 230: b97c3eeb75c8c873852959a675cfc00885fc399e6663c66 */\ /* sqr(t[0], t[0]); */ /* 231: 172f87dd6eb9190e70a52b34ceb9f80110bf8733cccc78cc */\ /* sqr(t[0], t[0]); */ /* 232: 2e5f0fbadd72321ce14a56699d73f002217f0e679998f198 */\ /* sqr(t[0], t[0]); */ /* 233: 5cbe1f75bae46439c294acd33ae7e00442fe1ccf3331e330 */\ /* sqr(t[0], t[0]); */ /* 234: b97c3eeb75c8c873852959a675cfc00885fc399e6663c660 */\ /* sqr(t[0], t[0]); */ /* 235: 172f87dd6eb9190e70a52b34ceb9f80110bf8733cccc78cc0 */\ /* sqr(t[0], t[0]); */ /* 236: 2e5f0fbadd72321ce14a56699d73f002217f0e679998f1980 */\ sqr_n_mul(t[0], t[0], 7, t[4]); /* 237: 2e5f0fbadd72321ce14a56699d73f002217f0e679998f1993 */\ /* sqr(t[0], t[0]); */ /* 238: 5cbe1f75bae46439c294acd33ae7e00442fe1ccf3331e3326 */\ /* sqr(t[0], t[0]); */ /* 239: b97c3eeb75c8c873852959a675cfc00885fc399e6663c664c */\ /* sqr(t[0], t[0]); */ /* 240: 172f87dd6eb9190e70a52b34ceb9f80110bf8733cccc78cc98 */\ /* sqr(t[0], t[0]); */ /* 241: 2e5f0fbadd72321ce14a56699d73f002217f0e679998f19930 */\ /* sqr(t[0], t[0]); */ /* 242: 5cbe1f75bae46439c294acd33ae7e00442fe1ccf3331e33260 */\ /* sqr(t[0], t[0]); */ /* 243: b97c3eeb75c8c873852959a675cfc00885fc399e6663c664c0 */\ /* sqr(t[0], t[0]); */ /* 244: 172f87dd6eb9190e70a52b34ceb9f80110bf8733cccc78cc980 */\ /* sqr(t[0], t[0]); */ /* 245: 2e5f0fbadd72321ce14a56699d73f002217f0e679998f199300 */\ sqr_n_mul(t[0], t[0], 8, t[2]); /* 246: 2e5f0fbadd72321ce14a56699d73f002217f0e679998f199333 */\ /* sqr(t[0], t[0]); */ /* 247: 5cbe1f75bae46439c294acd33ae7e00442fe1ccf3331e332666 */\ /* sqr(t[0], t[0]); */ /* 248: b97c3eeb75c8c873852959a675cfc00885fc399e6663c664ccc */\ /* sqr(t[0], t[0]); */ /* 249: 172f87dd6eb9190e70a52b34ceb9f80110bf8733cccc78cc9998 */\ /* sqr(t[0], t[0]); */ /* 250: 2e5f0fbadd72321ce14a56699d73f002217f0e679998f1993330 */\ /* sqr(t[0], t[0]); */ /* 251: 5cbe1f75bae46439c294acd33ae7e00442fe1ccf3331e3326660 */\ /* sqr(t[0], t[0]); */ /* 252: 
b97c3eeb75c8c873852959a675cfc00885fc399e6663c664ccc0 */\ /* sqr(t[0], t[0]); */ /* 253: 172f87dd6eb9190e70a52b34ceb9f80110bf8733cccc78cc99980 */\ /* sqr(t[0], t[0]); */ /* 254: 2e5f0fbadd72321ce14a56699d73f002217f0e679998f19933300 */\ sqr_n_mul(t[0], t[0], 8, t[2]); /* 255: 2e5f0fbadd72321ce14a56699d73f002217f0e679998f19933333 */\ /* sqr(t[0], t[0]); */ /* 256: 5cbe1f75bae46439c294acd33ae7e00442fe1ccf3331e33266666 */\ /* sqr(t[0], t[0]); */ /* 257: b97c3eeb75c8c873852959a675cfc00885fc399e6663c664ccccc */\ /* sqr(t[0], t[0]); */ /* 258: 172f87dd6eb9190e70a52b34ceb9f80110bf8733cccc78cc999998 */\ /* sqr(t[0], t[0]); */ /* 259: 2e5f0fbadd72321ce14a56699d73f002217f0e679998f199333330 */\ /* sqr(t[0], t[0]); */ /* 260: 5cbe1f75bae46439c294acd33ae7e00442fe1ccf3331e332666660 */\ /* sqr(t[0], t[0]); */ /* 261: b97c3eeb75c8c873852959a675cfc00885fc399e6663c664ccccc0 */\ /* sqr(t[0], t[0]); */ /* 262: 172f87dd6eb9190e70a52b34ceb9f80110bf8733cccc78cc9999980 */\ /* sqr(t[0], t[0]); */ /* 263: 2e5f0fbadd72321ce14a56699d73f002217f0e679998f1993333300 */\ sqr_n_mul(t[0], t[0], 8, t[2]); /* 264: 2e5f0fbadd72321ce14a56699d73f002217f0e679998f1993333333 */\ /* sqr(t[0], t[0]); */ /* 265: 5cbe1f75bae46439c294acd33ae7e00442fe1ccf3331e3326666666 */\ /* sqr(t[0], t[0]); */ /* 266: b97c3eeb75c8c873852959a675cfc00885fc399e6663c664ccccccc */\ /* sqr(t[0], t[0]); */ /* 267: 172f87dd6eb9190e70a52b34ceb9f80110bf8733cccc78cc99999998 */\ /* sqr(t[0], t[0]); */ /* 268: 2e5f0fbadd72321ce14a56699d73f002217f0e679998f19933333330 */\ /* sqr(t[0], t[0]); */ /* 269: 5cbe1f75bae46439c294acd33ae7e00442fe1ccf3331e33266666660 */\ /* sqr(t[0], t[0]); */ /* 270: b97c3eeb75c8c873852959a675cfc00885fc399e6663c664ccccccc0 */\ sqr_n_mul(t[0], t[0], 6, t[3]); /* 271: b97c3eeb75c8c873852959a675cfc00885fc399e6663c664cccccccb */\ /* sqr(t[0], t[0]); */ /* 272: 172f87dd6eb9190e70a52b34ceb9f80110bf8733cccc78cc999999996 */\ /* sqr(t[0], t[0]); */ /* 273: 2e5f0fbadd72321ce14a56699d73f002217f0e679998f19933333332c */\ /* sqr(t[0], t[0]); */ /* 274: 5cbe1f75bae46439c294acd33ae7e00442fe1ccf3331e332666666658 */\ /* sqr(t[0], t[0]); */ /* 275: b97c3eeb75c8c873852959a675cfc00885fc399e6663c664cccccccb0 */\ /* sqr(t[0], t[0]); */ /* 276: 172f87dd6eb9190e70a52b34ceb9f80110bf8733cccc78cc9999999960 */\ /* sqr(t[0], t[0]); */ /* 277: 2e5f0fbadd72321ce14a56699d73f002217f0e679998f19933333332c0 */\ /* sqr(t[0], t[0]); */ /* 278: 5cbe1f75bae46439c294acd33ae7e00442fe1ccf3331e3326666666580 */\ /* sqr(t[0], t[0]); */ /* 279: b97c3eeb75c8c873852959a675cfc00885fc399e6663c664cccccccb00 */\ sqr_n_mul(t[0], t[0], 8, t[2]); /* 280: b97c3eeb75c8c873852959a675cfc00885fc399e6663c664cccccccb33 */\ /* sqr(t[0], t[0]); */ /* 281: 172f87dd6eb9190e70a52b34ceb9f80110bf8733cccc78cc99999999666 */\ /* sqr(t[0], t[0]); */ /* 282: 2e5f0fbadd72321ce14a56699d73f002217f0e679998f19933333332ccc */\ /* sqr(t[0], t[0]); */ /* 283: 5cbe1f75bae46439c294acd33ae7e00442fe1ccf3331e33266666665998 */\ /* sqr(t[0], t[0]); */ /* 284: b97c3eeb75c8c873852959a675cfc00885fc399e6663c664cccccccb330 */\ /* sqr(t[0], t[0]); */ /* 285: 172f87dd6eb9190e70a52b34ceb9f80110bf8733cccc78cc999999996660 */\ /* sqr(t[0], t[0]); */ /* 286: 2e5f0fbadd72321ce14a56699d73f002217f0e679998f19933333332ccc0 */\ /* sqr(t[0], t[0]); */ /* 287: 5cbe1f75bae46439c294acd33ae7e00442fe1ccf3331e332666666659980 */\ /* sqr(t[0], t[0]); */ /* 288: b97c3eeb75c8c873852959a675cfc00885fc399e6663c664cccccccb3300 */\ sqr_n_mul(t[0], t[0], 8, t[2]); /* 289: b97c3eeb75c8c873852959a675cfc00885fc399e6663c664cccccccb3333 */\ /* sqr(t[0], t[0]); */ /* 290: 
172f87dd6eb9190e70a52b34ceb9f80110bf8733cccc78cc9999999966666 */\ /* sqr(t[0], t[0]); */ /* 291: 2e5f0fbadd72321ce14a56699d73f002217f0e679998f19933333332ccccc */\ /* sqr(t[0], t[0]); */ /* 292: 5cbe1f75bae46439c294acd33ae7e00442fe1ccf3331e3326666666599998 */\ /* sqr(t[0], t[0]); */ /* 293: b97c3eeb75c8c873852959a675cfc00885fc399e6663c664cccccccb33330 */\ /* sqr(t[0], t[0]); */ /* 294: 172f87dd6eb9190e70a52b34ceb9f80110bf8733cccc78cc99999999666660 */\ /* sqr(t[0], t[0]); */ /* 295: 2e5f0fbadd72321ce14a56699d73f002217f0e679998f19933333332ccccc0 */\ /* sqr(t[0], t[0]); */ /* 296: 5cbe1f75bae46439c294acd33ae7e00442fe1ccf3331e33266666665999980 */\ /* sqr(t[0], t[0]); */ /* 297: b97c3eeb75c8c873852959a675cfc00885fc399e6663c664cccccccb333300 */\ sqr_n_mul(t[0], t[0], 8, t[2]); /* 298: b97c3eeb75c8c873852959a675cfc00885fc399e6663c664cccccccb333333 */\ /* sqr(t[0], t[0]); */ /* 299: 172f87dd6eb9190e70a52b34ceb9f80110bf8733cccc78cc999999996666666 */\ /* sqr(t[0], t[0]); */ /* 300: 2e5f0fbadd72321ce14a56699d73f002217f0e679998f19933333332ccccccc */\ /* sqr(t[0], t[0]); */ /* 301: 5cbe1f75bae46439c294acd33ae7e00442fe1ccf3331e332666666659999998 */\ /* sqr(t[0], t[0]); */ /* 302: b97c3eeb75c8c873852959a675cfc00885fc399e6663c664cccccccb3333330 */\ /* sqr(t[0], t[0]); */ /* 303: 172f87dd6eb9190e70a52b34ceb9f80110bf8733cccc78cc9999999966666660 */\ /* sqr(t[0], t[0]); */ /* 304: 2e5f0fbadd72321ce14a56699d73f002217f0e679998f19933333332ccccccc0 */\ sqr_n_mul(out, t[0], 6, t[1]); /* 305: 2e5f0fbadd72321ce14a56699d73f002217f0e679998f19933333332cccccccd */\ } while(0) ================================================ FILE: src/pentaroot.c ================================================ /* * Copyright Supranational LLC * Licensed under the Apache License, Version 2.0, see LICENSE for details. * SPDX-License-Identifier: Apache-2.0 */ #include "fields.h" static inline void mul_fr(vec256 ret, const vec256 a, const vec256 b) { mul_mont_sparse_256(ret, a, b, BLS12_381_r, r0); } static inline void sqr_fr(vec256 ret, const vec256 a) { sqr_mont_sparse_256(ret, a, BLS12_381_r, r0); } #ifdef __OPTIMIZE_SIZE__ void blst_fr_pentaroot(vec256 out, const vec256 inp) { static const byte pow[] = { TO_BYTES(0x33333332cccccccd), TO_BYTES(0x217f0e679998f199), TO_BYTES(0xe14a56699d73f002), TO_BYTES(0x2e5f0fbadd72321c) }; size_t pow_bits = 254; vec256 ret; vec_copy(ret, inp, sizeof(ret)); /* ret = inp^1 */ --pow_bits; /* most significant bit is set, skip over */ while (pow_bits--) { sqr_fr(ret, ret); if (is_bit_set(pow, pow_bits)) mul_fr(ret, ret, inp); } vec_copy(out, ret, sizeof(ret)); /* out = ret */ } #else # if 0 /* * "255"-bit variant omits full reductions at the ends of squarings, * not implemented yet[?]. 
 */
static inline void sqr_n_mul_fr(vec256 out, const vec256 a, size_t count,
                                const vec256 b)
{   sqr_n_mul_mont_255(out, a, count, BLS12_381_r, r0, b);   }
# else
static void sqr_n_mul_fr(vec256 out, const vec256 a, size_t count,
                         const vec256 b)
{
    do {
        sqr_fr(out, a);
        a = out;
    } while (--count);
    mul_fr(out, out, b);
}
# endif

# define sqr(ret,a)             sqr_fr(ret,a)
# define mul(ret,a,b)           mul_fr(ret,a,b)
# define sqr_n_mul(ret,a,n,b)   sqr_n_mul_fr(ret,a,n,b)

# include "pentaroot-addchain.h"
void blst_fr_pentaroot(vec256 out, const vec256 inp)
{   PENTAROOT_MOD_BLS12_381_r(out, inp, vec256);   }
# undef PENTAROOT_MOD_BLS12_381_r
# undef sqr_n_mul
# undef sqr
# undef mul
#endif

void blst_fr_pentapow(vec256 out, const vec256 inp)
{
    vec256 tmp;

    sqr_fr(tmp, inp);
    sqr_fr(tmp, tmp);
    mul_fr(out, tmp, inp);
}


================================================
FILE: src/point.h
================================================
/*
 * Copyright Supranational LLC
 * Licensed under the Apache License, Version 2.0, see LICENSE for details.
 * SPDX-License-Identifier: Apache-2.0
 */
#ifndef __BLS12_381_ASM_POINT_H__
#define __BLS12_381_ASM_POINT_H__

#include "vect.h"
#include "bytes.h"

#define DECLARE_POINT(ptype, bits) \
typedef struct { vec##bits X,Y,Z; } ptype; \
typedef struct { vec##bits X,Y; } ptype##_affine; \
\
static void ptype##_dadd(ptype *out, const ptype *p1, const ptype *p2, \
                         const vec##bits a4); \
static void ptype##_dadd_affine(ptype *out, const ptype *p1, \
                                const ptype##_affine *p2); \
static void ptype##_add(ptype *out, const ptype *p1, const ptype *p2); \
static void ptype##_add_affine(ptype *out, const ptype *p1, \
                               const ptype##_affine *p2); \
static void ptype##_double(ptype *out, const ptype *p1); \
static void ptype##_mult_w5(ptype *out, const ptype *point, \
                            const byte *scalar, size_t nbits); \
static void ptype##_cneg(ptype *p, limb_t cbit); \
static void ptype##_to_affine(ptype##_affine *out, const ptype *in); \
static void ptype##_from_Jacobian(ptype *out, const ptype *in); \
\
static inline void ptype##_cswap(ptype *restrict a, \
                                 ptype *restrict b, bool_t cbit) { \
    vec_cswap(a, b, sizeof(ptype), cbit); \
} \
static inline void ptype##_ccopy(ptype *restrict a, \
                                 const ptype *restrict b, bool_t cbit) {\
    vec_select(a, b, a, sizeof(ptype), cbit); \
}

#define DECLARE_PRIVATE_POINTXZ(ptype, bits) \
typedef struct { vec##bits X,Z; } ptype##xz; \
\
static void ptype##xz_ladder_pre(ptype##xz *out, const ptype *in); \
static void ptype##xz_ladder_step(ptype##xz *r, ptype##xz *s, \
                                  const ptype##xz *p); \
static void ptype##xz_ladder_post(ptype *ret, \
                                  const ptype##xz *r, const ptype##xz *s, \
                                  const ptype##xz *p, const vec##bits Y1);\
\
static inline void ptype##xz_cswap(ptype##xz *restrict a, \
                                   ptype##xz *restrict b, bool_t cbit) {\
    vec_cswap(a, b, sizeof(ptype##xz), cbit); \
}

DECLARE_POINT(POINTonE1, 384)

DECLARE_POINT(POINTonE2, 384x)

#ifdef __GNUC__
# pragma GCC diagnostic ignored "-Wunused-function"
#endif

#endif


================================================
FILE: src/rb_tree.c
================================================
/*
 * Copyright Supranational LLC
 * Licensed under the Apache License, Version 2.0, see LICENSE for details.
 * SPDX-License-Identifier: Apache-2.0
 */

#include <stddef.h>

/*
 * Red-black tree tailored for uniqueness test. Amount of messages to be
 * checked is known prior to context initialization, implementation is
 * insert-only, failure is returned if message is already in the tree.
 */

struct node {
    struct node *leafs[2];
    const void *data;
    size_t len_n_colour;    /* len<<1 | colour */
};

struct rb_tree {
    struct node *root;
    size_t n_nodes;
    struct node nodes[1];
};

static long bytes_compare(const unsigned char *ptr0, size_t len0,
                          const unsigned char *ptr1, size_t len1)
{
    size_t i, len = len0<len1 ? len0 : len1;
    long a, b;

    for (i=0; i<len; i++) {
        if ((a = ptr0[i]) != (b = ptr1[i]))
            return a - b;
    }

    return (long)len0 - (long)len1;
}

#define PAINT_BLACK(p)  ((p)->len_n_colour &= ~(size_t)1)
#define PAINT_RED(p)    ((p)->len_n_colour |= 1)
#define IS_RED(p)       ((p)->len_n_colour & 1)

static int rb_tree_insert(struct rb_tree *tree, const void *data, size_t len)
{
    struct node *nodes[8*sizeof(void *)];   /* visited nodes    */
    unsigned char dirs[8*sizeof(void *)];   /* taken directions */
    size_t k = 0;                           /* walked distance  */
    struct node *p, *y, *z;

    for (p = tree->root; p != NULL; k++) {
        long cmp = bytes_compare(data, len, p->data, p->len_n_colour>>1);

        if (cmp == 0)
            return 0;   /* already in tree, no insertion */

        /* record the step */
        nodes[k] = p;
        p = p->leafs[(dirs[k] = cmp>0)];
    }

    /* allocate new node */
    z = &tree->nodes[tree->n_nodes++];
    z->leafs[0] = z->leafs[1] = NULL;
    z->data = data;
    z->len_n_colour = len<<1;
    PAINT_RED(z);

    /* graft |z| */
    if (k > 0)
        nodes[k-1]->leafs[dirs[k-1]] = z;
    else
        tree->root = z;

    /* re-balance |tree| */
    while (k >= 2 && IS_RED(y = nodes[k-1])) {
        size_t ydir = dirs[k-2];
        struct node *x = nodes[k-2],        /* |z|'s grandparent */
                    *s = x->leafs[ydir^1];  /* |z|'s uncle       */

        if (s != NULL && IS_RED(s)) {
            PAINT_RED(x);
            PAINT_BLACK(y);
            PAINT_BLACK(s);
            k -= 2;
        } else {
            if (dirs[k-1] != ydir) {
                /*    |            |
                 *    x            x
                 *   / \            \
                 *  y   s    ->      z   s
                 *   \              /
                 *    z            y
                 *   / \
                 *  ?   ?
                 */
                struct node *t = y;
                y = y->leafs[ydir^1];
                t->leafs[ydir^1] = y->leafs[ydir];
                y->leafs[ydir] = t;
            }

            /*    |            |
             *    x            y
             *     \          / \
             *      y   s ->  z   x
             *     / \           / \
             *    z   ?         ?   s
             */
            x->leafs[ydir] = y->leafs[ydir^1];
            y->leafs[ydir^1] = x;

            PAINT_RED(x);
            PAINT_BLACK(y);

            if (k > 2)
                nodes[k-3]->leafs[dirs[k-3]] = y;
            else
                tree->root = y;

            break;
        }
    }

    PAINT_BLACK(tree->root);

    return 1;
}
#undef IS_RED
#undef PAINT_RED
#undef PAINT_BLACK

size_t blst_uniq_sizeof(size_t n_nodes)
{   return sizeof(struct rb_tree) + sizeof(struct node)*(n_nodes-1);   }

void blst_uniq_init(struct rb_tree *tree)
{
    tree->root = NULL;
    tree->n_nodes = 0;
}

int blst_uniq_test(struct rb_tree *tree, const void *data, size_t len)
{   return (int)rb_tree_insert(tree, data, len);   }
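/*
 * A minimal usage sketch (hypothetical, not part of the library): the
 * caller sizes the context for the total number of messages up front,
 * then feeds each message once; a zero return flags a duplicate.
 */
#if 0
#include <stdlib.h>

static int all_unique(const unsigned char *const msgs[],
                      const size_t lens[], size_t n)
{
    struct rb_tree *tree = malloc(blst_uniq_sizeof(n));
    size_t i;
    int ret = tree != NULL;

    if (ret) {
        blst_uniq_init(tree);
        for (i = 0; i < n && ret; i++)
            ret = blst_uniq_test(tree, msgs[i], lens[i]);
        free(tree);
    }

    return ret;
}
#endif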
================================================
FILE: src/recip-addchain.h
================================================
/*
 * Copyright Supranational LLC
 * Licensed under the Apache License, Version 2.0, see LICENSE for details.
 * SPDX-License-Identifier: Apache-2.0
 */
/*
 * The "magic" number is BLS12_381_P-2. Exponentiation to which yields
 * reciprocal to input base.
* * Generated with 'addchain 4002409555221667393417789825735904156556882819939007885332058136124031650490837864442687629129015664037894272559785' * https://github.com/kwantam/addchain * * # Bos-Coster (win=4) : 461 (16) <<< * # Bos-Coster (win=3) : 464 ( 9) * # Bos-Coster (win=8) : 469 (35) * # Bos-Coster (win=5) : 463 (28) * # Bos-Coster (win=9) : 467 (32) * # Bos-Coster (win=7) : 462 (27) * # Yacobi : 481 (31) * # Bos-Coster (win=10) : 475 (30) * # Bos-Coster (win=6) : 463 (32) * # Bos-Coster (win=2) : 489 ( 5) * # Bergeron-Berstel-Brlek-Duboc : 498 ( 5) */ #define RECIPROCAL_MOD_BLS12_381_P(out, inp, ptype) do { \ ptype t[16]; \ vec_copy(t[1], inp, sizeof(ptype)); /* 0: 1 */\ sqr(t[0], t[1]); /* 1: 2 */\ mul(t[9], t[0], t[1]); /* 2: 3 */\ sqr(t[5], t[0]); /* 3: 4 */\ mul(t[2], t[9], t[0]); /* 4: 5 */\ mul(t[7], t[5], t[9]); /* 5: 7 */\ mul(t[10], t[2], t[5]); /* 6: 9 */\ mul(t[13], t[7], t[5]); /* 7: b */\ mul(t[4], t[10], t[5]); /* 8: d */\ mul(t[8], t[13], t[5]); /* 9: f */\ mul(t[15], t[4], t[5]); /* 10: 11 */\ mul(t[11], t[8], t[5]); /* 11: 13 */\ mul(t[3], t[15], t[5]); /* 12: 15 */\ mul(t[12], t[11], t[5]); /* 13: 17 */\ sqr(t[0], t[4]); /* 14: 1a */\ mul(t[14], t[12], t[5]); /* 15: 1b */\ mul(t[6], t[0], t[9]); /* 16: 1d */\ mul(t[5], t[0], t[2]); /* 17: 1f */\ /* sqr(t[0], t[0]); */ /* 18: 34 */\ /* sqr(t[0], t[0]); */ /* 19: 68 */\ /* sqr(t[0], t[0]); */ /* 20: d0 */\ /* sqr(t[0], t[0]); */ /* 21: 1a0 */\ /* sqr(t[0], t[0]); */ /* 22: 340 */\ /* sqr(t[0], t[0]); */ /* 23: 680 */\ /* sqr(t[0], t[0]); */ /* 24: d00 */\ /* sqr(t[0], t[0]); */ /* 25: 1a00 */\ /* sqr(t[0], t[0]); */ /* 26: 3400 */\ /* sqr(t[0], t[0]); */ /* 27: 6800 */\ /* sqr(t[0], t[0]); */ /* 28: d000 */\ /* sqr(t[0], t[0]); */ /* 29: 1a000 */\ sqr_n_mul(t[0], t[0], 12, t[15]); /* 30: 1a011 */\ /* sqr(t[0], t[0]); */ /* 31: 34022 */\ /* sqr(t[0], t[0]); */ /* 32: 68044 */\ /* sqr(t[0], t[0]); */ /* 33: d0088 */\ /* sqr(t[0], t[0]); */ /* 34: 1a0110 */\ /* sqr(t[0], t[0]); */ /* 35: 340220 */\ /* sqr(t[0], t[0]); */ /* 36: 680440 */\ /* sqr(t[0], t[0]); */ /* 37: d00880 */\ sqr_n_mul(t[0], t[0], 7, t[8]); /* 38: d0088f */\ /* sqr(t[0], t[0]); */ /* 39: 1a0111e */\ /* sqr(t[0], t[0]); */ /* 40: 340223c */\ /* sqr(t[0], t[0]); */ /* 41: 6804478 */\ /* sqr(t[0], t[0]); */ /* 42: d0088f0 */\ sqr_n_mul(t[0], t[0], 4, t[2]); /* 43: d0088f5 */\ /* sqr(t[0], t[0]); */ /* 44: 1a0111ea */\ /* sqr(t[0], t[0]); */ /* 45: 340223d4 */\ /* sqr(t[0], t[0]); */ /* 46: 680447a8 */\ /* sqr(t[0], t[0]); */ /* 47: d0088f50 */\ /* sqr(t[0], t[0]); */ /* 48: 1a0111ea0 */\ /* sqr(t[0], t[0]); */ /* 49: 340223d40 */\ sqr_n_mul(t[0], t[0], 6, t[7]); /* 50: 340223d47 */\ /* sqr(t[0], t[0]); */ /* 51: 680447a8e */\ /* sqr(t[0], t[0]); */ /* 52: d0088f51c */\ /* sqr(t[0], t[0]); */ /* 53: 1a0111ea38 */\ /* sqr(t[0], t[0]); */ /* 54: 340223d470 */\ /* sqr(t[0], t[0]); */ /* 55: 680447a8e0 */\ /* sqr(t[0], t[0]); */ /* 56: d0088f51c0 */\ /* sqr(t[0], t[0]); */ /* 57: 1a0111ea380 */\ sqr_n_mul(t[0], t[0], 7, t[12]); /* 58: 1a0111ea397 */\ /* sqr(t[0], t[0]); */ /* 59: 340223d472e */\ /* sqr(t[0], t[0]); */ /* 60: 680447a8e5c */\ /* sqr(t[0], t[0]); */ /* 61: d0088f51cb8 */\ /* sqr(t[0], t[0]); */ /* 62: 1a0111ea3970 */\ /* sqr(t[0], t[0]); */ /* 63: 340223d472e0 */\ sqr_n_mul(t[0], t[0], 5, t[5]); /* 64: 340223d472ff */\ /* sqr(t[0], t[0]); */ /* 65: 680447a8e5fe */\ /* sqr(t[0], t[0]); */ /* 66: d0088f51cbfc */\ sqr_n_mul(t[0], t[0], 2, t[9]); /* 67: d0088f51cbff */\ /* sqr(t[0], t[0]); */ /* 68: 1a0111ea397fe */\ /* sqr(t[0], t[0]); */ /* 69: 
340223d472ffc */\ /* sqr(t[0], t[0]); */ /* 70: 680447a8e5ff8 */\ /* sqr(t[0], t[0]); */ /* 71: d0088f51cbff0 */\ /* sqr(t[0], t[0]); */ /* 72: 1a0111ea397fe0 */\ /* sqr(t[0], t[0]); */ /* 73: 340223d472ffc0 */\ sqr_n_mul(t[0], t[0], 6, t[4]); /* 74: 340223d472ffcd */\ /* sqr(t[0], t[0]); */ /* 75: 680447a8e5ff9a */\ /* sqr(t[0], t[0]); */ /* 76: d0088f51cbff34 */\ /* sqr(t[0], t[0]); */ /* 77: 1a0111ea397fe68 */\ /* sqr(t[0], t[0]); */ /* 78: 340223d472ffcd0 */\ /* sqr(t[0], t[0]); */ /* 79: 680447a8e5ff9a0 */\ /* sqr(t[0], t[0]); */ /* 80: d0088f51cbff340 */\ sqr_n_mul(t[0], t[0], 6, t[4]); /* 81: d0088f51cbff34d */\ /* sqr(t[0], t[0]); */ /* 82: 1a0111ea397fe69a */\ /* sqr(t[0], t[0]); */ /* 83: 340223d472ffcd34 */\ /* sqr(t[0], t[0]); */ /* 84: 680447a8e5ff9a68 */\ /* sqr(t[0], t[0]); */ /* 85: d0088f51cbff34d0 */\ /* sqr(t[0], t[0]); */ /* 86: 1a0111ea397fe69a0 */\ /* sqr(t[0], t[0]); */ /* 87: 340223d472ffcd340 */\ sqr_n_mul(t[0], t[0], 6, t[10]); /* 88: 340223d472ffcd349 */\ /* sqr(t[0], t[0]); */ /* 89: 680447a8e5ff9a692 */\ /* sqr(t[0], t[0]); */ /* 90: d0088f51cbff34d24 */\ /* sqr(t[0], t[0]); */ /* 91: 1a0111ea397fe69a48 */\ sqr_n_mul(t[0], t[0], 3, t[9]); /* 92: 1a0111ea397fe69a4b */\ /* sqr(t[0], t[0]); */ /* 93: 340223d472ffcd3496 */\ /* sqr(t[0], t[0]); */ /* 94: 680447a8e5ff9a692c */\ /* sqr(t[0], t[0]); */ /* 95: d0088f51cbff34d258 */\ /* sqr(t[0], t[0]); */ /* 96: 1a0111ea397fe69a4b0 */\ /* sqr(t[0], t[0]); */ /* 97: 340223d472ffcd34960 */\ /* sqr(t[0], t[0]); */ /* 98: 680447a8e5ff9a692c0 */\ /* sqr(t[0], t[0]); */ /* 99: d0088f51cbff34d2580 */\ sqr_n_mul(t[0], t[0], 7, t[4]); /* 100: d0088f51cbff34d258d */\ /* sqr(t[0], t[0]); */ /* 101: 1a0111ea397fe69a4b1a */\ /* sqr(t[0], t[0]); */ /* 102: 340223d472ffcd349634 */\ /* sqr(t[0], t[0]); */ /* 103: 680447a8e5ff9a692c68 */\ /* sqr(t[0], t[0]); */ /* 104: d0088f51cbff34d258d0 */\ sqr_n_mul(t[0], t[0], 4, t[4]); /* 105: d0088f51cbff34d258dd */\ /* sqr(t[0], t[0]); */ /* 106: 1a0111ea397fe69a4b1ba */\ /* sqr(t[0], t[0]); */ /* 107: 340223d472ffcd3496374 */\ /* sqr(t[0], t[0]); */ /* 108: 680447a8e5ff9a692c6e8 */\ /* sqr(t[0], t[0]); */ /* 109: d0088f51cbff34d258dd0 */\ /* sqr(t[0], t[0]); */ /* 110: 1a0111ea397fe69a4b1ba0 */\ /* sqr(t[0], t[0]); */ /* 111: 340223d472ffcd34963740 */\ sqr_n_mul(t[0], t[0], 6, t[8]); /* 112: 340223d472ffcd3496374f */\ /* sqr(t[0], t[0]); */ /* 113: 680447a8e5ff9a692c6e9e */\ /* sqr(t[0], t[0]); */ /* 114: d0088f51cbff34d258dd3c */\ /* sqr(t[0], t[0]); */ /* 115: 1a0111ea397fe69a4b1ba78 */\ /* sqr(t[0], t[0]); */ /* 116: 340223d472ffcd3496374f0 */\ /* sqr(t[0], t[0]); */ /* 117: 680447a8e5ff9a692c6e9e0 */\ /* sqr(t[0], t[0]); */ /* 118: d0088f51cbff34d258dd3c0 */\ sqr_n_mul(t[0], t[0], 6, t[14]); /* 119: d0088f51cbff34d258dd3db */\ /* sqr(t[0], t[0]); */ /* 120: 1a0111ea397fe69a4b1ba7b6 */\ /* sqr(t[0], t[0]); */ /* 121: 340223d472ffcd3496374f6c */\ /* sqr(t[0], t[0]); */ /* 122: 680447a8e5ff9a692c6e9ed8 */\ sqr_n_mul(t[0], t[0], 3, t[1]); /* 123: 680447a8e5ff9a692c6e9ed9 */\ /* sqr(t[0], t[0]); */ /* 124: d0088f51cbff34d258dd3db2 */\ /* sqr(t[0], t[0]); */ /* 125: 1a0111ea397fe69a4b1ba7b64 */\ /* sqr(t[0], t[0]); */ /* 126: 340223d472ffcd3496374f6c8 */\ /* sqr(t[0], t[0]); */ /* 127: 680447a8e5ff9a692c6e9ed90 */\ /* sqr(t[0], t[0]); */ /* 128: d0088f51cbff34d258dd3db20 */\ /* sqr(t[0], t[0]); */ /* 129: 1a0111ea397fe69a4b1ba7b640 */\ /* sqr(t[0], t[0]); */ /* 130: 340223d472ffcd3496374f6c80 */\ /* sqr(t[0], t[0]); */ /* 131: 680447a8e5ff9a692c6e9ed900 */\ sqr_n_mul(t[0], t[0], 8, t[4]); /* 132: 
680447a8e5ff9a692c6e9ed90d */\ /* sqr(t[0], t[0]); */ /* 133: d0088f51cbff34d258dd3db21a */\ /* sqr(t[0], t[0]); */ /* 134: 1a0111ea397fe69a4b1ba7b6434 */\ /* sqr(t[0], t[0]); */ /* 135: 340223d472ffcd3496374f6c868 */\ /* sqr(t[0], t[0]); */ /* 136: 680447a8e5ff9a692c6e9ed90d0 */\ /* sqr(t[0], t[0]); */ /* 137: d0088f51cbff34d258dd3db21a0 */\ /* sqr(t[0], t[0]); */ /* 138: 1a0111ea397fe69a4b1ba7b64340 */\ /* sqr(t[0], t[0]); */ /* 139: 340223d472ffcd3496374f6c8680 */\ sqr_n_mul(t[0], t[0], 7, t[12]); /* 140: 340223d472ffcd3496374f6c8697 */\ /* sqr(t[0], t[0]); */ /* 141: 680447a8e5ff9a692c6e9ed90d2e */\ /* sqr(t[0], t[0]); */ /* 142: d0088f51cbff34d258dd3db21a5c */\ /* sqr(t[0], t[0]); */ /* 143: 1a0111ea397fe69a4b1ba7b6434b8 */\ /* sqr(t[0], t[0]); */ /* 144: 340223d472ffcd3496374f6c86970 */\ /* sqr(t[0], t[0]); */ /* 145: 680447a8e5ff9a692c6e9ed90d2e0 */\ sqr_n_mul(t[0], t[0], 5, t[13]); /* 146: 680447a8e5ff9a692c6e9ed90d2eb */\ /* sqr(t[0], t[0]); */ /* 147: d0088f51cbff34d258dd3db21a5d6 */\ /* sqr(t[0], t[0]); */ /* 148: 1a0111ea397fe69a4b1ba7b6434bac */\ /* sqr(t[0], t[0]); */ /* 149: 340223d472ffcd3496374f6c869758 */\ /* sqr(t[0], t[0]); */ /* 150: 680447a8e5ff9a692c6e9ed90d2eb0 */\ /* sqr(t[0], t[0]); */ /* 151: d0088f51cbff34d258dd3db21a5d60 */\ /* sqr(t[0], t[0]); */ /* 152: 1a0111ea397fe69a4b1ba7b6434bac0 */\ sqr_n_mul(t[0], t[0], 6, t[4]); /* 153: 1a0111ea397fe69a4b1ba7b6434bacd */\ /* sqr(t[0], t[0]); */ /* 154: 340223d472ffcd3496374f6c869759a */\ /* sqr(t[0], t[0]); */ /* 155: 680447a8e5ff9a692c6e9ed90d2eb34 */\ /* sqr(t[0], t[0]); */ /* 156: d0088f51cbff34d258dd3db21a5d668 */\ /* sqr(t[0], t[0]); */ /* 157: 1a0111ea397fe69a4b1ba7b6434bacd0 */\ /* sqr(t[0], t[0]); */ /* 158: 340223d472ffcd3496374f6c869759a0 */\ /* sqr(t[0], t[0]); */ /* 159: 680447a8e5ff9a692c6e9ed90d2eb340 */\ sqr_n_mul(t[0], t[0], 6, t[6]); /* 160: 680447a8e5ff9a692c6e9ed90d2eb35d */\ /* sqr(t[0], t[0]); */ /* 161: d0088f51cbff34d258dd3db21a5d66ba */\ /* sqr(t[0], t[0]); */ /* 162: 1a0111ea397fe69a4b1ba7b6434bacd74 */\ /* sqr(t[0], t[0]); */ /* 163: 340223d472ffcd3496374f6c869759ae8 */\ /* sqr(t[0], t[0]); */ /* 164: 680447a8e5ff9a692c6e9ed90d2eb35d0 */\ sqr_n_mul(t[0], t[0], 4, t[10]); /* 165: 680447a8e5ff9a692c6e9ed90d2eb35d9 */\ /* sqr(t[0], t[0]); */ /* 166: d0088f51cbff34d258dd3db21a5d66bb2 */\ /* sqr(t[0], t[0]); */ /* 167: 1a0111ea397fe69a4b1ba7b6434bacd764 */\ /* sqr(t[0], t[0]); */ /* 168: 340223d472ffcd3496374f6c869759aec8 */\ /* sqr(t[0], t[0]); */ /* 169: 680447a8e5ff9a692c6e9ed90d2eb35d90 */\ /* sqr(t[0], t[0]); */ /* 170: d0088f51cbff34d258dd3db21a5d66bb20 */\ /* sqr(t[0], t[0]); */ /* 171: 1a0111ea397fe69a4b1ba7b6434bacd7640 */\ /* sqr(t[0], t[0]); */ /* 172: 340223d472ffcd3496374f6c869759aec80 */\ /* sqr(t[0], t[0]); */ /* 173: 680447a8e5ff9a692c6e9ed90d2eb35d900 */\ sqr_n_mul(t[0], t[0], 8, t[6]); /* 174: 680447a8e5ff9a692c6e9ed90d2eb35d91d */\ /* sqr(t[0], t[0]); */ /* 175: d0088f51cbff34d258dd3db21a5d66bb23a */\ /* sqr(t[0], t[0]); */ /* 176: 1a0111ea397fe69a4b1ba7b6434bacd76474 */\ /* sqr(t[0], t[0]); */ /* 177: 340223d472ffcd3496374f6c869759aec8e8 */\ /* sqr(t[0], t[0]); */ /* 178: 680447a8e5ff9a692c6e9ed90d2eb35d91d0 */\ sqr_n_mul(t[0], t[0], 4, t[4]); /* 179: 680447a8e5ff9a692c6e9ed90d2eb35d91dd */\ /* sqr(t[0], t[0]); */ /* 180: d0088f51cbff34d258dd3db21a5d66bb23ba */\ /* sqr(t[0], t[0]); */ /* 181: 1a0111ea397fe69a4b1ba7b6434bacd764774 */\ /* sqr(t[0], t[0]); */ /* 182: 340223d472ffcd3496374f6c869759aec8ee8 */\ /* sqr(t[0], t[0]); */ /* 183: 680447a8e5ff9a692c6e9ed90d2eb35d91dd0 */\ 
/* sqr(t[0], t[0]); */ /* 184: d0088f51cbff34d258dd3db21a5d66bb23ba0 */\ /* sqr(t[0], t[0]); */ /* 185: 1a0111ea397fe69a4b1ba7b6434bacd7647740 */\ /* sqr(t[0], t[0]); */ /* 186: 340223d472ffcd3496374f6c869759aec8ee80 */\ sqr_n_mul(t[0], t[0], 7, t[12]); /* 187: 340223d472ffcd3496374f6c869759aec8ee97 */\ /* sqr(t[0], t[0]); */ /* 188: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e */\ /* sqr(t[0], t[0]); */ /* 189: d0088f51cbff34d258dd3db21a5d66bb23ba5c */\ /* sqr(t[0], t[0]); */ /* 190: 1a0111ea397fe69a4b1ba7b6434bacd764774b8 */\ /* sqr(t[0], t[0]); */ /* 191: 340223d472ffcd3496374f6c869759aec8ee970 */\ /* sqr(t[0], t[0]); */ /* 192: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e0 */\ /* sqr(t[0], t[0]); */ /* 193: d0088f51cbff34d258dd3db21a5d66bb23ba5c0 */\ /* sqr(t[0], t[0]); */ /* 194: 1a0111ea397fe69a4b1ba7b6434bacd764774b80 */\ /* sqr(t[0], t[0]); */ /* 195: 340223d472ffcd3496374f6c869759aec8ee9700 */\ /* sqr(t[0], t[0]); */ /* 196: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e00 */\ sqr_n_mul(t[0], t[0], 9, t[11]); /* 197: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13 */\ /* sqr(t[0], t[0]); */ /* 198: d0088f51cbff34d258dd3db21a5d66bb23ba5c26 */\ /* sqr(t[0], t[0]); */ /* 199: 1a0111ea397fe69a4b1ba7b6434bacd764774b84c */\ sqr_n_mul(t[0], t[0], 2, t[9]); /* 200: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f */\ /* sqr(t[0], t[0]); */ /* 201: 340223d472ffcd3496374f6c869759aec8ee9709e */\ /* sqr(t[0], t[0]); */ /* 202: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13c */\ /* sqr(t[0], t[0]); */ /* 203: d0088f51cbff34d258dd3db21a5d66bb23ba5c278 */\ /* sqr(t[0], t[0]); */ /* 204: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f0 */\ /* sqr(t[0], t[0]); */ /* 205: 340223d472ffcd3496374f6c869759aec8ee9709e0 */\ sqr_n_mul(t[0], t[0], 5, t[7]); /* 206: 340223d472ffcd3496374f6c869759aec8ee9709e7 */\ /* sqr(t[0], t[0]); */ /* 207: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce */\ /* sqr(t[0], t[0]); */ /* 208: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c */\ /* sqr(t[0], t[0]); */ /* 209: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38 */\ /* sqr(t[0], t[0]); */ /* 210: 340223d472ffcd3496374f6c869759aec8ee9709e70 */\ /* sqr(t[0], t[0]); */ /* 211: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce0 */\ /* sqr(t[0], t[0]); */ /* 212: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c0 */\ /* sqr(t[0], t[0]); */ /* 213: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f380 */\ sqr_n_mul(t[0], t[0], 7, t[2]); /* 214: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f385 */\ /* sqr(t[0], t[0]); */ /* 215: 340223d472ffcd3496374f6c869759aec8ee9709e70a */\ /* sqr(t[0], t[0]); */ /* 216: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce14 */\ /* sqr(t[0], t[0]); */ /* 217: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c28 */\ /* sqr(t[0], t[0]); */ /* 218: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f3850 */\ /* sqr(t[0], t[0]); */ /* 219: 340223d472ffcd3496374f6c869759aec8ee9709e70a0 */\ /* sqr(t[0], t[0]); */ /* 220: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce140 */\ /* sqr(t[0], t[0]); */ /* 221: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c280 */\ sqr_n_mul(t[0], t[0], 7, t[10]); /* 222: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c289 */\ /* sqr(t[0], t[0]); */ /* 223: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512 */\ /* sqr(t[0], t[0]); */ /* 224: 340223d472ffcd3496374f6c869759aec8ee9709e70a24 */\ /* sqr(t[0], t[0]); */ /* 225: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce1448 */\ /* sqr(t[0], t[0]); */ /* 226: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2890 */\ /* sqr(t[0], t[0]); */ /* 227: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f385120 */\ /* sqr(t[0], t[0]); */ /* 228: 
340223d472ffcd3496374f6c869759aec8ee9709e70a240 */\ sqr_n_mul(t[0], t[0], 6, t[12]); /* 229: 340223d472ffcd3496374f6c869759aec8ee9709e70a257 */\ /* sqr(t[0], t[0]); */ /* 230: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144ae */\ /* sqr(t[0], t[0]); */ /* 231: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895c */\ /* sqr(t[0], t[0]); */ /* 232: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512b8 */\ /* sqr(t[0], t[0]); */ /* 233: 340223d472ffcd3496374f6c869759aec8ee9709e70a2570 */\ /* sqr(t[0], t[0]); */ /* 234: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144ae0 */\ sqr_n_mul(t[0], t[0], 5, t[6]); /* 235: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd */\ /* sqr(t[0], t[0]); */ /* 236: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fa */\ /* sqr(t[0], t[0]); */ /* 237: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf4 */\ /* sqr(t[0], t[0]); */ /* 238: 340223d472ffcd3496374f6c869759aec8ee9709e70a257e8 */\ /* sqr(t[0], t[0]); */ /* 239: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd0 */\ /* sqr(t[0], t[0]); */ /* 240: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fa0 */\ sqr_n_mul(t[0], t[0], 5, t[11]); /* 241: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb3 */\ /* sqr(t[0], t[0]); */ /* 242: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf66 */\ /* sqr(t[0], t[0]); */ /* 243: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ecc */\ /* sqr(t[0], t[0]); */ /* 244: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd98 */\ /* sqr(t[0], t[0]); */ /* 245: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb30 */\ /* sqr(t[0], t[0]); */ /* 246: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf660 */\ sqr_n_mul(t[0], t[0], 5, t[11]); /* 247: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf673 */\ /* sqr(t[0], t[0]); */ /* 248: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece6 */\ /* sqr(t[0], t[0]); */ /* 249: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc */\ /* sqr(t[0], t[0]); */ /* 250: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb398 */\ /* sqr(t[0], t[0]); */ /* 251: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730 */\ /* sqr(t[0], t[0]); */ /* 252: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece60 */\ /* sqr(t[0], t[0]); */ /* 253: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc0 */\ /* sqr(t[0], t[0]); */ /* 254: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb3980 */\ /* sqr(t[0], t[0]); */ /* 255: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf67300 */\ sqr_n_mul(t[0], t[0], 8, t[4]); /* 256: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d */\ /* sqr(t[0], t[0]); */ /* 257: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a */\ /* sqr(t[0], t[0]); */ /* 258: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34 */\ /* sqr(t[0], t[0]); */ /* 259: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39868 */\ /* sqr(t[0], t[0]); */ /* 260: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d0 */\ /* sqr(t[0], t[0]); */ /* 261: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a0 */\ /* sqr(t[0], t[0]); */ /* 262: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc340 */\ /* sqr(t[0], t[0]); */ /* 263: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb398680 */\ sqr_n_mul(t[0], t[0], 7, t[3]); /* 264: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb398695 */\ /* sqr(t[0], t[0]); */ /* 265: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a */\ /* sqr(t[0], t[0]); */ /* 266: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a54 */\ /* sqr(t[0], t[0]); */ /* 267: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a8 */\ /* sqr(t[0], t[0]); */ /* 268: 
d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb3986950 */\ /* sqr(t[0], t[0]); */ /* 269: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0 */\ /* sqr(t[0], t[0]); */ /* 270: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a540 */\ /* sqr(t[0], t[0]); */ /* 271: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a80 */\ /* sqr(t[0], t[0]); */ /* 272: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869500 */\ /* sqr(t[0], t[0]); */ /* 273: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a00 */\ sqr_n_mul(t[0], t[0], 9, t[8]); /* 274: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f */\ /* sqr(t[0], t[0]); */ /* 275: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541e */\ /* sqr(t[0], t[0]); */ /* 276: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83c */\ /* sqr(t[0], t[0]); */ /* 277: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb398695078 */\ /* sqr(t[0], t[0]); */ /* 278: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f0 */\ /* sqr(t[0], t[0]); */ /* 279: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541e0 */\ sqr_n_mul(t[0], t[0], 5, t[4]); /* 280: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed */\ /* sqr(t[0], t[0]); */ /* 281: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83da */\ /* sqr(t[0], t[0]); */ /* 282: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b4 */\ /* sqr(t[0], t[0]); */ /* 283: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f68 */\ sqr_n_mul(t[0], t[0], 3, t[9]); /* 284: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b */\ /* sqr(t[0], t[0]); */ /* 285: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed6 */\ /* sqr(t[0], t[0]); */ /* 286: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac */\ /* sqr(t[0], t[0]); */ /* 287: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b58 */\ /* sqr(t[0], t[0]); */ /* 288: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0 */\ /* sqr(t[0], t[0]); */ /* 289: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed60 */\ /* sqr(t[0], t[0]); */ /* 290: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac0 */\ /* sqr(t[0], t[0]); */ /* 291: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b580 */\ /* sqr(t[0], t[0]); */ /* 292: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b00 */\ sqr_n_mul(t[0], t[0], 8, t[8]); /* 293: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f */\ /* sqr(t[0], t[0]); */ /* 294: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61e */\ /* sqr(t[0], t[0]); */ /* 295: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3c */\ /* sqr(t[0], t[0]); */ /* 296: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b5878 */\ sqr_n_mul(t[0], t[0], 3, t[9]); /* 297: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b */\ /* sqr(t[0], t[0]); */ /* 298: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6 */\ /* sqr(t[0], t[0]); */ /* 299: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec */\ /* sqr(t[0], t[0]); */ /* 300: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8 */\ /* sqr(t[0], t[0]); */ /* 301: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b0 */\ /* sqr(t[0], t[0]); */ /* 302: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f60 */\ /* sqr(t[0], t[0]); */ /* 303: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec0 */\ /* sqr(t[0], t[0]); */ /* 304: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d80 */\ sqr_n_mul(t[0], 
t[0], 7, t[10]); /* 305: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d89 */\ /* sqr(t[0], t[0]); */ /* 306: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b12 */\ /* sqr(t[0], t[0]); */ /* 307: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f624 */\ /* sqr(t[0], t[0]); */ /* 308: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec48 */\ /* sqr(t[0], t[0]); */ /* 309: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d890 */\ /* sqr(t[0], t[0]); */ /* 310: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120 */\ /* sqr(t[0], t[0]); */ /* 311: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6240 */\ /* sqr(t[0], t[0]); */ /* 312: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec480 */\ /* sqr(t[0], t[0]); */ /* 313: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8900 */\ /* sqr(t[0], t[0]); */ /* 314: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b1200 */\ sqr_n_mul(t[0], t[0], 9, t[8]); /* 315: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f */\ /* sqr(t[0], t[0]); */ /* 316: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241e */\ /* sqr(t[0], t[0]); */ /* 317: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483c */\ /* sqr(t[0], t[0]); */ /* 318: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d89078 */\ /* sqr(t[0], t[0]); */ /* 319: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f0 */\ /* sqr(t[0], t[0]); */ /* 320: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241e0 */\ /* sqr(t[0], t[0]); */ /* 321: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483c0 */\ sqr_n_mul(t[0], t[0], 6, t[3]); /* 322: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d5 */\ /* sqr(t[0], t[0]); */ /* 323: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aa */\ /* sqr(t[0], t[0]); */ /* 324: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f54 */\ /* sqr(t[0], t[0]); */ /* 325: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241ea8 */\ /* sqr(t[0], t[0]); */ /* 326: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d50 */\ /* sqr(t[0], t[0]); */ /* 327: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aa0 */\ /* sqr(t[0], t[0]); */ /* 328: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f540 */\ sqr_n_mul(t[0], t[0], 6, t[5]); /* 329: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55f */\ /* sqr(t[0], t[0]); */ /* 330: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabe */\ /* sqr(t[0], t[0]); */ /* 331: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57c */\ /* sqr(t[0], t[0]); */ /* 332: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaf8 */\ /* sqr(t[0], t[0]); */ /* 333: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55f0 */\ /* sqr(t[0], t[0]); */ /* 334: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabe0 */\ sqr_n_mul(t[0], t[0], 5, t[5]); /* 335: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabff */\ /* sqr(t[0], t[0]); */ /* 336: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fe */\ /* sqr(t[0], t[0]); */ /* 337: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffc */\ /* sqr(t[0], t[0]); */ /* 338: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ff8 */\ /* sqr(t[0], t[0]); */ /* 
339: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabff0 */\ /* sqr(t[0], t[0]); */ /* 340: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fe0 */\ sqr_n_mul(t[0], t[0], 5, t[5]); /* 341: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fff */\ /* sqr(t[0], t[0]); */ /* 342: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aafffe */\ /* sqr(t[0], t[0]); */ /* 343: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55fffc */\ /* sqr(t[0], t[0]); */ /* 344: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfff8 */\ /* sqr(t[0], t[0]); */ /* 345: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fff0 */\ sqr_n_mul(t[0], t[0], 4, t[4]); /* 346: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd */\ /* sqr(t[0], t[0]); */ /* 347: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffa */\ /* sqr(t[0], t[0]); */ /* 348: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff4 */\ /* sqr(t[0], t[0]); */ /* 349: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffe8 */\ sqr_n_mul(t[0], t[0], 3, t[9]); /* 350: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb */\ /* sqr(t[0], t[0]); */ /* 351: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd6 */\ /* sqr(t[0], t[0]); */ /* 352: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac */\ /* sqr(t[0], t[0]); */ /* 353: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58 */\ /* sqr(t[0], t[0]); */ /* 354: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb0 */\ /* sqr(t[0], t[0]); */ /* 355: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd60 */\ /* sqr(t[0], t[0]); */ /* 356: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac0 */\ /* sqr(t[0], t[0]); */ /* 357: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff580 */\ /* sqr(t[0], t[0]); */ /* 358: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb00 */\ sqr_n_mul(t[0], t[0], 8, t[3]); /* 359: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb15 */\ /* sqr(t[0], t[0]); */ /* 360: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a */\ /* sqr(t[0], t[0]); */ /* 361: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54 */\ /* sqr(t[0], t[0]); */ /* 362: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a8 */\ /* sqr(t[0], t[0]); */ /* 363: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb150 */\ /* sqr(t[0], t[0]); */ /* 364: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a0 */\ /* sqr(t[0], t[0]); */ /* 365: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac540 */\ /* sqr(t[0], t[0]); */ /* 366: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a80 */\ sqr_n_mul(t[0], t[0], 7, t[5]); /* 367: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9f */\ /* sqr(t[0], t[0]); */ /* 368: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153e */\ /* sqr(t[0], t[0]); */ /* 369: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7c */\ /* sqr(t[0], t[0]); */ /* 370: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54f8 */\ /* sqr(t[0], t[0]); 
*/ /* 371: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9f0 */\ /* sqr(t[0], t[0]); */ /* 372: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153e0 */\ sqr_n_mul(t[0], t[0], 5, t[5]); /* 373: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ff */\ /* sqr(t[0], t[0]); */ /* 374: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7fe */\ /* sqr(t[0], t[0]); */ /* 375: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffc */\ /* sqr(t[0], t[0]); */ /* 376: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ff8 */\ /* sqr(t[0], t[0]); */ /* 377: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ff0 */\ /* sqr(t[0], t[0]); */ /* 378: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7fe0 */\ sqr_n_mul(t[0], t[0], 5, t[5]); /* 379: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7fff */\ /* sqr(t[0], t[0]); */ /* 380: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54fffe */\ /* sqr(t[0], t[0]); */ /* 381: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9fffc */\ /* sqr(t[0], t[0]); */ /* 382: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153fff8 */\ /* sqr(t[0], t[0]); */ /* 383: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7fff0 */\ sqr_n_mul(t[0], t[0], 4, t[8]); /* 384: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff */\ /* sqr(t[0], t[0]); */ /* 385: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffe */\ /* sqr(t[0], t[0]); */ /* 386: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffc */\ /* sqr(t[0], t[0]); */ /* 387: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffff8 */\ /* sqr(t[0], t[0]); */ /* 388: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff0 */\ sqr_n_mul(t[0], t[0], 4, t[7]); /* 389: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff7 */\ /* sqr(t[0], t[0]); */ /* 390: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee */\ /* sqr(t[0], t[0]); */ /* 391: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdc */\ /* sqr(t[0], t[0]); */ /* 392: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb8 */\ /* sqr(t[0], t[0]); */ /* 393: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff70 */\ /* sqr(t[0], t[0]); */ /* 394: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee0 */\ /* sqr(t[0], t[0]); */ /* 395: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdc0 */\ /* sqr(t[0], t[0]); */ /* 396: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb80 */\ sqr_n_mul(t[0], t[0], 7, t[5]); /* 397: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9f */\ /* sqr(t[0], t[0]); */ /* 398: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73e */\ /* sqr(t[0], t[0]); */ /* 399: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7c */\ /* sqr(t[0], t[0]); */ /* 400: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcf8 */\ /* sqr(t[0], t[0]); */ /* 401: 
1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9f0 */\ /* sqr(t[0], t[0]); */ /* 402: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73e0 */\ sqr_n_mul(t[0], t[0], 5, t[6]); /* 403: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fd */\ /* sqr(t[0], t[0]); */ /* 404: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fa */\ /* sqr(t[0], t[0]); */ /* 405: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff4 */\ /* sqr(t[0], t[0]); */ /* 406: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9fe8 */\ /* sqr(t[0], t[0]); */ /* 407: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fd0 */\ /* sqr(t[0], t[0]); */ /* 408: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fa0 */\ sqr_n_mul(t[0], t[0], 5, t[5]); /* 409: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fbf */\ /* sqr(t[0], t[0]); */ /* 410: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7e */\ /* sqr(t[0], t[0]); */ /* 411: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9fefc */\ /* sqr(t[0], t[0]); */ /* 412: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdf8 */\ /* sqr(t[0], t[0]); */ /* 413: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fbf0 */\ /* sqr(t[0], t[0]); */ /* 414: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7e0 */\ sqr_n_mul(t[0], t[0], 5, t[5]); /* 415: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7ff */\ /* sqr(t[0], t[0]); */ /* 416: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffe */\ /* sqr(t[0], t[0]); */ /* 417: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdffc */\ /* sqr(t[0], t[0]); */ /* 418: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fbff8 */\ /* sqr(t[0], t[0]); */ /* 419: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7ff0 */\ /* sqr(t[0], t[0]); */ /* 420: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffe0 */\ sqr_n_mul(t[0], t[0], 5, t[5]); /* 421: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffff */\ /* sqr(t[0], t[0]); */ /* 422: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdfffe */\ /* sqr(t[0], t[0]); */ /* 423: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fbfffc */\ /* sqr(t[0], t[0]); */ /* 424: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7fff8 */\ /* sqr(t[0], t[0]); */ /* 425: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffff0 */\ /* sqr(t[0], t[0]); */ /* 426: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdfffe0 */\ sqr_n_mul(t[0], t[0], 5, t[5]); /* 427: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdfffff */\ /* sqr(t[0], t[0]); */ /* 428: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fbffffe */\ /* sqr(t[0], t[0]); */ /* 429: 
d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7ffffc */\ /* sqr(t[0], t[0]); */ /* 430: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9fefffff8 */\ /* sqr(t[0], t[0]); */ /* 431: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdfffff0 */\ /* sqr(t[0], t[0]); */ /* 432: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fbffffe0 */\ sqr_n_mul(t[0], t[0], 5, t[5]); /* 433: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fbffffff */\ /* sqr(t[0], t[0]); */ /* 434: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7fffffe */\ /* sqr(t[0], t[0]); */ /* 435: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffffffc */\ /* sqr(t[0], t[0]); */ /* 436: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdffffff8 */\ /* sqr(t[0], t[0]); */ /* 437: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fbffffff0 */\ /* sqr(t[0], t[0]); */ /* 438: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7fffffe0 */\ sqr_n_mul(t[0], t[0], 5, t[5]); /* 439: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7fffffff */\ /* sqr(t[0], t[0]); */ /* 440: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9fefffffffe */\ /* sqr(t[0], t[0]); */ /* 441: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdfffffffc */\ /* sqr(t[0], t[0]); */ /* 442: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fbfffffff8 */\ /* sqr(t[0], t[0]); */ /* 443: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7fffffff0 */\ sqr_n_mul(t[0], t[0], 4, t[4]); /* 444: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7fffffffd */\ /* sqr(t[0], t[0]); */ /* 445: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffffffffa */\ /* sqr(t[0], t[0]); */ /* 446: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdffffffff4 */\ /* sqr(t[0], t[0]); */ /* 447: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fbfffffffe8 */\ /* sqr(t[0], t[0]); */ /* 448: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7fffffffd0 */\ /* sqr(t[0], t[0]); */ /* 449: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffffffffa0 */\ /* sqr(t[0], t[0]); */ /* 450: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdffffffff40 */\ sqr_n_mul(t[0], t[0], 6, t[3]); /* 451: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdffffffff55 */\ /* sqr(t[0], t[0]); */ /* 452: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fbfffffffeaa */\ /* sqr(t[0], t[0]); */ /* 453: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7fffffffd54 */\ /* sqr(t[0], t[0]); */ /* 454: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffffffffaa8 */\ /* sqr(t[0], t[0]); */ /* 455: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdffffffff550 */\ sqr_n_mul(t[0], t[0], 4, t[2]); /* 456: 
340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdffffffff555 */\
	/* sqr(t[0], t[0]); */	/* 457: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fbfffffffeaaa */\
	/* sqr(t[0], t[0]); */	/* 458: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7fffffffd554 */\
	/* sqr(t[0], t[0]); */	/* 459: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffffffffaaa8 */\
	sqr_n_mul(out, t[0], 3, t[1]);	/* 460: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffffffffaaa9 */\
} while(0)

================================================
FILE: src/recip.c
================================================
/*
 * Copyright Supranational LLC
 * Licensed under the Apache License, Version 2.0, see LICENSE for details.
 * SPDX-License-Identifier: Apache-2.0
 */

#include "fields.h"

#ifdef __OPTIMIZE_SIZE__
/*
 * 608 multiplications for scalar inversion modulo BLS12-381 prime, 32%
 * more than corresponding optimal addition-chain, plus mispredicted
 * branch penalties on top of that... The addition chain below was
 * measured to be >50% faster.
 */
static void flt_reciprocal_fp(vec384 out, const vec384 inp)
{
    static const byte BLS12_381_P_minus_2[] = {
        TO_BYTES(0xb9feffffffffaaa9), TO_BYTES(0x1eabfffeb153ffff),
        TO_BYTES(0x6730d2a0f6b0f624), TO_BYTES(0x64774b84f38512bf),
        TO_BYTES(0x4b1ba7b6434bacd7), TO_BYTES(0x1a0111ea397fe69a)
    };

    exp_mont_384(out, inp, BLS12_381_P_minus_2, 381, BLS12_381_P, p0);
}
#else
# define sqr(ret,a)             sqr_fp(ret,a)
# define mul(ret,a,b)           mul_fp(ret,a,b)
# define sqr_n_mul(ret,a,n,b)   sqr_n_mul_fp(ret,a,n,b)

# include "recip-addchain.h"
static void flt_reciprocal_fp(vec384 out, const vec384 inp)
{
    RECIPROCAL_MOD_BLS12_381_P(out, inp, vec384);
}
# undef RECIPROCAL_MOD_BLS12_381_P
# undef sqr_n_mul
# undef mul
# undef sqr
#endif

static void flt_reciprocal_fp2(vec384x out, const vec384x inp)
{
    vec384 t0, t1;

    /*
     * |out| = 1/(a + b*i) = a/(a^2+b^2) - b/(a^2+b^2)*i
     */
    sqr_fp(t0, inp[0]);
    sqr_fp(t1, inp[1]);
    add_fp(t0, t0, t1);
    flt_reciprocal_fp(t1, t0);
    mul_fp(out[0], inp[0], t1);
    mul_fp(out[1], inp[1], t1);
    neg_fp(out[1], out[1]);
}

static void reciprocal_fp(vec384 out, const vec384 inp)
{
    static const vec384 Px8 = {     /* left-aligned value of the modulus */
        TO_LIMB_T(0xcff7fffffffd5558), TO_LIMB_T(0xf55ffff58a9ffffd),
        TO_LIMB_T(0x39869507b587b120), TO_LIMB_T(0x23ba5c279c2895fb),
        TO_LIMB_T(0x58dd3db21a5d66bb), TO_LIMB_T(0xd0088f51cbff34d2)
    };
    union { vec768 x; vec384 r[2]; } temp;

    ct_inverse_mod_384(temp.x, inp, BLS12_381_P, Px8);
    redc_mont_384(temp.r[0], temp.x, BLS12_381_P, p0);
    mul_mont_384(temp.r[0], temp.r[0], BLS12_381_RR, BLS12_381_P, p0);

#ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
    /* sign goes straight to flt_reciprocal */
    mul_mont_384(temp.r[1], temp.r[0], inp, BLS12_381_P, p0);
    if (vec_is_equal(temp.r[1], BLS12_381_Rx.p, sizeof(vec384)) |
        vec_is_zero(temp.r[1], sizeof(vec384)))
        vec_copy(out, temp.r[0], sizeof(vec384));
    else
        flt_reciprocal_fp(out, inp);
#else
    vec_copy(out, temp.r[0], sizeof(vec384));
#endif
}

void blst_fp_inverse(vec384 out, const vec384 inp)
{   reciprocal_fp(out, inp);   }

void blst_fp_eucl_inverse(vec384 ret, const vec384 a)
{   reciprocal_fp(ret, a);   }

static void reciprocal_fp2(vec384x out, const vec384x inp)
{
    vec384 t0, t1;

    /*
     * |out| = 1/(a + b*i) = a/(a^2+b^2) - b/(a^2+b^2)*i
     */
    sqr_fp(t0, inp[0]);
    sqr_fp(t1, inp[1]);
    add_fp(t0, t0, t1);
    reciprocal_fp(t1, t0);
    mul_fp(out[0], inp[0], t1);
    mul_fp(out[1], inp[1], t1);
    neg_fp(out[1], out[1]);
}

void blst_fp2_inverse(vec384x out, const vec384x inp)
{   reciprocal_fp2(out, inp);   }

void blst_fp2_eucl_inverse(vec384x out, const vec384x inp)
{   reciprocal_fp2(out, inp);   }

static void reciprocal_fr(vec256 out, const vec256 inp)
{
    static const vec256 rx2 = {     /* left-aligned value of the modulus */
        TO_LIMB_T(0xfffffffe00000002), TO_LIMB_T(0xa77b4805fffcb7fd),
        TO_LIMB_T(0x6673b0101343b00a), TO_LIMB_T(0xe7db4ea6533afa90),
    };
    vec512 temp;

    ct_inverse_mod_256(temp, inp, BLS12_381_r, rx2);
    redc_mont_256(out, temp, BLS12_381_r, r0);
    mul_mont_sparse_256(out, out, BLS12_381_rRR, BLS12_381_r, r0);
}

void blst_fr_inverse(vec256 out, const vec256 inp)
{   reciprocal_fr(out, inp);   }

void blst_fr_eucl_inverse(vec256 out, const vec256 inp)
{   reciprocal_fr(out, inp);   }
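A note on the two paths in recip.c: the __OPTIMIZE_SIZE__ branch computes
1/x as x^(P-2) per Fermat's little theorem via generic square-and-multiply,
while the default branch hard-wires the same exponentiation as the addition
chain in recip-addchain.h, where each sqr_n_mul(t[0], t[0], n, t[k]) step
performs n squarings followed by one multiplication, as the commented-out
sqr() lines spell out. The fp2 variants reduce inversion in Fp2 to a single
Fp inversion by multiplying through by the conjugate, since
(a + b*i)*(a - b*i) = a^2 + b^2. The sketch below illustrates the Fermat
inversion principle on a small 64-bit prime; it is not part of blst, is not
constant-time, and assumes a compiler with unsigned __int128 (gcc/clang):

#include <stdint.h>
#include <stdio.h>

/* modular multiplication via a 128-bit intermediate */
static uint64_t mul_mod(uint64_t a, uint64_t b, uint64_t p)
{   return (uint64_t)((unsigned __int128)a * b % p);   }

/* right-to-left square-and-multiply exponentiation */
static uint64_t pow_mod(uint64_t a, uint64_t e, uint64_t p)
{
    uint64_t ret = 1;

    for (a %= p; e != 0; e >>= 1, a = mul_mod(a, a, p))
        if (e & 1)
            ret = mul_mod(ret, a, p);

    return ret;
}

int main(void)
{
    const uint64_t p = 0xffffffff00000001ULL;   /* 2^64 - 2^32 + 1, prime */
    uint64_t x = 12345;
    uint64_t inv = pow_mod(x, p - 2, p);        /* Fermat: x^(p-2) == 1/x */

    printf("%d\n", mul_mod(x, inv, p) == 1);    /* prints 1 */
    return 0;
}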
================================================
FILE: src/server.c
================================================
/*
 * Copyright Supranational LLC
 * Licensed under the Apache License, Version 2.0, see LICENSE for details.
 * SPDX-License-Identifier: Apache-2.0
 */

#include "keygen.c"
#include "hash_to_field.c"
#include "e1.c"
#include "map_to_g1.c"
#include "e2.c"
#include "map_to_g2.c"
#include "fp12_tower.c"
#include "pairing.c"
#include "aggregate.c"
#include "exp.c"
#include "sqrt.c"
#include "recip.c"
#include "bulk_addition.c"
#include "multi_scalar.c"
#include "consts.c"
#include "vect.c"
#include "exports.c"

#ifndef __BLST_CGO__
# include "rb_tree.c"
#endif

#ifdef BLST_FR_PENTAROOT
# include "pentaroot.c"
#endif

#ifndef __BLST_NO_CPUID__
# include "cpuid.c"
#endif

================================================
FILE: src/sha256.h
================================================
/*
 * Copyright Supranational LLC
 * Licensed under the Apache License, Version 2.0, see LICENSE for details.
 * SPDX-License-Identifier: Apache-2.0
 */
#ifndef __BLS12_381_ASM_SHA256_H__
#define __BLS12_381_ASM_SHA256_H__

#include "vect.h"

#if (defined(__x86_64__) || defined(__x86_64) || defined(_M_X64)) && \
    defined(__SHA__) /* -msha */ && !defined(__BLST_PORTABLE__)
# define sha256_block_data_order blst_sha256_block_data_order_shaext
#elif defined(__aarch64__) && \
      defined(__ARM_FEATURE_CRYPTO) && !defined(__BLST_PORTABLE__)
# define sha256_block_data_order blst_sha256_block_armv8
#else
# define sha256_block_data_order blst_sha256_block_data_order
#endif
#define sha256_hcopy blst_sha256_hcopy
#define sha256_bcopy blst_sha256_bcopy
#define sha256_emit  blst_sha256_emit

void sha256_block_data_order(unsigned int *h, const void *inp, size_t blocks);
void sha256_hcopy(unsigned int dst[8], const unsigned int src[8]);
void sha256_bcopy(void *dst, const void *src, size_t len);

/*
 * If SHA256_CTX conflicts with something, just redefine it to an
 * alternative custom name prior to including this header.
 */
typedef struct {
    unsigned int h[8];
    unsigned long long N;
    unsigned char buf[64];
    size_t off;
} SHA256_CTX;

static void sha256_init_h(unsigned int h[8])
{
    h[0] = 0x6a09e667U;
    h[1] = 0xbb67ae85U;
    h[2] = 0x3c6ef372U;
    h[3] = 0xa54ff53aU;
    h[4] = 0x510e527fU;
    h[5] = 0x9b05688cU;
    h[6] = 0x1f83d9abU;
    h[7] = 0x5be0cd19U;
}

static void sha256_init(SHA256_CTX *ctx)
{
    sha256_init_h(ctx->h);
    ctx->N = 0;
    vec_zero(ctx->buf, sizeof(ctx->buf));
    ctx->off = 0;
}

static void sha256_update(SHA256_CTX *ctx, const void *_inp, size_t len)
{
    size_t n;
    const unsigned char *inp = _inp;

    ctx->N += len;

    if ((len != 0) & ((n = ctx->off) != 0)) {
        size_t rem = sizeof(ctx->buf) - n;

        if (rem > len) {
            sha256_bcopy(ctx->buf + n, inp, len);
            ctx->off += len;
            return;
        } else {
            sha256_bcopy(ctx->buf + n, inp, rem);
            inp += rem;
            len -= rem;
            sha256_block_data_order(ctx->h, ctx->buf, 1);
            vec_zero(ctx->buf, sizeof(ctx->buf));
            ctx->off = 0;
        }
    }

    n = len / sizeof(ctx->buf);
    if (n > 0) {
        sha256_block_data_order(ctx->h, inp, n);
        n *= sizeof(ctx->buf);
        inp += n;
        len -= n;
    }

    if (len)
        sha256_bcopy(ctx->buf, inp, ctx->off = len);
}

#define __TOBE32(ptr, val) ((ptr)[0] = (unsigned char)((val)>>24), \
                            (ptr)[1] = (unsigned char)((val)>>16), \
                            (ptr)[2] = (unsigned char)((val)>>8),  \
                            (ptr)[3] = (unsigned char)(val))

#if 1
void sha256_emit(unsigned char md[32], const unsigned int h[8]);
#else
static void sha256_emit(unsigned char md[32], const unsigned int h[8])
{
    unsigned int h_i;

    h_i = h[0]; __TOBE32(md + 0,  h_i);
    h_i = h[1]; __TOBE32(md + 4,  h_i);
    h_i = h[2]; __TOBE32(md + 8,  h_i);
    h_i = h[3]; __TOBE32(md + 12, h_i);
    h_i = h[4]; __TOBE32(md + 16, h_i);
    h_i = h[5]; __TOBE32(md + 20, h_i);
    h_i = h[6]; __TOBE32(md + 24, h_i);
    h_i = h[7]; __TOBE32(md + 28, h_i);
}
#endif

static void sha256_final(unsigned char md[32], SHA256_CTX *ctx)
{
    unsigned long long bits = ctx->N * 8;
    size_t n = ctx->off;
    unsigned char *tail;

    ctx->buf[n++] = 0x80;

    if (n > (sizeof(ctx->buf) - 8)) {
        sha256_block_data_order(ctx->h, ctx->buf, 1);
        vec_zero(ctx->buf, sizeof(ctx->buf));
    }

    tail = ctx->buf + sizeof(ctx->buf) - 8;
    __TOBE32(tail, (unsigned int)(bits >> 32));
    __TOBE32(tail + 4, (unsigned int)bits);
    sha256_block_data_order(ctx->h, ctx->buf, 1);
    sha256_emit(md, ctx->h);
}

#undef __TOBE32

#endif
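The header above implements the usual init/update/final streaming pattern:
sha256_update() buffers partial 64-byte blocks and passes runs of full
blocks to sha256_block_data_order(), and sha256_final() appends the 0x80
padding byte plus the 64-bit big-endian bit count before the final
compression. All of these helpers are static and internal, so a caller
would wrap them inside a translation unit that includes this header; a
minimal one-shot sketch (sha256_oneshot is a made-up name, not a blst
symbol):

#include <stddef.h>
#include "sha256.h"

/* hash |len| bytes at |inp| in one call over the streaming interface */
static void sha256_oneshot(unsigned char md[32], const void *inp, size_t len)
{
    SHA256_CTX ctx;

    sha256_init(&ctx);
    sha256_update(&ctx, inp, len);
    sha256_final(md, &ctx);
}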
================================================
FILE: src/sqrt-addchain.h
================================================
/*
 * Copyright Supranational LLC
 * Licensed under the Apache License, Version 2.0, see LICENSE for details.
 * SPDX-License-Identifier: Apache-2.0
 */
/*
 * The "magic" number is (BLS12_381_P-3)/4. Exponentiation to this power
 * yields the reciprocal of sqrt(x), which is used in the simplified
 * Shallue-van de Woestijne-Ulas map-to-curve method, but it's trivial
 * to adapt it for the more "traditional" sqrt(x) as 'x*ret' (or for
 * is_square(x) as 'x*ret^2==1').
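 *
 * To see why the two adaptations work: BLS12_381_P == 3 (mod 4), so for
 * a non-zero square x and ret = x^((P-3)/4),
 *
 *	x*ret   = x^((P+1)/4), whose square is x^((P-1)/2)*x = x,
 *	x*ret^2 = x^((P-1)/2), the Legendre symbol of x,
 *
 * hence x*ret is a square root of x, and x*ret^2 == 1 exactly when x is
 * a non-zero square.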
* * Generated with 'addchain 1000602388805416848354447456433976039139220704984751971333014534031007912622709466110671907282253916009473568139946' * https://github.com/kwantam/addchain * * # Bos-Coster (win=4) : 458 (16) <<< * # Bos-Coster (win=5) : 460 (28) * # Bos-Coster (win=6) : 461 (33) * # Bos-Coster (win=7) : 460 (28) * # Bos-Coster (win=3) : 462 ( 9) * # Bos-Coster (win=8) : 466 (34) * # Bos-Coster (win=9) : 464 (31) * # Yacobi : 478 (31) * # Bos-Coster (win=10) : 473 (30) * # Bos-Coster (win=2) : 486 ( 5) * # Bergeron-Berstel-Brlek-Duboc : 489 ( 5) */ #define RECIP_SQRT_MOD_BLS12_381_P(out, inp, ptype) do { \ ptype t[16]; \ vec_copy(t[13], inp, sizeof(ptype));/* 0: 1 */\ sqr(t[0], t[13]); /* 1: 2 */\ mul(t[8], t[0], t[13]); /* 2: 3 */\ sqr(t[4], t[0]); /* 3: 4 */\ mul(t[1], t[8], t[0]); /* 4: 5 */\ mul(t[6], t[4], t[8]); /* 5: 7 */\ mul(t[9], t[1], t[4]); /* 6: 9 */\ mul(t[12], t[6], t[4]); /* 7: b */\ mul(t[3], t[9], t[4]); /* 8: d */\ mul(t[7], t[12], t[4]); /* 9: f */\ mul(t[15], t[3], t[4]); /* 10: 11 */\ mul(t[10], t[7], t[4]); /* 11: 13 */\ mul(t[2], t[15], t[4]); /* 12: 15 */\ mul(t[11], t[10], t[4]); /* 13: 17 */\ sqr(t[0], t[3]); /* 14: 1a */\ mul(t[14], t[11], t[4]); /* 15: 1b */\ mul(t[5], t[0], t[8]); /* 16: 1d */\ mul(t[4], t[0], t[1]); /* 17: 1f */\ /* sqr(t[0], t[0]); */ /* 18: 34 */\ /* sqr(t[0], t[0]); */ /* 19: 68 */\ /* sqr(t[0], t[0]); */ /* 20: d0 */\ /* sqr(t[0], t[0]); */ /* 21: 1a0 */\ /* sqr(t[0], t[0]); */ /* 22: 340 */\ /* sqr(t[0], t[0]); */ /* 23: 680 */\ /* sqr(t[0], t[0]); */ /* 24: d00 */\ /* sqr(t[0], t[0]); */ /* 25: 1a00 */\ /* sqr(t[0], t[0]); */ /* 26: 3400 */\ /* sqr(t[0], t[0]); */ /* 27: 6800 */\ /* sqr(t[0], t[0]); */ /* 28: d000 */\ /* sqr(t[0], t[0]); */ /* 29: 1a000 */\ sqr_n_mul(t[0], t[0], 12, t[15]); /* 30: 1a011 */\ /* sqr(t[0], t[0]); */ /* 31: 34022 */\ /* sqr(t[0], t[0]); */ /* 32: 68044 */\ /* sqr(t[0], t[0]); */ /* 33: d0088 */\ /* sqr(t[0], t[0]); */ /* 34: 1a0110 */\ /* sqr(t[0], t[0]); */ /* 35: 340220 */\ /* sqr(t[0], t[0]); */ /* 36: 680440 */\ /* sqr(t[0], t[0]); */ /* 37: d00880 */\ sqr_n_mul(t[0], t[0], 7, t[7]); /* 38: d0088f */\ /* sqr(t[0], t[0]); */ /* 39: 1a0111e */\ /* sqr(t[0], t[0]); */ /* 40: 340223c */\ /* sqr(t[0], t[0]); */ /* 41: 6804478 */\ /* sqr(t[0], t[0]); */ /* 42: d0088f0 */\ sqr_n_mul(t[0], t[0], 4, t[1]); /* 43: d0088f5 */\ /* sqr(t[0], t[0]); */ /* 44: 1a0111ea */\ /* sqr(t[0], t[0]); */ /* 45: 340223d4 */\ /* sqr(t[0], t[0]); */ /* 46: 680447a8 */\ /* sqr(t[0], t[0]); */ /* 47: d0088f50 */\ /* sqr(t[0], t[0]); */ /* 48: 1a0111ea0 */\ /* sqr(t[0], t[0]); */ /* 49: 340223d40 */\ sqr_n_mul(t[0], t[0], 6, t[6]); /* 50: 340223d47 */\ /* sqr(t[0], t[0]); */ /* 51: 680447a8e */\ /* sqr(t[0], t[0]); */ /* 52: d0088f51c */\ /* sqr(t[0], t[0]); */ /* 53: 1a0111ea38 */\ /* sqr(t[0], t[0]); */ /* 54: 340223d470 */\ /* sqr(t[0], t[0]); */ /* 55: 680447a8e0 */\ /* sqr(t[0], t[0]); */ /* 56: d0088f51c0 */\ /* sqr(t[0], t[0]); */ /* 57: 1a0111ea380 */\ sqr_n_mul(t[0], t[0], 7, t[11]); /* 58: 1a0111ea397 */\ /* sqr(t[0], t[0]); */ /* 59: 340223d472e */\ /* sqr(t[0], t[0]); */ /* 60: 680447a8e5c */\ /* sqr(t[0], t[0]); */ /* 61: d0088f51cb8 */\ /* sqr(t[0], t[0]); */ /* 62: 1a0111ea3970 */\ /* sqr(t[0], t[0]); */ /* 63: 340223d472e0 */\ sqr_n_mul(t[0], t[0], 5, t[4]); /* 64: 340223d472ff */\ /* sqr(t[0], t[0]); */ /* 65: 680447a8e5fe */\ /* sqr(t[0], t[0]); */ /* 66: d0088f51cbfc */\ sqr_n_mul(t[0], t[0], 2, t[8]); /* 67: d0088f51cbff */\ /* sqr(t[0], t[0]); */ /* 68: 1a0111ea397fe */\ /* sqr(t[0], t[0]); */ /* 69: 
340223d472ffc */\ /* sqr(t[0], t[0]); */ /* 70: 680447a8e5ff8 */\ /* sqr(t[0], t[0]); */ /* 71: d0088f51cbff0 */\ /* sqr(t[0], t[0]); */ /* 72: 1a0111ea397fe0 */\ /* sqr(t[0], t[0]); */ /* 73: 340223d472ffc0 */\ sqr_n_mul(t[0], t[0], 6, t[3]); /* 74: 340223d472ffcd */\ /* sqr(t[0], t[0]); */ /* 75: 680447a8e5ff9a */\ /* sqr(t[0], t[0]); */ /* 76: d0088f51cbff34 */\ /* sqr(t[0], t[0]); */ /* 77: 1a0111ea397fe68 */\ /* sqr(t[0], t[0]); */ /* 78: 340223d472ffcd0 */\ /* sqr(t[0], t[0]); */ /* 79: 680447a8e5ff9a0 */\ /* sqr(t[0], t[0]); */ /* 80: d0088f51cbff340 */\ sqr_n_mul(t[0], t[0], 6, t[3]); /* 81: d0088f51cbff34d */\ /* sqr(t[0], t[0]); */ /* 82: 1a0111ea397fe69a */\ /* sqr(t[0], t[0]); */ /* 83: 340223d472ffcd34 */\ /* sqr(t[0], t[0]); */ /* 84: 680447a8e5ff9a68 */\ /* sqr(t[0], t[0]); */ /* 85: d0088f51cbff34d0 */\ /* sqr(t[0], t[0]); */ /* 86: 1a0111ea397fe69a0 */\ /* sqr(t[0], t[0]); */ /* 87: 340223d472ffcd340 */\ sqr_n_mul(t[0], t[0], 6, t[9]); /* 88: 340223d472ffcd349 */\ /* sqr(t[0], t[0]); */ /* 89: 680447a8e5ff9a692 */\ /* sqr(t[0], t[0]); */ /* 90: d0088f51cbff34d24 */\ /* sqr(t[0], t[0]); */ /* 91: 1a0111ea397fe69a48 */\ sqr_n_mul(t[0], t[0], 3, t[8]); /* 92: 1a0111ea397fe69a4b */\ /* sqr(t[0], t[0]); */ /* 93: 340223d472ffcd3496 */\ /* sqr(t[0], t[0]); */ /* 94: 680447a8e5ff9a692c */\ /* sqr(t[0], t[0]); */ /* 95: d0088f51cbff34d258 */\ /* sqr(t[0], t[0]); */ /* 96: 1a0111ea397fe69a4b0 */\ /* sqr(t[0], t[0]); */ /* 97: 340223d472ffcd34960 */\ /* sqr(t[0], t[0]); */ /* 98: 680447a8e5ff9a692c0 */\ /* sqr(t[0], t[0]); */ /* 99: d0088f51cbff34d2580 */\ sqr_n_mul(t[0], t[0], 7, t[3]); /* 100: d0088f51cbff34d258d */\ /* sqr(t[0], t[0]); */ /* 101: 1a0111ea397fe69a4b1a */\ /* sqr(t[0], t[0]); */ /* 102: 340223d472ffcd349634 */\ /* sqr(t[0], t[0]); */ /* 103: 680447a8e5ff9a692c68 */\ /* sqr(t[0], t[0]); */ /* 104: d0088f51cbff34d258d0 */\ sqr_n_mul(t[0], t[0], 4, t[3]); /* 105: d0088f51cbff34d258dd */\ /* sqr(t[0], t[0]); */ /* 106: 1a0111ea397fe69a4b1ba */\ /* sqr(t[0], t[0]); */ /* 107: 340223d472ffcd3496374 */\ /* sqr(t[0], t[0]); */ /* 108: 680447a8e5ff9a692c6e8 */\ /* sqr(t[0], t[0]); */ /* 109: d0088f51cbff34d258dd0 */\ /* sqr(t[0], t[0]); */ /* 110: 1a0111ea397fe69a4b1ba0 */\ /* sqr(t[0], t[0]); */ /* 111: 340223d472ffcd34963740 */\ sqr_n_mul(t[0], t[0], 6, t[7]); /* 112: 340223d472ffcd3496374f */\ /* sqr(t[0], t[0]); */ /* 113: 680447a8e5ff9a692c6e9e */\ /* sqr(t[0], t[0]); */ /* 114: d0088f51cbff34d258dd3c */\ /* sqr(t[0], t[0]); */ /* 115: 1a0111ea397fe69a4b1ba78 */\ /* sqr(t[0], t[0]); */ /* 116: 340223d472ffcd3496374f0 */\ /* sqr(t[0], t[0]); */ /* 117: 680447a8e5ff9a692c6e9e0 */\ /* sqr(t[0], t[0]); */ /* 118: d0088f51cbff34d258dd3c0 */\ sqr_n_mul(t[0], t[0], 6, t[14]); /* 119: d0088f51cbff34d258dd3db */\ /* sqr(t[0], t[0]); */ /* 120: 1a0111ea397fe69a4b1ba7b6 */\ /* sqr(t[0], t[0]); */ /* 121: 340223d472ffcd3496374f6c */\ /* sqr(t[0], t[0]); */ /* 122: 680447a8e5ff9a692c6e9ed8 */\ sqr_n_mul(t[0], t[0], 3, t[13]); /* 123: 680447a8e5ff9a692c6e9ed9 */\ /* sqr(t[0], t[0]); */ /* 124: d0088f51cbff34d258dd3db2 */\ /* sqr(t[0], t[0]); */ /* 125: 1a0111ea397fe69a4b1ba7b64 */\ /* sqr(t[0], t[0]); */ /* 126: 340223d472ffcd3496374f6c8 */\ /* sqr(t[0], t[0]); */ /* 127: 680447a8e5ff9a692c6e9ed90 */\ /* sqr(t[0], t[0]); */ /* 128: d0088f51cbff34d258dd3db20 */\ /* sqr(t[0], t[0]); */ /* 129: 1a0111ea397fe69a4b1ba7b640 */\ /* sqr(t[0], t[0]); */ /* 130: 340223d472ffcd3496374f6c80 */\ /* sqr(t[0], t[0]); */ /* 131: 680447a8e5ff9a692c6e9ed900 */\ sqr_n_mul(t[0], t[0], 8, t[3]); /* 132: 
680447a8e5ff9a692c6e9ed90d */\ /* sqr(t[0], t[0]); */ /* 133: d0088f51cbff34d258dd3db21a */\ /* sqr(t[0], t[0]); */ /* 134: 1a0111ea397fe69a4b1ba7b6434 */\ /* sqr(t[0], t[0]); */ /* 135: 340223d472ffcd3496374f6c868 */\ /* sqr(t[0], t[0]); */ /* 136: 680447a8e5ff9a692c6e9ed90d0 */\ /* sqr(t[0], t[0]); */ /* 137: d0088f51cbff34d258dd3db21a0 */\ /* sqr(t[0], t[0]); */ /* 138: 1a0111ea397fe69a4b1ba7b64340 */\ /* sqr(t[0], t[0]); */ /* 139: 340223d472ffcd3496374f6c8680 */\ sqr_n_mul(t[0], t[0], 7, t[11]); /* 140: 340223d472ffcd3496374f6c8697 */\ /* sqr(t[0], t[0]); */ /* 141: 680447a8e5ff9a692c6e9ed90d2e */\ /* sqr(t[0], t[0]); */ /* 142: d0088f51cbff34d258dd3db21a5c */\ /* sqr(t[0], t[0]); */ /* 143: 1a0111ea397fe69a4b1ba7b6434b8 */\ /* sqr(t[0], t[0]); */ /* 144: 340223d472ffcd3496374f6c86970 */\ /* sqr(t[0], t[0]); */ /* 145: 680447a8e5ff9a692c6e9ed90d2e0 */\ sqr_n_mul(t[0], t[0], 5, t[12]); /* 146: 680447a8e5ff9a692c6e9ed90d2eb */\ /* sqr(t[0], t[0]); */ /* 147: d0088f51cbff34d258dd3db21a5d6 */\ /* sqr(t[0], t[0]); */ /* 148: 1a0111ea397fe69a4b1ba7b6434bac */\ /* sqr(t[0], t[0]); */ /* 149: 340223d472ffcd3496374f6c869758 */\ /* sqr(t[0], t[0]); */ /* 150: 680447a8e5ff9a692c6e9ed90d2eb0 */\ /* sqr(t[0], t[0]); */ /* 151: d0088f51cbff34d258dd3db21a5d60 */\ /* sqr(t[0], t[0]); */ /* 152: 1a0111ea397fe69a4b1ba7b6434bac0 */\ sqr_n_mul(t[0], t[0], 6, t[3]); /* 153: 1a0111ea397fe69a4b1ba7b6434bacd */\ /* sqr(t[0], t[0]); */ /* 154: 340223d472ffcd3496374f6c869759a */\ /* sqr(t[0], t[0]); */ /* 155: 680447a8e5ff9a692c6e9ed90d2eb34 */\ /* sqr(t[0], t[0]); */ /* 156: d0088f51cbff34d258dd3db21a5d668 */\ /* sqr(t[0], t[0]); */ /* 157: 1a0111ea397fe69a4b1ba7b6434bacd0 */\ /* sqr(t[0], t[0]); */ /* 158: 340223d472ffcd3496374f6c869759a0 */\ /* sqr(t[0], t[0]); */ /* 159: 680447a8e5ff9a692c6e9ed90d2eb340 */\ sqr_n_mul(t[0], t[0], 6, t[5]); /* 160: 680447a8e5ff9a692c6e9ed90d2eb35d */\ /* sqr(t[0], t[0]); */ /* 161: d0088f51cbff34d258dd3db21a5d66ba */\ /* sqr(t[0], t[0]); */ /* 162: 1a0111ea397fe69a4b1ba7b6434bacd74 */\ /* sqr(t[0], t[0]); */ /* 163: 340223d472ffcd3496374f6c869759ae8 */\ /* sqr(t[0], t[0]); */ /* 164: 680447a8e5ff9a692c6e9ed90d2eb35d0 */\ sqr_n_mul(t[0], t[0], 4, t[9]); /* 165: 680447a8e5ff9a692c6e9ed90d2eb35d9 */\ /* sqr(t[0], t[0]); */ /* 166: d0088f51cbff34d258dd3db21a5d66bb2 */\ /* sqr(t[0], t[0]); */ /* 167: 1a0111ea397fe69a4b1ba7b6434bacd764 */\ /* sqr(t[0], t[0]); */ /* 168: 340223d472ffcd3496374f6c869759aec8 */\ /* sqr(t[0], t[0]); */ /* 169: 680447a8e5ff9a692c6e9ed90d2eb35d90 */\ /* sqr(t[0], t[0]); */ /* 170: d0088f51cbff34d258dd3db21a5d66bb20 */\ /* sqr(t[0], t[0]); */ /* 171: 1a0111ea397fe69a4b1ba7b6434bacd7640 */\ /* sqr(t[0], t[0]); */ /* 172: 340223d472ffcd3496374f6c869759aec80 */\ /* sqr(t[0], t[0]); */ /* 173: 680447a8e5ff9a692c6e9ed90d2eb35d900 */\ sqr_n_mul(t[0], t[0], 8, t[5]); /* 174: 680447a8e5ff9a692c6e9ed90d2eb35d91d */\ /* sqr(t[0], t[0]); */ /* 175: d0088f51cbff34d258dd3db21a5d66bb23a */\ /* sqr(t[0], t[0]); */ /* 176: 1a0111ea397fe69a4b1ba7b6434bacd76474 */\ /* sqr(t[0], t[0]); */ /* 177: 340223d472ffcd3496374f6c869759aec8e8 */\ /* sqr(t[0], t[0]); */ /* 178: 680447a8e5ff9a692c6e9ed90d2eb35d91d0 */\ sqr_n_mul(t[0], t[0], 4, t[3]); /* 179: 680447a8e5ff9a692c6e9ed90d2eb35d91dd */\ /* sqr(t[0], t[0]); */ /* 180: d0088f51cbff34d258dd3db21a5d66bb23ba */\ /* sqr(t[0], t[0]); */ /* 181: 1a0111ea397fe69a4b1ba7b6434bacd764774 */\ /* sqr(t[0], t[0]); */ /* 182: 340223d472ffcd3496374f6c869759aec8ee8 */\ /* sqr(t[0], t[0]); */ /* 183: 680447a8e5ff9a692c6e9ed90d2eb35d91dd0 */\ 
/* sqr(t[0], t[0]); */ /* 184: d0088f51cbff34d258dd3db21a5d66bb23ba0 */\ /* sqr(t[0], t[0]); */ /* 185: 1a0111ea397fe69a4b1ba7b6434bacd7647740 */\ /* sqr(t[0], t[0]); */ /* 186: 340223d472ffcd3496374f6c869759aec8ee80 */\ sqr_n_mul(t[0], t[0], 7, t[11]); /* 187: 340223d472ffcd3496374f6c869759aec8ee97 */\ /* sqr(t[0], t[0]); */ /* 188: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e */\ /* sqr(t[0], t[0]); */ /* 189: d0088f51cbff34d258dd3db21a5d66bb23ba5c */\ /* sqr(t[0], t[0]); */ /* 190: 1a0111ea397fe69a4b1ba7b6434bacd764774b8 */\ /* sqr(t[0], t[0]); */ /* 191: 340223d472ffcd3496374f6c869759aec8ee970 */\ /* sqr(t[0], t[0]); */ /* 192: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e0 */\ /* sqr(t[0], t[0]); */ /* 193: d0088f51cbff34d258dd3db21a5d66bb23ba5c0 */\ /* sqr(t[0], t[0]); */ /* 194: 1a0111ea397fe69a4b1ba7b6434bacd764774b80 */\ /* sqr(t[0], t[0]); */ /* 195: 340223d472ffcd3496374f6c869759aec8ee9700 */\ /* sqr(t[0], t[0]); */ /* 196: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e00 */\ sqr_n_mul(t[0], t[0], 9, t[10]); /* 197: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13 */\ /* sqr(t[0], t[0]); */ /* 198: d0088f51cbff34d258dd3db21a5d66bb23ba5c26 */\ /* sqr(t[0], t[0]); */ /* 199: 1a0111ea397fe69a4b1ba7b6434bacd764774b84c */\ sqr_n_mul(t[0], t[0], 2, t[8]); /* 200: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f */\ /* sqr(t[0], t[0]); */ /* 201: 340223d472ffcd3496374f6c869759aec8ee9709e */\ /* sqr(t[0], t[0]); */ /* 202: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13c */\ /* sqr(t[0], t[0]); */ /* 203: d0088f51cbff34d258dd3db21a5d66bb23ba5c278 */\ /* sqr(t[0], t[0]); */ /* 204: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f0 */\ /* sqr(t[0], t[0]); */ /* 205: 340223d472ffcd3496374f6c869759aec8ee9709e0 */\ sqr_n_mul(t[0], t[0], 5, t[6]); /* 206: 340223d472ffcd3496374f6c869759aec8ee9709e7 */\ /* sqr(t[0], t[0]); */ /* 207: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce */\ /* sqr(t[0], t[0]); */ /* 208: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c */\ /* sqr(t[0], t[0]); */ /* 209: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38 */\ /* sqr(t[0], t[0]); */ /* 210: 340223d472ffcd3496374f6c869759aec8ee9709e70 */\ /* sqr(t[0], t[0]); */ /* 211: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce0 */\ /* sqr(t[0], t[0]); */ /* 212: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c0 */\ /* sqr(t[0], t[0]); */ /* 213: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f380 */\ sqr_n_mul(t[0], t[0], 7, t[1]); /* 214: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f385 */\ /* sqr(t[0], t[0]); */ /* 215: 340223d472ffcd3496374f6c869759aec8ee9709e70a */\ /* sqr(t[0], t[0]); */ /* 216: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce14 */\ /* sqr(t[0], t[0]); */ /* 217: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c28 */\ /* sqr(t[0], t[0]); */ /* 218: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f3850 */\ /* sqr(t[0], t[0]); */ /* 219: 340223d472ffcd3496374f6c869759aec8ee9709e70a0 */\ /* sqr(t[0], t[0]); */ /* 220: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce140 */\ /* sqr(t[0], t[0]); */ /* 221: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c280 */\ sqr_n_mul(t[0], t[0], 7, t[9]); /* 222: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c289 */\ /* sqr(t[0], t[0]); */ /* 223: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512 */\ /* sqr(t[0], t[0]); */ /* 224: 340223d472ffcd3496374f6c869759aec8ee9709e70a24 */\ /* sqr(t[0], t[0]); */ /* 225: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce1448 */\ /* sqr(t[0], t[0]); */ /* 226: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2890 */\ /* sqr(t[0], t[0]); */ /* 227: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f385120 */\ /* sqr(t[0], t[0]); */ /* 228: 
340223d472ffcd3496374f6c869759aec8ee9709e70a240 */\ sqr_n_mul(t[0], t[0], 6, t[11]); /* 229: 340223d472ffcd3496374f6c869759aec8ee9709e70a257 */\ /* sqr(t[0], t[0]); */ /* 230: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144ae */\ /* sqr(t[0], t[0]); */ /* 231: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895c */\ /* sqr(t[0], t[0]); */ /* 232: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512b8 */\ /* sqr(t[0], t[0]); */ /* 233: 340223d472ffcd3496374f6c869759aec8ee9709e70a2570 */\ /* sqr(t[0], t[0]); */ /* 234: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144ae0 */\ sqr_n_mul(t[0], t[0], 5, t[5]); /* 235: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd */\ /* sqr(t[0], t[0]); */ /* 236: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fa */\ /* sqr(t[0], t[0]); */ /* 237: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf4 */\ /* sqr(t[0], t[0]); */ /* 238: 340223d472ffcd3496374f6c869759aec8ee9709e70a257e8 */\ /* sqr(t[0], t[0]); */ /* 239: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd0 */\ /* sqr(t[0], t[0]); */ /* 240: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fa0 */\ sqr_n_mul(t[0], t[0], 5, t[10]); /* 241: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb3 */\ /* sqr(t[0], t[0]); */ /* 242: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf66 */\ /* sqr(t[0], t[0]); */ /* 243: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ecc */\ /* sqr(t[0], t[0]); */ /* 244: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd98 */\ /* sqr(t[0], t[0]); */ /* 245: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb30 */\ /* sqr(t[0], t[0]); */ /* 246: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf660 */\ sqr_n_mul(t[0], t[0], 5, t[10]); /* 247: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf673 */\ /* sqr(t[0], t[0]); */ /* 248: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece6 */\ /* sqr(t[0], t[0]); */ /* 249: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc */\ /* sqr(t[0], t[0]); */ /* 250: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb398 */\ /* sqr(t[0], t[0]); */ /* 251: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730 */\ /* sqr(t[0], t[0]); */ /* 252: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece60 */\ /* sqr(t[0], t[0]); */ /* 253: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc0 */\ /* sqr(t[0], t[0]); */ /* 254: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb3980 */\ /* sqr(t[0], t[0]); */ /* 255: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf67300 */\ sqr_n_mul(t[0], t[0], 8, t[3]); /* 256: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d */\ /* sqr(t[0], t[0]); */ /* 257: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a */\ /* sqr(t[0], t[0]); */ /* 258: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34 */\ /* sqr(t[0], t[0]); */ /* 259: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39868 */\ /* sqr(t[0], t[0]); */ /* 260: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d0 */\ /* sqr(t[0], t[0]); */ /* 261: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a0 */\ /* sqr(t[0], t[0]); */ /* 262: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc340 */\ /* sqr(t[0], t[0]); */ /* 263: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb398680 */\ sqr_n_mul(t[0], t[0], 7, t[2]); /* 264: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb398695 */\ /* sqr(t[0], t[0]); */ /* 265: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a */\ /* sqr(t[0], t[0]); */ /* 266: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a54 */\ /* sqr(t[0], t[0]); */ /* 267: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a8 */\ /* sqr(t[0], t[0]); */ /* 268: 
d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb3986950 */\ /* sqr(t[0], t[0]); */ /* 269: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0 */\ /* sqr(t[0], t[0]); */ /* 270: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a540 */\ /* sqr(t[0], t[0]); */ /* 271: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a80 */\ /* sqr(t[0], t[0]); */ /* 272: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869500 */\ /* sqr(t[0], t[0]); */ /* 273: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a00 */\ sqr_n_mul(t[0], t[0], 9, t[7]); /* 274: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f */\ /* sqr(t[0], t[0]); */ /* 275: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541e */\ /* sqr(t[0], t[0]); */ /* 276: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83c */\ /* sqr(t[0], t[0]); */ /* 277: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb398695078 */\ /* sqr(t[0], t[0]); */ /* 278: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f0 */\ /* sqr(t[0], t[0]); */ /* 279: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541e0 */\ sqr_n_mul(t[0], t[0], 5, t[3]); /* 280: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed */\ /* sqr(t[0], t[0]); */ /* 281: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83da */\ /* sqr(t[0], t[0]); */ /* 282: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b4 */\ /* sqr(t[0], t[0]); */ /* 283: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f68 */\ sqr_n_mul(t[0], t[0], 3, t[8]); /* 284: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b */\ /* sqr(t[0], t[0]); */ /* 285: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed6 */\ /* sqr(t[0], t[0]); */ /* 286: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac */\ /* sqr(t[0], t[0]); */ /* 287: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b58 */\ /* sqr(t[0], t[0]); */ /* 288: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0 */\ /* sqr(t[0], t[0]); */ /* 289: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed60 */\ /* sqr(t[0], t[0]); */ /* 290: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac0 */\ /* sqr(t[0], t[0]); */ /* 291: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b580 */\ /* sqr(t[0], t[0]); */ /* 292: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b00 */\ sqr_n_mul(t[0], t[0], 8, t[7]); /* 293: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f */\ /* sqr(t[0], t[0]); */ /* 294: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61e */\ /* sqr(t[0], t[0]); */ /* 295: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3c */\ /* sqr(t[0], t[0]); */ /* 296: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b5878 */\ sqr_n_mul(t[0], t[0], 3, t[8]); /* 297: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b */\ /* sqr(t[0], t[0]); */ /* 298: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6 */\ /* sqr(t[0], t[0]); */ /* 299: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec */\ /* sqr(t[0], t[0]); */ /* 300: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8 */\ /* sqr(t[0], t[0]); */ /* 301: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b0 */\ /* sqr(t[0], t[0]); */ /* 302: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f60 */\ /* sqr(t[0], t[0]); */ /* 303: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec0 */\ /* sqr(t[0], t[0]); */ /* 304: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d80 */\ sqr_n_mul(t[0], 
t[0], 7, t[9]); /* 305: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d89 */\ /* sqr(t[0], t[0]); */ /* 306: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b12 */\ /* sqr(t[0], t[0]); */ /* 307: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f624 */\ /* sqr(t[0], t[0]); */ /* 308: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec48 */\ /* sqr(t[0], t[0]); */ /* 309: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d890 */\ /* sqr(t[0], t[0]); */ /* 310: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120 */\ /* sqr(t[0], t[0]); */ /* 311: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6240 */\ /* sqr(t[0], t[0]); */ /* 312: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec480 */\ /* sqr(t[0], t[0]); */ /* 313: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8900 */\ /* sqr(t[0], t[0]); */ /* 314: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b1200 */\ sqr_n_mul(t[0], t[0], 9, t[7]); /* 315: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f */\ /* sqr(t[0], t[0]); */ /* 316: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241e */\ /* sqr(t[0], t[0]); */ /* 317: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483c */\ /* sqr(t[0], t[0]); */ /* 318: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d89078 */\ /* sqr(t[0], t[0]); */ /* 319: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f0 */\ /* sqr(t[0], t[0]); */ /* 320: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241e0 */\ /* sqr(t[0], t[0]); */ /* 321: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483c0 */\ sqr_n_mul(t[0], t[0], 6, t[2]); /* 322: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d5 */\ /* sqr(t[0], t[0]); */ /* 323: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aa */\ /* sqr(t[0], t[0]); */ /* 324: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f54 */\ /* sqr(t[0], t[0]); */ /* 325: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241ea8 */\ /* sqr(t[0], t[0]); */ /* 326: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d50 */\ /* sqr(t[0], t[0]); */ /* 327: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aa0 */\ /* sqr(t[0], t[0]); */ /* 328: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f540 */\ sqr_n_mul(t[0], t[0], 6, t[4]); /* 329: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55f */\ /* sqr(t[0], t[0]); */ /* 330: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabe */\ /* sqr(t[0], t[0]); */ /* 331: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57c */\ /* sqr(t[0], t[0]); */ /* 332: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaf8 */\ /* sqr(t[0], t[0]); */ /* 333: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55f0 */\ /* sqr(t[0], t[0]); */ /* 334: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabe0 */\ sqr_n_mul(t[0], t[0], 5, t[4]); /* 335: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabff */\ /* sqr(t[0], t[0]); */ /* 336: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fe */\ /* sqr(t[0], t[0]); */ /* 337: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffc */\ /* sqr(t[0], t[0]); */ /* 338: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ff8 */\ /* sqr(t[0], t[0]); */ /* 
339: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabff0 */\ /* sqr(t[0], t[0]); */ /* 340: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fe0 */\ sqr_n_mul(t[0], t[0], 5, t[4]); /* 341: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fff */\ /* sqr(t[0], t[0]); */ /* 342: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aafffe */\ /* sqr(t[0], t[0]); */ /* 343: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55fffc */\ /* sqr(t[0], t[0]); */ /* 344: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfff8 */\ /* sqr(t[0], t[0]); */ /* 345: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fff0 */\ sqr_n_mul(t[0], t[0], 4, t[3]); /* 346: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd */\ /* sqr(t[0], t[0]); */ /* 347: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffa */\ /* sqr(t[0], t[0]); */ /* 348: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff4 */\ /* sqr(t[0], t[0]); */ /* 349: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffe8 */\ sqr_n_mul(t[0], t[0], 3, t[8]); /* 350: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb */\ /* sqr(t[0], t[0]); */ /* 351: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd6 */\ /* sqr(t[0], t[0]); */ /* 352: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac */\ /* sqr(t[0], t[0]); */ /* 353: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58 */\ /* sqr(t[0], t[0]); */ /* 354: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb0 */\ /* sqr(t[0], t[0]); */ /* 355: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd60 */\ /* sqr(t[0], t[0]); */ /* 356: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac0 */\ /* sqr(t[0], t[0]); */ /* 357: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff580 */\ /* sqr(t[0], t[0]); */ /* 358: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb00 */\ sqr_n_mul(t[0], t[0], 8, t[2]); /* 359: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb15 */\ /* sqr(t[0], t[0]); */ /* 360: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a */\ /* sqr(t[0], t[0]); */ /* 361: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54 */\ /* sqr(t[0], t[0]); */ /* 362: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a8 */\ /* sqr(t[0], t[0]); */ /* 363: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb150 */\ /* sqr(t[0], t[0]); */ /* 364: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a0 */\ /* sqr(t[0], t[0]); */ /* 365: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac540 */\ /* sqr(t[0], t[0]); */ /* 366: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a80 */\ sqr_n_mul(t[0], t[0], 7, t[4]); /* 367: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9f */\ /* sqr(t[0], t[0]); */ /* 368: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153e */\ /* sqr(t[0], t[0]); */ /* 369: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7c */\ /* sqr(t[0], t[0]); */ /* 370: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54f8 */\ /* sqr(t[0], t[0]); 
*/ /* 371: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9f0 */\ /* sqr(t[0], t[0]); */ /* 372: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153e0 */\ sqr_n_mul(t[0], t[0], 5, t[4]); /* 373: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ff */\ /* sqr(t[0], t[0]); */ /* 374: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7fe */\ /* sqr(t[0], t[0]); */ /* 375: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffc */\ /* sqr(t[0], t[0]); */ /* 376: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ff8 */\ /* sqr(t[0], t[0]); */ /* 377: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ff0 */\ /* sqr(t[0], t[0]); */ /* 378: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7fe0 */\ sqr_n_mul(t[0], t[0], 5, t[4]); /* 379: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7fff */\ /* sqr(t[0], t[0]); */ /* 380: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54fffe */\ /* sqr(t[0], t[0]); */ /* 381: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9fffc */\ /* sqr(t[0], t[0]); */ /* 382: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153fff8 */\ /* sqr(t[0], t[0]); */ /* 383: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7fff0 */\ sqr_n_mul(t[0], t[0], 4, t[7]); /* 384: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff */\ /* sqr(t[0], t[0]); */ /* 385: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffe */\ /* sqr(t[0], t[0]); */ /* 386: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffc */\ /* sqr(t[0], t[0]); */ /* 387: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffff8 */\ /* sqr(t[0], t[0]); */ /* 388: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff0 */\ sqr_n_mul(t[0], t[0], 4, t[6]); /* 389: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff7 */\ /* sqr(t[0], t[0]); */ /* 390: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee */\ /* sqr(t[0], t[0]); */ /* 391: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdc */\ /* sqr(t[0], t[0]); */ /* 392: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb8 */\ /* sqr(t[0], t[0]); */ /* 393: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff70 */\ /* sqr(t[0], t[0]); */ /* 394: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee0 */\ /* sqr(t[0], t[0]); */ /* 395: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdc0 */\ /* sqr(t[0], t[0]); */ /* 396: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb80 */\ sqr_n_mul(t[0], t[0], 7, t[4]); /* 397: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9f */\ /* sqr(t[0], t[0]); */ /* 398: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73e */\ /* sqr(t[0], t[0]); */ /* 399: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7c */\ /* sqr(t[0], t[0]); */ /* 400: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcf8 */\ /* sqr(t[0], t[0]); */ /* 401: 
1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9f0 */\ /* sqr(t[0], t[0]); */ /* 402: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73e0 */\ sqr_n_mul(t[0], t[0], 5, t[5]); /* 403: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fd */\ /* sqr(t[0], t[0]); */ /* 404: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fa */\ /* sqr(t[0], t[0]); */ /* 405: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff4 */\ /* sqr(t[0], t[0]); */ /* 406: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9fe8 */\ /* sqr(t[0], t[0]); */ /* 407: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fd0 */\ /* sqr(t[0], t[0]); */ /* 408: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fa0 */\ sqr_n_mul(t[0], t[0], 5, t[4]); /* 409: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fbf */\ /* sqr(t[0], t[0]); */ /* 410: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7e */\ /* sqr(t[0], t[0]); */ /* 411: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9fefc */\ /* sqr(t[0], t[0]); */ /* 412: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdf8 */\ /* sqr(t[0], t[0]); */ /* 413: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fbf0 */\ /* sqr(t[0], t[0]); */ /* 414: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7e0 */\ sqr_n_mul(t[0], t[0], 5, t[4]); /* 415: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7ff */\ /* sqr(t[0], t[0]); */ /* 416: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffe */\ /* sqr(t[0], t[0]); */ /* 417: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdffc */\ /* sqr(t[0], t[0]); */ /* 418: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fbff8 */\ /* sqr(t[0], t[0]); */ /* 419: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7ff0 */\ /* sqr(t[0], t[0]); */ /* 420: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffe0 */\ sqr_n_mul(t[0], t[0], 5, t[4]); /* 421: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffff */\ /* sqr(t[0], t[0]); */ /* 422: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdfffe */\ /* sqr(t[0], t[0]); */ /* 423: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fbfffc */\ /* sqr(t[0], t[0]); */ /* 424: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7fff8 */\ /* sqr(t[0], t[0]); */ /* 425: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffff0 */\ /* sqr(t[0], t[0]); */ /* 426: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdfffe0 */\ sqr_n_mul(t[0], t[0], 5, t[4]); /* 427: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdfffff */\ /* sqr(t[0], t[0]); */ /* 428: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fbffffe */\ /* sqr(t[0], t[0]); */ /* 429: 
d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7ffffc */\ /* sqr(t[0], t[0]); */ /* 430: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9fefffff8 */\ /* sqr(t[0], t[0]); */ /* 431: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdfffff0 */\ /* sqr(t[0], t[0]); */ /* 432: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fbffffe0 */\ sqr_n_mul(t[0], t[0], 5, t[4]); /* 433: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fbffffff */\ /* sqr(t[0], t[0]); */ /* 434: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7fffffe */\ /* sqr(t[0], t[0]); */ /* 435: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffffffc */\ /* sqr(t[0], t[0]); */ /* 436: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdffffff8 */\ /* sqr(t[0], t[0]); */ /* 437: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fbffffff0 */\ /* sqr(t[0], t[0]); */ /* 438: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7fffffe0 */\ sqr_n_mul(t[0], t[0], 5, t[4]); /* 439: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7fffffff */\ /* sqr(t[0], t[0]); */ /* 440: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9fefffffffe */\ /* sqr(t[0], t[0]); */ /* 441: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdfffffffc */\ /* sqr(t[0], t[0]); */ /* 442: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fbfffffff8 */\ /* sqr(t[0], t[0]); */ /* 443: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7fffffff0 */\ sqr_n_mul(t[0], t[0], 4, t[3]); /* 444: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7fffffffd */\ /* sqr(t[0], t[0]); */ /* 445: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffffffffa */\ /* sqr(t[0], t[0]); */ /* 446: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdffffffff4 */\ /* sqr(t[0], t[0]); */ /* 447: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fbfffffffe8 */\ /* sqr(t[0], t[0]); */ /* 448: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7fffffffd0 */\ /* sqr(t[0], t[0]); */ /* 449: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffffffffa0 */\ /* sqr(t[0], t[0]); */ /* 450: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdffffffff40 */\ sqr_n_mul(t[0], t[0], 6, t[2]); /* 451: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdffffffff55 */\ /* sqr(t[0], t[0]); */ /* 452: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fbfffffffeaa */\ /* sqr(t[0], t[0]); */ /* 453: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7fffffffd54 */\ /* sqr(t[0], t[0]); */ /* 454: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffffffffaa8 */\ /* sqr(t[0], t[0]); */ /* 455: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdffffffff550 */\ sqr_n_mul(t[0], t[0], 4, t[1]); /* 456: 
340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdffffffff555 */\
	sqr(out, t[0]); /* 457: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fbfffffffeaaa */\
} while(0)

================================================
FILE: src/sqrt.c
================================================
/*
 * Copyright Supranational LLC
 * Licensed under the Apache License, Version 2.0, see LICENSE for details.
 * SPDX-License-Identifier: Apache-2.0
 */

#include "fields.h"

#ifdef __OPTIMIZE_SIZE__
static void recip_sqrt_fp_3mod4(vec384 out, const vec384 inp)
{
    static const byte BLS_12_381_P_minus_3_div_4[] = {
        TO_BYTES(0xee7fbfffffffeaaa), TO_BYTES(0x07aaffffac54ffff),
        TO_BYTES(0xd9cc34a83dac3d89), TO_BYTES(0xd91dd2e13ce144af),
        TO_BYTES(0x92c6e9ed90d2eb35), TO_BYTES(0x0680447a8e5ff9a6)
    };

    exp_mont_384(out, inp, BLS_12_381_P_minus_3_div_4, 379, BLS12_381_P, p0);
}
#else
# if 1
/*
 * "383"-bit variant omits full reductions at the ends of squarings,
 * which results in up to ~15% improvement. [One can improve further
 * by omitting full reductions even after multiplications and
 * performing final reduction at the very end of the chain.]
 */
static inline void sqr_n_mul_fp(vec384 out, const vec384 a, size_t count,
                                const vec384 b)
{   sqr_n_mul_mont_383(out, a, count, BLS12_381_P, p0, b);   }
# else
static void sqr_n_mul_fp(vec384 out, const vec384 a, size_t count,
                         const vec384 b)
{
    while(count--) {
        sqr_fp(out, a);
        a = out;
    }
    mul_fp(out, out, b);
}
# endif

# define sqr(ret,a)             sqr_fp(ret,a)
# define mul(ret,a,b)           mul_fp(ret,a,b)
# define sqr_n_mul(ret,a,n,b)   sqr_n_mul_fp(ret,a,n,b)

# include "sqrt-addchain.h"
static void recip_sqrt_fp_3mod4(vec384 out, const vec384 inp)
{   RECIP_SQRT_MOD_BLS12_381_P(out, inp, vec384);   }
# undef RECIP_SQRT_MOD_BLS12_381_P

# undef sqr_n_mul
# undef sqr
# undef mul
#endif

static bool_t recip_sqrt_fp(vec384 out, const vec384 inp)
{
    vec384 t0, t1;
    bool_t ret;

    recip_sqrt_fp_3mod4(t0, inp);

    mul_fp(t1, t0, inp);
    sqr_fp(t1, t1);
    ret = vec_is_equal(t1, inp, sizeof(t1));
    vec_copy(out, t0, sizeof(t0));

    return ret;
}

static bool_t sqrt_fp(vec384 out, const vec384 inp)
{
    vec384 t0, t1;
    bool_t ret;

    recip_sqrt_fp_3mod4(t0, inp);

    mul_fp(t0, t0, inp);
    sqr_fp(t1, t0);
    ret = vec_is_equal(t1, inp, sizeof(t1));
    vec_copy(out, t0, sizeof(t0));

    return ret;
}

int blst_fp_sqrt(vec384 out, const vec384 inp)
{   return (int)sqrt_fp(out, inp);   }

int blst_fp_is_square(const vec384 inp)
{   return (int)ct_is_square_mod_384(inp, BLS12_381_P);   }
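/*
 * Aside (illustrative, not part of blst): both recip_sqrt_fp_3mod4 variants
 * above raise |inp| to the power (p-3)/4. Since p ≡ 3 (mod 4), multiplying
 * the result by |inp| gives inp^((p+1)/4), and squaring that yields
 * inp^((p+1)/2) = inp * inp^((p-1)/2), which equals |inp| exactly when |inp|
 * is a quadratic residue (Euler's criterion) -- the check recip_sqrt_fp and
 * sqrt_fp perform. The same recipe with a toy prime and hypothetical demo_*
 * names:
 */
#include <stdint.h>

static uint64_t demo_powmod(uint64_t b, uint64_t e, uint64_t p)
{   /* square-and-multiply; assumes p < 2^32 so products fit in 64 bits */
    uint64_t r = 1;

    for (b %= p; e; e >>= 1, b = b * b % p)
        if (e & 1)
            r = r * b % p;

    return r;
}

/* returns 1 and sets |*s| to sqrt(x) mod p when x is a square, p ≡ 3 mod 4 */
static int demo_sqrt_3mod4(uint64_t *s, uint64_t x, uint64_t p)
{
    uint64_t c = demo_powmod(x, (p - 3) / 4, p);    /* "reciprocal sqrt" */

    *s = c * x % p;                                 /* x^((p+1)/4) */
    return *s * *s % p == x;    /* e.g. p = 23, x = 13 gives *s = 6 */
}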
static bool_t sqrt_align_fp2(vec384x out, const vec384x ret,
                             const vec384x sqrt, const vec384x inp)
{
    static const vec384x sqrt_minus_1 = { { 0 }, { ONE_MONT_P } };
    static const vec384x sqrt_sqrt_minus_1 = {
      /*
       * "magic" number is ±2^((p-3)/4)%p, which is "1/sqrt(2)",
       * in quotes because 2*"1/sqrt(2)"^2 == -1 mod p, not 1,
       * but it pivots into "complex" plane nevertheless...
       */
      { TO_LIMB_T(0x3e2f585da55c9ad1), TO_LIMB_T(0x4294213d86c18183),
        TO_LIMB_T(0x382844c88b623732), TO_LIMB_T(0x92ad2afd19103e18),
        TO_LIMB_T(0x1d794e4fac7cf0b9), TO_LIMB_T(0x0bd592fc7d825ec8) },
      { TO_LIMB_T(0x7bcfa7a25aa30fda), TO_LIMB_T(0xdc17dec12a927e7c),
        TO_LIMB_T(0x2f088dd86b4ebef1), TO_LIMB_T(0xd1ca2087da74d4a7),
        TO_LIMB_T(0x2da2596696cebc1d), TO_LIMB_T(0x0e2b7eedbbfd87d2) }
    };
    static const vec384x sqrt_minus_sqrt_minus_1 = {
      { TO_LIMB_T(0x7bcfa7a25aa30fda), TO_LIMB_T(0xdc17dec12a927e7c),
        TO_LIMB_T(0x2f088dd86b4ebef1), TO_LIMB_T(0xd1ca2087da74d4a7),
        TO_LIMB_T(0x2da2596696cebc1d), TO_LIMB_T(0x0e2b7eedbbfd87d2) },
      { TO_LIMB_T(0x7bcfa7a25aa30fda), TO_LIMB_T(0xdc17dec12a927e7c),
        TO_LIMB_T(0x2f088dd86b4ebef1), TO_LIMB_T(0xd1ca2087da74d4a7),
        TO_LIMB_T(0x2da2596696cebc1d), TO_LIMB_T(0x0e2b7eedbbfd87d2) }
    };
    vec384x coeff, t0, t1;
    bool_t is_sqrt, flag;

    /*
     * Instead of multiple trial squarings we can perform just one
     * and see if the result is "rotated by multiple of 90°" in
     * relation to |inp|, and "rotate" |ret| accordingly.
     */
    sqr_fp2(t0, sqrt);
    /* "sqrt(|inp|)"^2 = (a + b*i)^2 = (a^2-b^2) + 2ab*i */

    /* (a^2-b^2) + 2ab*i == |inp| ? |ret| is spot on */
    sub_fp2(t1, t0, inp);
    is_sqrt = vec_is_zero(t1, sizeof(t1));
    vec_copy(coeff, BLS12_381_Rx.p2, sizeof(coeff));

    /* -(a^2-b^2) - 2ab*i == |inp| ? "rotate |ret| by 90°" */
    add_fp2(t1, t0, inp);
    vec_select(coeff, sqrt_minus_1, coeff, sizeof(coeff),
                      flag = vec_is_zero(t1, sizeof(t1)));
    is_sqrt |= flag;

    /* 2ab - (a^2-b^2)*i == |inp| ? "rotate |ret| by 135°" */
    sub_fp(t1[0], t0[0], inp[1]);
    add_fp(t1[1], t0[1], inp[0]);
    vec_select(coeff, sqrt_sqrt_minus_1, coeff, sizeof(coeff),
                      flag = vec_is_zero(t1, sizeof(t1)));
    is_sqrt |= flag;

    /* -2ab + (a^2-b^2)*i == |inp| ? "rotate |ret| by 45°" */
    add_fp(t1[0], t0[0], inp[1]);
    sub_fp(t1[1], t0[1], inp[0]);
    vec_select(coeff, sqrt_minus_sqrt_minus_1, coeff, sizeof(coeff),
                      flag = vec_is_zero(t1, sizeof(t1)));
    is_sqrt |= flag;

    /* actual "rotation" */
    mul_fp2(out, ret, coeff);

    return is_sqrt;
}
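/*
 * Aside (illustrative, not part of blst): the same "alignment" can be seen
 * with ordinary complex numbers. If a candidate r satisfies, say,
 * r^2 == i*z rather than z, then multiplying r by w = sqrt(-i), an eighth
 * root of unity, fixes it: (r*w)^2 = (i*z)*(-i) = z. The constants above
 * play the role of w over Fp2, selected in constant time via vec_select;
 * the sketch below uses C99 <complex.h> doubles and branches instead
 * (demo_ name is hypothetical):
 */
#include <complex.h>

/* pick w in {1, i, sqrt(i), sqrt(-i)} so that (r*w)^2 == z; returns 0 when
 * no rotation of |r| is a square root of |z| (floating-point tolerance) */
static double complex demo_align(double complex r, double complex z)
{
    const double complex w[4] = { 1, I, csqrt(I), csqrt(-I) };

    for (int k = 0; k < 4; k++)
        if (cabs(r * w[k] * r * w[k] - z) < 1e-9)
            return r * w[k];

    return 0;
}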
/*
 * |inp| = a + b*i
 */
static bool_t recip_sqrt_fp2(vec384x out, const vec384x inp,
                             const vec384x recip_ZZZ,
                             const vec384x magic_ZZZ)
{
    vec384 aa, bb, cc;
    vec384x inp_;
    bool_t is_sqrt;

    sqr_fp(aa, inp[0]);
    sqr_fp(bb, inp[1]);
    add_fp(aa, aa, bb);

    is_sqrt = recip_sqrt_fp(cc, aa);    /* 1/sqrt(a²+b²) */

    /* if |inp| isn't a quadratic residue, multiply by "1/Z³" ... */
    mul_fp2(inp_, inp, recip_ZZZ);

    /* ... and adjust |aa| and |cc| accordingly */
    {
        vec384 za, zc;

        mul_fp(za, aa, magic_ZZZ[0]);   /* aa*(za² + zb²) */
        mul_fp(zc, cc, magic_ZZZ[1]);   /* cc*(za² + zb²)^((p-3)/4) */
        vec_select(aa, aa, za, sizeof(aa), is_sqrt);
        vec_select(cc, cc, zc, sizeof(cc), is_sqrt);
    }
    vec_select(inp_, inp, inp_, sizeof(inp_), is_sqrt);

    mul_fp(aa, aa, cc);                 /* sqrt(a²+b²) */

    sub_fp(bb, inp_[0], aa);
    add_fp(aa, inp_[0], aa);
    vec_select(aa, bb, aa, sizeof(aa), vec_is_zero(aa, sizeof(aa)));
    div_by_2_fp(aa, aa);                /* (a ± sqrt(a²+b²))/2 */

    /* if it says "no sqrt," final "align" will find the right one... */
    (void)recip_sqrt_fp(out[0], aa);    /* 1/sqrt((a ± sqrt(a²+b²))/2) */

    div_by_2_fp(out[1], inp_[1]);
    mul_fp(out[1], out[1], out[0]);     /* b/(2*sqrt((a ± sqrt(a²+b²))/2)) */
    mul_fp(out[0], out[0], aa);         /* sqrt((a ± sqrt(a²+b²))/2) */

    /* bound to succeed */
    (void)sqrt_align_fp2(out, out, out, inp_);

    mul_fp(out[0], out[0], cc);         /* invert the result */
    mul_fp(out[1], out[1], cc);
    neg_fp(out[1], out[1]);

    return is_sqrt;
}

static bool_t sqrt_fp2(vec384x out, const vec384x inp)
{
    vec384x ret;
    vec384 aa, bb;

    sqr_fp(aa, inp[0]);
    sqr_fp(bb, inp[1]);
    add_fp(aa, aa, bb);

    /* don't pay attention to the return value, final "align" will tell... */
    (void)sqrt_fp(aa, aa);              /* sqrt(a²+b²) */

    sub_fp(bb, inp[0], aa);
    add_fp(aa, inp[0], aa);
    vec_select(aa, bb, aa, sizeof(aa), vec_is_zero(aa, sizeof(aa)));
    div_by_2_fp(aa, aa);                /* (a ± sqrt(a²+b²))/2 */

    /* if it says "no sqrt," final "align" will find the right one... */
    (void)recip_sqrt_fp(ret[0], aa);    /* 1/sqrt((a ± sqrt(a²+b²))/2) */

    div_by_2_fp(ret[1], inp[1]);
    mul_fp(ret[1], ret[1], ret[0]);     /* b/(2*sqrt((a ± sqrt(a²+b²))/2)) */
    mul_fp(ret[0], ret[0], aa);         /* sqrt((a ± sqrt(a²+b²))/2) */

    /*
     * Now see if |ret| is or can be made sqrt(|inp|)...
     */
    return sqrt_align_fp2(out, ret, ret, inp);
}

int blst_fp2_sqrt(vec384x out, const vec384x inp)
{   return (int)sqrt_fp2(out, inp);   }

int blst_fp2_is_square(const vec384x inp)
{
    vec384 aa, bb;

    sqr_fp(aa, inp[0]);
    sqr_fp(bb, inp[1]);
    add_fp(aa, aa, bb);

    return (int)ct_is_square_mod_384(aa, BLS12_381_P);
}

================================================
FILE: src/vect.c
================================================
/*
 * Copyright Supranational LLC
 * Licensed under the Apache License, Version 2.0, see LICENSE for details.
 * SPDX-License-Identifier: Apache-2.0
 */

#include "vect.h"

#ifdef __BLST_NO_ASM__
# include "no_asm.h"
#endif

/*
 * Following are some reference C implementations to assist the development
 * of new assembly modules, as starting-point stand-ins and for
 * cross-checking. In order to "polyfill" a specific subroutine, redefine
 * it on the compiler command line, e.g. -Dmul_mont_384x=_mul_mont_384x.
 */
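/*
 * Aside (illustrative, not part of blst): the "polyfill" hook above works by
 * macro renaming. Building with, say, -Dmul_mont_384x=_mul_mont_384x defines
 * the macro, so the matching #if guard below turns true and the C reference
 * implementation is compiled -- under the expanded name _mul_mont_384x --
 * while every call site in the library expands to that same name, displacing
 * the assembly version. The same pattern with hypothetical names:
 *
 *     // fast_add.S normally provides: void fast_add(int *r, const int *a);
 *     // build with -Dfast_add=ref_add to route callers to the C fallback
 *     #ifdef fast_add
 *     void fast_add(int *r, const int *a)   // compiles as ref_add()
 *     {   *r = *a + *a;   }
 *     #endif
 */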
#ifdef lshift_mod_384
inline void lshift_mod_384(vec384 ret, const vec384 a, size_t n,
                           const vec384 mod)
{
    while(n--)
        add_mod_384(ret, a, a, mod), a = ret;
}
#endif

#ifdef mul_by_8_mod_384
inline void mul_by_8_mod_384(vec384 ret, const vec384 a, const vec384 mod)
{   lshift_mod_384(ret, a, 3, mod);   }
#endif

#ifdef mul_by_3_mod_384
inline void mul_by_3_mod_384(vec384 ret, const vec384 a, const vec384 mod)
{
    vec384 t;

    add_mod_384(t, a, a, mod);
    add_mod_384(ret, t, a, mod);
}
#endif

#ifdef mul_by_3_mod_384x
inline void mul_by_3_mod_384x(vec384x ret, const vec384x a, const vec384 mod)
{
    mul_by_3_mod_384(ret[0], a[0], mod);
    mul_by_3_mod_384(ret[1], a[1], mod);
}
#endif

#ifdef mul_by_8_mod_384x
inline void mul_by_8_mod_384x(vec384x ret, const vec384x a, const vec384 mod)
{
    mul_by_8_mod_384(ret[0], a[0], mod);
    mul_by_8_mod_384(ret[1], a[1], mod);
}
#endif

#ifdef mul_by_1_plus_i_mod_384x
inline void mul_by_1_plus_i_mod_384x(vec384x ret, const vec384x a,
                                     const vec384 mod)
{
    vec384 t;

    add_mod_384(t, a[0], a[1], mod);
    sub_mod_384(ret[0], a[0], a[1], mod);
    vec_copy(ret[1], t, sizeof(t));
}
#endif

#ifdef add_mod_384x
inline void add_mod_384x(vec384x ret, const vec384x a, const vec384x b,
                         const vec384 mod)
{
    add_mod_384(ret[0], a[0], b[0], mod);
    add_mod_384(ret[1], a[1], b[1], mod);
}
#endif

#ifdef sub_mod_384x
inline void sub_mod_384x(vec384x ret, const vec384x a, const vec384x b,
                         const vec384 mod)
{
    sub_mod_384(ret[0], a[0], b[0], mod);
    sub_mod_384(ret[1], a[1], b[1], mod);
}
#endif

#ifdef lshift_mod_384x
inline void lshift_mod_384x(vec384x ret, const vec384x a, size_t n,
                            const vec384 mod)
{
    lshift_mod_384(ret[0], a[0], n, mod);
    lshift_mod_384(ret[1], a[1], n, mod);
}
#endif

#if defined(mul_mont_384x) && !(defined(__ADX__) && !defined(__BLST_PORTABLE__))
void mul_mont_384x(vec384x ret, const vec384x a, const vec384x b,
                   const vec384 mod, limb_t n0)
{
    vec768 t0, t1, t2;
    vec384 aa, bb;

    mul_384(t0, a[0], b[0]);
    mul_384(t1, a[1], b[1]);

    add_mod_384(aa, a[0], a[1], mod);
    add_mod_384(bb, b[0], b[1], mod);
    mul_384(t2, aa, bb);
    sub_mod_384x384(t2, t2, t0, mod);
    sub_mod_384x384(t2, t2, t1, mod);

    sub_mod_384x384(t0, t0, t1, mod);

    redc_mont_384(ret[0], t0, mod, n0);
    redc_mont_384(ret[1], t2, mod, n0);
}
#endif

#if defined(sqr_mont_384x) && !(defined(__ADX__) && !defined(__BLST_PORTABLE__))
void sqr_mont_384x(vec384x ret, const vec384x a, const vec384 mod, limb_t n0)
{
    vec384 t0, t1;

    add_mod_384(t0, a[0], a[1], mod);
    sub_mod_384(t1, a[0], a[1], mod);

    mul_mont_384(ret[1], a[0], a[1], mod, n0);
    add_mod_384(ret[1], ret[1], ret[1], mod);

    mul_mont_384(ret[0], t0, t1, mod, n0);
}
#endif
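/*
 * Aside (illustrative, not part of blst): mul_mont_384x above uses the
 * Karatsuba-style identity to multiply complex values with three big
 * multiplications instead of four: with t0 = a0*b0, t1 = a1*b1 and
 * t2 = (a0+a1)*(b0+b1), the cross term is t2-t0-t1 and, since i^2 = -1,
 * the real part is t0-t1. Toy integer version with a hypothetical name:
 */
static void demo_mul_complex_3m(long *rr, long *ri,
                                long ar, long ai, long br, long bi)
{
    long t0 = ar * br;
    long t1 = ai * bi;
    long t2 = (ar + ai) * (br + bi);

    *ri = t2 - t0 - t1;     /* ar*bi + ai*br */
    *rr = t0 - t1;          /* i^2 = -1 folds t1 into the real part */
    /* e.g. (3+4i)*(5+6i) -> rr = -9, ri = 38 */
}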
limb_t div_3_limbs(const limb_t dividend_top[2], limb_t d_lo, limb_t d_hi);
limb_t quot_rem_128(limb_t *quot_rem, const limb_t *divisor, limb_t quotient);
limb_t quot_rem_64(limb_t *quot_rem, const limb_t *divisor, limb_t quotient);

/*
 * Divide 255-bit |val| by z^2 yielding 128-bit quotient and remainder in place.
 */
static void div_by_zz(limb_t val[])
{
    static const limb_t zz[] = { TO_LIMB_T(0x0000000100000000),
                                 TO_LIMB_T(0xac45a4010001a402) };
    size_t loop, zz_len = sizeof(zz)/sizeof(zz[0]);
    limb_t d_lo, d_hi;

    d_lo = zz[zz_len - 2];
    d_hi = zz[zz_len - 1];
    for (loop = zz_len, zz_len--; loop--;) {
        limb_t q = div_3_limbs(val + loop + zz_len, d_lo, d_hi);
        (void)quot_rem_128(val + loop, zz, q);
    }
    /* remainder is in low half of val[], quotient is in high */
}

/*
 * Divide 128-bit |val| by z yielding 64-bit quotient and remainder in place.
 */
static void div_by_z(limb_t val[])
{
    static const limb_t z[] = { TO_LIMB_T(0xd201000000010000) };
    size_t loop, z_len = sizeof(z)/sizeof(z[0]);
    limb_t d_lo, d_hi;

    d_lo = (sizeof(z) == sizeof(limb_t)) ? 0 : z[z_len - 2];
    d_hi = z[z_len - 1];
    for (loop = z_len, z_len--; loop--;) {
        limb_t q = div_3_limbs(val + loop + z_len, d_lo, d_hi);
        (void)quot_rem_64(val + loop, z, q);
    }
    /* remainder is in low half of val[], quotient is in high */
}

================================================
FILE: src/vect.h
================================================
/*
 * Copyright Supranational LLC
 * Licensed under the Apache License, Version 2.0, see LICENSE for details.
 * SPDX-License-Identifier: Apache-2.0
 */
#ifndef __BLS12_381_ASM_VECT_H__
#define __BLS12_381_ASM_VECT_H__

#include <stddef.h>

#if defined(__x86_64__) || defined(__aarch64__)
/* These are available even in ILP32 flavours, but even then they are
 * capable of performing 64-bit operations as efficiently as in *P64. */
typedef unsigned long long limb_t;
# define LIMB_T_BITS    64

#elif defined(_WIN64)   /* Win64 is P64 */
typedef unsigned __int64 limb_t;
# define LIMB_T_BITS    64

#elif defined(__BLST_NO_ASM__) || defined(__wasm64__)
typedef unsigned int limb_t;
# define LIMB_T_BITS    32
# ifndef __BLST_NO_ASM__
#  define __BLST_NO_ASM__
# endif

#else                   /* 32 bits on 32-bit platforms, 64 on 64-bit */
typedef unsigned long limb_t;
# ifdef _LP64
#  define LIMB_T_BITS   64
# else
#  define LIMB_T_BITS   32
#  define __BLST_NO_ASM__
# endif
#endif

/*
 * Why isn't LIMB_T_BITS defined as 8*sizeof(limb_t)? Because the
 * pre-processor knows nothing about sizeof(anything)...
 */
#if LIMB_T_BITS == 64
# define TO_LIMB_T(limb64)     limb64
#else
# define TO_LIMB_T(limb64)     (limb_t)limb64,(limb_t)(limb64>>32)
#endif

#define NLIMBS(bits)   (bits/LIMB_T_BITS)

typedef limb_t vec256[NLIMBS(256)];
typedef limb_t vec512[NLIMBS(512)];
typedef limb_t vec384[NLIMBS(384)];
typedef limb_t vec768[NLIMBS(768)];
typedef vec384 vec384x[2];      /* 0 is "real" part, 1 is "imaginary" */

typedef unsigned char byte;
#define TO_BYTES(limb64)    (byte)limb64,(byte)(limb64>>8),\
                            (byte)(limb64>>16),(byte)(limb64>>24),\
                            (byte)(limb64>>32),(byte)(limb64>>40),\
                            (byte)(limb64>>48),(byte)(limb64>>56)
typedef byte pow256[256/8];
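/*
 * Aside (illustrative, not part of blst): TO_LIMB_T and TO_BYTES exist so
 * the same 64-bit literals can initialize both 32- and 64-bit limb arrays
 * and little-endian byte arrays at compile time. What TO_BYTES expands to
 * for one constant, spelled out with a hypothetical demo_ array:
 */
static const byte demo_to_bytes[8] =
    { TO_BYTES(0xee7fbfffffffeaaa) };
    /* yields { 0xaa, 0xea, 0xff, 0xff, 0xff, 0xbf, 0x7f, 0xee },
     * least significant byte first */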
/*
 * Internal Boolean type, Boolean by value, hence safe to cast to or
 * reinterpret as 'bool'.
 */
typedef limb_t bool_t;

/*
 * Assembly subroutines...
 */
#if defined(__ADX__) /* e.g. -march=broadwell */ && !defined(__BLST_PORTABLE__)\
                                                 && !defined(__BLST_NO_ASM__)
# define mul_mont_sparse_256 mulx_mont_sparse_256
# define sqr_mont_sparse_256 sqrx_mont_sparse_256
# define from_mont_256 fromx_mont_256
# define redc_mont_256 redcx_mont_256
# define mul_mont_384 mulx_mont_384
# define sqr_mont_384 sqrx_mont_384
# define sqr_n_mul_mont_384 sqrx_n_mul_mont_384
# define sqr_n_mul_mont_383 sqrx_n_mul_mont_383
# define mul_384 mulx_384
# define sqr_384 sqrx_384
# define redc_mont_384 redcx_mont_384
# define from_mont_384 fromx_mont_384
# define sgn0_pty_mont_384 sgn0x_pty_mont_384
# define sgn0_pty_mont_384x sgn0x_pty_mont_384x
# define ct_inverse_mod_384 ctx_inverse_mod_384
#endif

void mul_mont_sparse_256(vec256 ret, const vec256 a, const vec256 b,
                         const vec256 p, limb_t n0);
void sqr_mont_sparse_256(vec256 ret, const vec256 a, const vec256 p,
                         limb_t n0);
void redc_mont_256(vec256 ret, const vec512 a, const vec256 p, limb_t n0);
void from_mont_256(vec256 ret, const vec256 a, const vec256 p, limb_t n0);

void add_mod_256(vec256 ret, const vec256 a, const vec256 b, const vec256 p);
void sub_mod_256(vec256 ret, const vec256 a, const vec256 b, const vec256 p);
void mul_by_3_mod_256(vec256 ret, const vec256 a, const vec256 p);
void cneg_mod_256(vec256 ret, const vec256 a, bool_t flag, const vec256 p);
void lshift_mod_256(vec256 ret, const vec256 a, size_t count, const vec256 p);
void rshift_mod_256(vec256 ret, const vec256 a, size_t count, const vec256 p);
bool_t eucl_inverse_mod_256(vec256 ret, const vec256 a, const vec256 p,
                            const vec256 one);
limb_t check_mod_256(const pow256 a, const vec256 p);
limb_t add_n_check_mod_256(pow256 ret, const pow256 a, const pow256 b,
                           const vec256 p);
limb_t sub_n_check_mod_256(pow256 ret, const pow256 a, const pow256 b,
                           const vec256 p);

void vec_prefetch(const void *ptr, size_t len);

void mul_mont_384(vec384 ret, const vec384 a, const vec384 b,
                  const vec384 p, limb_t n0);
void sqr_mont_384(vec384 ret, const vec384 a, const vec384 p, limb_t n0);
void sqr_n_mul_mont_384(vec384 ret, const vec384 a, size_t count,
                        const vec384 p, limb_t n0, const vec384 b);
void sqr_n_mul_mont_383(vec384 ret, const vec384 a, size_t count,
                        const vec384 p, limb_t n0, const vec384 b);

void mul_384(vec768 ret, const vec384 a, const vec384 b);
void sqr_384(vec768 ret, const vec384 a);
void redc_mont_384(vec384 ret, const vec768 a, const vec384 p, limb_t n0);
void from_mont_384(vec384 ret, const vec384 a, const vec384 p, limb_t n0);
limb_t sgn0_pty_mont_384(const vec384 a, const vec384 p, limb_t n0);
limb_t sgn0_pty_mont_384x(const vec384x a, const vec384 p, limb_t n0);
limb_t sgn0_pty_mod_384(const vec384 a, const vec384 p);
limb_t sgn0_pty_mod_384x(const vec384x a, const vec384 p);

void add_mod_384(vec384 ret, const vec384 a, const vec384 b, const vec384 p);
void sub_mod_384(vec384 ret, const vec384 a, const vec384 b, const vec384 p);
void mul_by_8_mod_384(vec384 ret, const vec384 a, const vec384 p);
void mul_by_3_mod_384(vec384 ret, const vec384 a, const vec384 p);
void cneg_mod_384(vec384 ret, const vec384 a, bool_t flag, const vec384 p);
void lshift_mod_384(vec384 ret, const vec384 a, size_t count, const vec384 p);
void rshift_mod_384(vec384 ret, const vec384 a, size_t count, const vec384 p);
void div_by_2_mod_384(vec384 ret, const vec384 a, const vec384 p);
void ct_inverse_mod_384(vec768 ret, const vec384 inp, const vec384 mod,
                        const vec384 modx);
void ct_inverse_mod_256(vec512 ret, const vec256 inp, const vec256 mod,
                        const vec256 modx);
bool_t ct_is_square_mod_384(const vec384 inp, const vec384 mod);
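/*
 * Aside (illustrative, not part of blst): the *_mont_* routines above operate
 * on Montgomery residues, e.g. mul_mont_384 computes a*b/R mod p (R = 2^384),
 * with |n0| being the negated inverse of |p| modulo 2^LIMB_T_BITS. A one-word
 * toy REDC, assuming an odd p < 2^31 and t a product of values already
 * reduced mod p, so no intermediate overflows (demo_* names are hypothetical):
 */
#include <stdint.h>

static uint32_t demo_redc_32(uint64_t t, uint32_t p, uint32_t n0)
{
    uint32_t m = (uint32_t)t * n0;      /* m = t * (-1/p) mod 2^32 */
    uint64_t s = t + (uint64_t)m * p;   /* low 32 bits of |s| are zero */
    uint32_t r = (uint32_t)(s >> 32);   /* t/2^32 mod p, up to one |p| */

    return r >= p ? r - p : r;          /* final conditional subtraction */
}

static uint32_t demo_n0_32(uint32_t p)  /* -1/p mod 2^32, p odd */
{
    uint32_t inv = p;                   /* p*p ≡ 1 (mod 8), 3 good bits */

    for (int i = 0; i < 4; i++)         /* Newton: bits double each step */
        inv *= 2 - p * inv;

    return 0U - inv;
}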
#if defined(__ADX__) /* e.g. -march=broadwell */ && !defined(__BLST_PORTABLE__)
# define mul_mont_384x mulx_mont_384x
# define sqr_mont_384x sqrx_mont_384x
# define sqr_mont_382x sqrx_mont_382x
# define mul_382x mulx_382x
# define sqr_382x sqrx_382x
#endif

void mul_mont_384x(vec384x ret, const vec384x a, const vec384x b,
                   const vec384 p, limb_t n0);
void sqr_mont_384x(vec384x ret, const vec384x a, const vec384 p, limb_t n0);
void sqr_mont_382x(vec384x ret, const vec384x a, const vec384 p, limb_t n0);
void mul_382x(vec768 ret[2], const vec384x a, const vec384x b, const vec384 p);
void sqr_382x(vec768 ret[2], const vec384x a, const vec384 p);

void add_mod_384x(vec384x ret, const vec384x a, const vec384x b,
                  const vec384 p);
void sub_mod_384x(vec384x ret, const vec384x a, const vec384x b,
                  const vec384 p);
void mul_by_8_mod_384x(vec384x ret, const vec384x a, const vec384 p);
void mul_by_3_mod_384x(vec384x ret, const vec384x a, const vec384 p);
void mul_by_1_plus_i_mod_384x(vec384x ret, const vec384x a, const vec384 p);
void add_mod_384x384(vec768 ret, const vec768 a, const vec768 b,
                     const vec384 p);
void sub_mod_384x384(vec768 ret, const vec768 a, const vec768 b,
                     const vec384 p);

/*
 * C subroutines
 */
static void exp_mont_384(vec384 out, const vec384 inp, const byte *pow,
                         size_t pow_bits, const vec384 p, limb_t n0);
static void exp_mont_384x(vec384x out, const vec384x inp, const byte *pow,
                          size_t pow_bits, const vec384 p, limb_t n0);
static void div_by_zz(limb_t val[]);
static void div_by_z(limb_t val[]);

#ifdef __UINTPTR_TYPE__
typedef __UINTPTR_TYPE__ uptr_t;
#else
typedef const void *uptr_t;
#endif

#if !defined(restrict)
# if !defined(__STDC_VERSION__) || __STDC_VERSION__<199901
#  if defined(__GNUC__) && __GNUC__>=2
#   define restrict __restrict__
#  elif defined(_MSC_VER)
#   define restrict __restrict
#  else
#   define restrict
#  endif
# endif
#endif

#if !defined(inline) && !defined(__cplusplus)
# if !defined(__STDC_VERSION__) || __STDC_VERSION__<199901
#  if defined(__GNUC__) && __GNUC__>=2
#   define inline __inline__
#  elif defined(_MSC_VER)
#   define inline __inline
#  else
#   define inline
#  endif
# endif
#endif

#if defined(__GNUC__) || defined(__clang__)
# define launder(var) __asm__ __volatile__("" : "+r"(var))
#else
# define launder(var)
#endif

static inline bool_t is_bit_set(const byte *v, size_t i)
{
    bool_t ret = (v[i/8] >> (i%8)) & 1;
    launder(ret);
    return ret;
}

static inline bool_t byte_is_zero(unsigned char c)
{
    limb_t ret = ((limb_t)(c) - 1) >> (LIMB_T_BITS - 1);
    launder(ret);
    return ret;
}

static inline bool_t bytes_are_zero(const unsigned char *a, size_t num)
{
    unsigned char acc;
    size_t i;

    for (acc = 0, i = 0; i < num; i++)
        acc |= a[i];

    return byte_is_zero(acc);
}

static inline void vec_cswap(void *restrict a, void *restrict b, size_t num,
                             bool_t cbit)
{
    limb_t ai, *ap = (limb_t *)a;
    limb_t bi, *bp = (limb_t *)b;
    limb_t xorm, mask;
    size_t i;

    launder(cbit);
    mask = (limb_t)0 - cbit;

    num /= sizeof(limb_t);

    for (i = 0; i < num; i++) {
        xorm = ((ai = ap[i]) ^ (bi = bp[i])) & mask;
        ap[i] = ai ^ xorm;
        bp[i] = bi ^ xorm;
    }
}
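/*
 * Aside (illustrative, not part of blst): vec_cswap above relies on the
 * all-ones/all-zeros mask trick -- (limb_t)0 - cbit -- so that data flow,
 * not control flow, depends on the secret bit; launder() keeps compilers
 * from re-introducing a branch. The same idea as a one-word selector
 * (hypothetical demo_ name; cbit must be exactly 0 or 1):
 */
static inline limb_t demo_ct_select_1(limb_t a, limb_t b, bool_t cbit)
{
    limb_t mask = (limb_t)0 - cbit;     /* 0 -> 0...0, 1 -> 1...1 */

    return b ^ ((a ^ b) & mask);        /* cbit ? a : b, branch-free */
}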
/* ret = bit ? a : b */
void vec_select_32(void *ret, const void *a, const void *b, bool_t sel_a);
void vec_select_48(void *ret, const void *a, const void *b, bool_t sel_a);
void vec_select_96(void *ret, const void *a, const void *b, bool_t sel_a);
void vec_select_144(void *ret, const void *a, const void *b, bool_t sel_a);
void vec_select_192(void *ret, const void *a, const void *b, bool_t sel_a);
void vec_select_288(void *ret, const void *a, const void *b, bool_t sel_a);
static inline void vec_select(void *ret, const void *a, const void *b,
                              size_t num, bool_t sel_a)
{
    launder(sel_a);
#ifndef __BLST_NO_ASM__
    if (num == 32)          vec_select_32(ret, a, b, sel_a);
    else if (num == 48)     vec_select_48(ret, a, b, sel_a);
    else if (num == 96)     vec_select_96(ret, a, b, sel_a);
    else if (num == 144)    vec_select_144(ret, a, b, sel_a);
    else if (num == 192)    vec_select_192(ret, a, b, sel_a);
    else if (num == 288)    vec_select_288(ret, a, b, sel_a);
#else
    if (0)  ;
#endif
    else {
        limb_t bi;
        volatile limb_t *rp = (limb_t *)ret;
        const limb_t *ap = (const limb_t *)a;
        const limb_t *bp = (const limb_t *)b;
        limb_t xorm, mask = (limb_t)0 - sel_a;
        size_t i;

        num /= sizeof(limb_t);

        for (i = 0; i < num; i++) {
            xorm = (ap[i] ^ (bi = bp[i])) & mask;
            rp[i] = bi ^ xorm;
        }
    }
}

static inline bool_t is_zero(limb_t l)
{
    limb_t ret = (~l & (l - 1)) >> (LIMB_T_BITS - 1);
    launder(ret);
    return ret;
}

static inline bool_t vec_is_zero(const void *a, size_t num)
{
    const limb_t *ap = (const limb_t *)a;
    limb_t acc;
    size_t i;

#ifndef __BLST_NO_ASM__
    bool_t vec_is_zero_16x(const void *a, size_t num);
    if ((num & 15) == 0)
        return vec_is_zero_16x(a, num);
#endif

    num /= sizeof(limb_t);

    for (acc = 0, i = 0; i < num; i++)
        acc |= ap[i];

    return is_zero(acc);
}

static inline bool_t vec_is_equal(const void *a, const void *b, size_t num)
{
    const limb_t *ap = (const limb_t *)a;
    const limb_t *bp = (const limb_t *)b;
    limb_t acc;
    size_t i;

#ifndef __BLST_NO_ASM__
    bool_t vec_is_equal_16x(const void *a, const void *b, size_t num);
    if ((num & 15) == 0)
        return vec_is_equal_16x(a, b, num);
#endif

    num /= sizeof(limb_t);

    for (acc = 0, i = 0; i < num; i++)
        acc |= ap[i] ^ bp[i];

    return is_zero(acc);
}

static inline void cneg_mod_384x(vec384x ret, const vec384x a, bool_t flag,
                                 const vec384 p)
{
    cneg_mod_384(ret[0], a[0], flag, p);
    cneg_mod_384(ret[1], a[1], flag, p);
}

static inline void vec_copy(void *restrict ret, const void *a, size_t num)
{
    limb_t *rp = (limb_t *)ret;
    const limb_t *ap = (const limb_t *)a;
    size_t i;

    num /= sizeof(limb_t);

    for (i = 0; i < num; i++)
        rp[i] = ap[i];
}

static inline void vec_zero(void *ret, size_t num)
{
    volatile limb_t *rp = (volatile limb_t *)ret;
    size_t i;

    num /= sizeof(limb_t);

    for (i = 0; i < num; i++)
        rp[i] = 0;

#if defined(__GNUC__) || defined(__clang__)
    __asm__ __volatile__("" : : "r"(ret) : "memory");
#endif
}

static inline void vec_czero(void *ret, size_t num, bool_t cbit)
{
    limb_t *rp = (limb_t *)ret;
    size_t i;
    limb_t mask;

    launder(cbit);
    mask = (limb_t)0 - (cbit^1);

    num /= sizeof(limb_t);

    for (i = 0; i < num; i++)
        rp[i] &= mask;
}
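/*
 * Aside (illustrative, not part of blst): the expression in is_zero above,
 * (~l & (l - 1)) >> (LIMB_T_BITS - 1), is a branch-free zero test. The top
 * bit of ~l is set only when the top bit of l is clear, and the top bit of
 * l - 1 is set only when l == 0 (wrap-around) or l > 2^(LIMB_T_BITS-1);
 * both can hold at once only for l == 0. Truth table for a hypothetical
 * 4-bit limb:
 *
 *     l      ~l     l-1    ~l & (l-1)   >> 3
 *     0000   1111   1111   1111         1
 *     0001   1110   0000   0000         0
 *     1000   0111   0111   0111         0
 *     1111   0000   1110   0000         0
 */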
/*
 * Some compilers get arguably overzealous(*) when a pointer to a
 * multi-dimensional array [such as vec384x] is passed as a 'const'
 * argument. The general direction seems to be to legitimize such
 * constification, so it's argued that suppressing the warning is
 * appropriate.
 *
 * (*) http://www.open-std.org/jtc1/sc22/wg14/www/docs/n1923.htm
 */
#if defined(__INTEL_COMPILER)
# pragma warning(disable:167)
# pragma warning(disable:556)
#elif defined(__GNUC__) && !defined(__clang__) && (__STDC_VERSION__-0) < 202311
# pragma GCC diagnostic ignored "-Wpedantic"
#elif defined(_MSC_VER)
# pragma warning(disable: 4127 4189)
#endif

#if !defined(__wasm__) && __STDC_HOSTED__-0 != 0
# include <stdlib.h>
#endif

#if defined(__GNUC__)
# ifndef alloca
#  define alloca(s) __builtin_alloca(s)
# endif
#elif defined(__sun)
# include <alloca.h>
#elif defined(_WIN32)
# include <malloc.h>
# ifndef alloca
#  define alloca(s) _alloca(s)
# endif
#endif

#endif /* __BLS12_381_ASM_VECT_H__ */