Repository: microsoft/lepton_jpeg_rust
Branch: main
Commit: 241d66906dd0
Files: 129
Total size: 19.1 MB

Directory structure:
gitextract_8bpxdk9j/

├── .cargo/
│   └── config.toml
├── .config/
│   ├── 1espt/
│   │   └── PipelineAutobaseliningConfig.yml
│   ├── guardian/
│   │   └── .gdnbaselines
│   └── nextest.toml
├── .github/
│   └── workflows/
│       ├── publish.yml
│       ├── publishwheels.yml
│       └── rust.yml
├── .gitignore
├── .vscode/
│   ├── launch.json
│   └── tasks.json
├── CODE_OF_CONDUCT.md
├── CONTRIBUTING.md
├── Cargo.toml
├── DESIGN.md
├── LICENSE.txt
├── NOTICE.txt
├── README.md
├── SECURITY.md
├── azure-pipelines.yml
├── benches/
│   └── benches.rs
├── dll/
│   ├── Cargo.toml
│   └── src/
│       └── lib.rs
├── fuzz/
│   ├── .cargo/
│   │   └── config.toml
│   ├── .gitignore
│   ├── Cargo.toml
│   ├── fuzz_targets/
│   │   └── fuzz_target_1.rs
│   └── rust-toolchain.toml
├── images/
│   ├── android.lep
│   ├── androidcrop.lep
│   ├── androidcropoptions.lep
│   ├── androidprogressive.lep
│   ├── androidprogressive_garbage.lep
│   ├── androidtrail.lep
│   ├── cathedral_db_non_int.lep
│   ├── cathedral_db_non_int_rustold.lep
│   ├── colorswap.lep
│   ├── eof_and_trailinghdrdata.lep
│   ├── eof_and_trailingrst.lep
│   ├── gray2sf.lep
│   ├── grayscale.lep
│   ├── half_scan.lep
│   ├── half_scan_rust55.lep
│   ├── hq.lep
│   ├── iphone.lep
│   ├── iphonecity.lep
│   ├── iphonecity_with_16KGarbage.jpgoutput
│   ├── iphonecity_with_16KGarbage.lep
│   ├── iphonecity_with_16KGarbage.lepoutput
│   ├── iphonecity_with_1MGarbage.lep
│   ├── iphonecrop.lep
│   ├── iphonecrop2.lep
│   ├── iphoneprogressive.lep
│   ├── iphoneprogressive2.lep
│   ├── mathoverflow_16.lep
│   ├── mathoverflow_32.lep
│   ├── mathoverflow_scalar.lep
│   ├── narrowrst.lep
│   ├── nofsync.lep
│   ├── out_of_order_dqt.lep
│   ├── pixelated.lep
│   ├── progressive_late_dht.lep
│   ├── slrcity.lep
│   ├── slrhills.lep
│   ├── slrindoor.lep
│   ├── t.jpgoutput
│   ├── t.lep
│   ├── t.lepoutput
│   ├── tiny.lep
│   ├── trailingrst.lep
│   ├── trailingrst2.lep
│   ├── trailingrst_missing_in_jpg.lep
│   ├── trunc.lep
│   ├── truncate4.lep
│   ├── truncatedzerorun.lep
│   ├── truncbad.lep
│   └── zeros_in_dqt_tables.lep
├── lib/
│   ├── Cargo.toml
│   └── src/
│       ├── consts.rs
│       ├── enabled_features.rs
│       ├── helpers.rs
│       ├── jpeg/
│       │   ├── bit_reader.rs
│       │   ├── bit_writer.rs
│       │   ├── block_based_image.rs
│       │   ├── component_info.rs
│       │   ├── jpeg_code.rs
│       │   ├── jpeg_header.rs
│       │   ├── jpeg_position_state.rs
│       │   ├── jpeg_read.rs
│       │   ├── jpeg_write.rs
│       │   ├── mod.rs
│       │   ├── row_spec.rs
│       │   └── truncate_components.rs
│       ├── lepton_error.rs
│       ├── lib.rs
│       ├── metrics.rs
│       ├── micro_benchmark.rs
│       └── structs/
│           ├── block_context.rs
│           ├── branch.rs
│           ├── idct.rs
│           ├── lepton_decoder.rs
│           ├── lepton_encoder.rs
│           ├── lepton_file_reader.rs
│           ├── lepton_file_writer.rs
│           ├── lepton_header.rs
│           ├── mod.rs
│           ├── model.rs
│           ├── multiplexer.rs
│           ├── neighbor_summary.rs
│           ├── partial_buffer.rs
│           ├── probability_tables.rs
│           ├── quantization_tables.rs
│           ├── simple_hash.rs
│           ├── simple_threadpool.rs
│           ├── thread_handoff.rs
│           ├── vpx_bool_reader.rs
│           └── vpx_bool_writer.rs
├── package/
│   └── Lepton.Jpeg.Rust.nuspec
├── python/
│   ├── Cargo.toml
│   ├── README.md
│   ├── pyproject.toml
│   ├── src/
│   │   └── lib.rs
│   └── tests/
│       └── test_compress.py
├── rustfmt.toml
├── tests/
│   ├── end_to_end.rs
│   ├── verifycompression.cmd
│   └── verifydir.cmd
└── util/
    ├── Cargo.toml
    └── src/
        ├── main.rs
        └── verifydir.rs

================================================
FILE CONTENTS
================================================

================================================
FILE: .cargo/config.toml
================================================
# -Cehcont_guard: Enable EH Continuation Metadata (https://learn.microsoft.com/en-us/cpp/build/reference/guard-enable-eh-continuation-metadata).
# -Ccontrol-flow-guard: Enable Control Flow Guard, needed for OneBranch's post-build analysis (https://learn.microsoft.com/en-us/cpp/build/reference/guard-enable-control-flow-guard).
# -Ctarget-feature=+crt-static: Statically link the CRT (required to link the spectre-mitigated CRT).
[target.'cfg(target_os = "windows")']
rustflags = ["-Ccontrol-flow-guard", "-Ctarget-feature=+crt-static"]

# -Clink-arg=/DYNAMICBASE -Clink-arg=/CETCOMPAT: Enable "shadow stack" (https://learn.microsoft.com/en-us/cpp/build/reference/cetcompat)
[target.'cfg(all(target_os = "windows", any(target_arch = "i686", target_arch = "x86_64")))']
rustflags = ["-Clink-arg=/DYNAMICBASE", "-Clink-arg=/CETCOMPAT"]

[registries]

[env]
WORKSPACE_ROOT = { value = "", relative = true }

================================================
FILE: .config/1espt/PipelineAutobaseliningConfig.yml
================================================
## DO NOT MODIFY THIS FILE MANUALLY. This is part of auto-baselining from 1ES Pipeline Templates. Go to [https://aka.ms/1espt-autobaselining] for more details.

pipelines:
  9128:
    retail:
      source:
        eslint:
          lastModifiedDate: 2026-01-27
        psscriptanalyzer:
          lastModifiedDate: 2026-01-27
        armory:
          lastModifiedDate: 2026-01-27
        accessibilityinsights:
          lastModifiedDate: 2026-01-27
      binary:
        binskim:
          lastModifiedDate: 2026-01-27
        spotbugs:
          lastModifiedDate: 2026-01-27


================================================
FILE: .config/guardian/.gdnbaselines
================================================
{
  "properties": {
    "helpUri": "https://eng.ms/docs/microsoft-security/security/azure-security/cloudai-security-fundamentals-engineering/security-integration/guardian-wiki/microsoft-guardian/general/baselines"
  },
  "version": "1.0.0",
  "baselines": {
    "default": {
      "name": "default",
      "createdDate": "2026-01-27 06:17:01Z",
      "lastUpdatedDate": "2026-01-27 06:17:01Z"
    }
  },
  "results": {
    "7888556c1e034138d9cf4c682b73c492a353bb423ec11dc085e477365358d5b5": {
      "signature": "7888556c1e034138d9cf4c682b73c492a353bb423ec11dc085e477365358d5b5",
      "alternativeSignatures": [
        "f363807b3905e1c28a3a5fb6ffcb9b91c52b725244e6a57f8d2f41e3b70fbc02"
      ],
      "target": "target/release/lepton_jpeg.dll",
      "memberOf": [
        "default"
      ],
      "tool": "binskim",
      "ruleId": "BA2007",
      "createdDate": "2026-01-27 06:17:01Z",
      "expirationDate": "2026-07-16 06:19:42Z",
      "justification": "This error is baselined with an expiration date of 180 days from 2026-01-27 06:19:42Z"
    },
    "82dae63401a13132179fdd7f28cb9935eb07f6808da120db385d4537f6c7e781": {
      "signature": "82dae63401a13132179fdd7f28cb9935eb07f6808da120db385d4537f6c7e781",
      "alternativeSignatures": [
        "03acf8c325bf5b05a5e79b1b417f263e3526c9f06cfbb942d7f9b740840f3516"
      ],
      "target": "target/release/lepton_jpeg_avx2.dll",
      "memberOf": [
        "default"
      ],
      "tool": "binskim",
      "ruleId": "BA2007",
      "createdDate": "2026-01-27 06:17:01Z",
      "expirationDate": "2026-07-16 06:19:42Z",
      "justification": "This error is baselined with an expiration date of 180 days from 2026-01-27 06:19:42Z"
    },
    "34d7aee5ca8519db58d0fa6d7ddf78e892c1fc7f8c905926a5fbc8c0cb080af0": {
      "signature": "34d7aee5ca8519db58d0fa6d7ddf78e892c1fc7f8c905926a5fbc8c0cb080af0",
      "alternativeSignatures": [
        "e2af46751ad6876b19468b60518f5e41adfd3dcd204794acb0ebc401d7733573"
      ],
      "target": "target/release/lepton_jpeg_python.dll",
      "memberOf": [
        "default"
      ],
      "tool": "binskim",
      "ruleId": "BA2007",
      "createdDate": "2026-01-27 06:17:01Z",
      "expirationDate": "2026-07-16 06:19:42Z",
      "justification": "This error is baselined with an expiration date of 180 days from 2026-01-27 06:19:42Z"
    },
    "c359baf101a3b163e44bb6577cb8492ed745c10c9f3bd182f8b935057bfd0bf2": {
      "signature": "c359baf101a3b163e44bb6577cb8492ed745c10c9f3bd182f8b935057bfd0bf2",
      "alternativeSignatures": [
        "4be395a1280a6d3ced21b0ad815572da5d13632d75ea6dcb76f4e9b6ca3753ab"
      ],
      "target": "target/release/lepton_jpeg_util.exe",
      "memberOf": [
        "default"
      ],
      "tool": "binskim",
      "ruleId": "BA2007",
      "createdDate": "2026-01-27 06:17:01Z",
      "expirationDate": "2026-07-16 06:19:42Z",
      "justification": "This error is baselined with an expiration date of 180 days from 2026-01-27 06:19:42Z"
    },
    "ee719806d1996ca8d823f90d73903cc9f5da08a3960af2c0a1eb0dd1a54c99d0": {
      "signature": "ee719806d1996ca8d823f90d73903cc9f5da08a3960af2c0a1eb0dd1a54c99d0",
      "alternativeSignatures": [
        "ba93e14514af2d2f711daad10f312379d730bc77ef208901c9cf06e07bf01d84"
      ],
      "target": "target/release/lepton_jpeg_util_avx2.exe",
      "memberOf": [
        "default"
      ],
      "tool": "binskim",
      "ruleId": "BA2007",
      "createdDate": "2026-01-27 06:17:01Z",
      "expirationDate": "2026-07-16 06:19:42Z",
      "justification": "This error is baselined with an expiration date of 180 days from 2026-01-27 06:19:42Z"
    },
    "cd6b9720c2019ba2c72ef61db8542b1ce290ba22d587eb3955ab755d14d6f4ed": {
      "signature": "cd6b9720c2019ba2c72ef61db8542b1ce290ba22d587eb3955ab755d14d6f4ed",
      "alternativeSignatures": [
        "1ed2e30b47b86b4ed9ca74beff17a7312994572936ef2333bbc6ef9bc17dacd3"
      ],
      "target": "target/release/deps/default_boxed_derive-63de623f2e43e138.dll",
      "memberOf": [
        "default"
      ],
      "tool": "binskim",
      "ruleId": "BA2007",
      "createdDate": "2026-01-27 06:17:01Z",
      "expirationDate": "2026-07-16 06:19:42Z",
      "justification": "This error is baselined with an expiration date of 180 days from 2026-01-27 06:19:42Z"
    },
    "363faeb4073c8417f384af5227032d0daa483689d56ad0fac59f3328e751b183": {
      "signature": "363faeb4073c8417f384af5227032d0daa483689d56ad0fac59f3328e751b183",
      "alternativeSignatures": [
        "0a863a85465581440a18aeb7b257b269e75c704621b9d7dced6ed89204d950f6"
      ],
      "target": "target/release/deps/default_boxed_derive-a563f94fdc621d90.dll",
      "memberOf": [
        "default"
      ],
      "tool": "binskim",
      "ruleId": "BA2007",
      "createdDate": "2026-01-27 06:17:01Z",
      "expirationDate": "2026-07-16 06:19:42Z",
      "justification": "This error is baselined with an expiration date of 180 days from 2026-01-27 06:19:42Z"
    },
    "12a8462656e61be1c0ef31be295c12ac6e85b261c25adee4cf97e2d32819b315": {
      "signature": "12a8462656e61be1c0ef31be295c12ac6e85b261c25adee4cf97e2d32819b315",
      "alternativeSignatures": [
        "29e6fbebe0ea89a075a6e11c87b2f776f2cada75471317093729862c1f4cd60d"
      ],
      "target": "target/release/deps/git_version_macro-81a02dafe74069f0.dll",
      "memberOf": [
        "default"
      ],
      "tool": "binskim",
      "ruleId": "BA2007",
      "createdDate": "2026-01-27 06:17:01Z",
      "expirationDate": "2026-07-16 06:19:42Z",
      "justification": "This error is baselined with an expiration date of 180 days from 2026-01-27 06:19:42Z"
    },
    "f9cb8789538579745e4a5881ebfe2de000c3a924099948e9df510235d8e63326": {
      "signature": "f9cb8789538579745e4a5881ebfe2de000c3a924099948e9df510235d8e63326",
      "alternativeSignatures": [
        "495638cb4ea6541aa8abbfb904b5bc968502f7c3d2eda51fc8009b842627cd5c"
      ],
      "target": "target/release/deps/git_version_macro-84dcd42a3b51464a.dll",
      "memberOf": [
        "default"
      ],
      "tool": "binskim",
      "ruleId": "BA2007",
      "createdDate": "2026-01-27 06:17:01Z",
      "expirationDate": "2026-07-16 06:19:42Z",
      "justification": "This error is baselined with an expiration date of 180 days from 2026-01-27 06:19:42Z"
    },
    "0015b41486ed72ac681952fc1e5117467edc31cc458a8ebe0bc2428040501432": {
      "signature": "0015b41486ed72ac681952fc1e5117467edc31cc458a8ebe0bc2428040501432",
      "alternativeSignatures": [
        "a14a0079e1028b5a3a85807dd85687ff60f1e614e10d6174e8f8d85d5ffad6f9"
      ],
      "target": "target/release/deps/indoc-0ef44e389295250a.dll",
      "memberOf": [
        "default"
      ],
      "tool": "binskim",
      "ruleId": "BA2007",
      "createdDate": "2026-01-27 06:17:01Z",
      "expirationDate": "2026-07-16 06:19:42Z",
      "justification": "This error is baselined with an expiration date of 180 days from 2026-01-27 06:19:42Z"
    },
    "54b6fbb3b25a55dd7124f9172228d97a8e208a2b2f310fb98f52120d2555e9a0": {
      "signature": "54b6fbb3b25a55dd7124f9172228d97a8e208a2b2f310fb98f52120d2555e9a0",
      "alternativeSignatures": [
        "5586663f943c7c47c916fba1ac930ea1eed0c8231090913f36b98ace3158a3e0"
      ],
      "target": "target/release/deps/indoc-1d196f6d756468dd.dll",
      "memberOf": [
        "default"
      ],
      "tool": "binskim",
      "ruleId": "BA2007",
      "createdDate": "2026-01-27 06:17:01Z",
      "expirationDate": "2026-07-16 06:19:42Z",
      "justification": "This error is baselined with an expiration date of 180 days from 2026-01-27 06:19:42Z"
    },
    "a571bdcd0bc935822e84ea12d95ffc432c5f99381b681b9cc292f7ed8ec32b06": {
      "signature": "a571bdcd0bc935822e84ea12d95ffc432c5f99381b681b9cc292f7ed8ec32b06",
      "alternativeSignatures": [
        "bcf8ae23a38f46f4b1fbad99d2a0d4c1b72c372bc8aedb5dd2d20543ad7c18da"
      ],
      "target": "target/release/deps/lepton_jpeg.dll",
      "memberOf": [
        "default"
      ],
      "tool": "binskim",
      "ruleId": "BA2007",
      "createdDate": "2026-01-27 06:17:01Z",
      "expirationDate": "2026-07-16 06:19:42Z",
      "justification": "This error is baselined with an expiration date of 180 days from 2026-01-27 06:19:42Z"
    },
    "4156fba8fbda87c312c40f8644080f0444e119c7ad022a7a1cd5bc153472c8ed": {
      "signature": "4156fba8fbda87c312c40f8644080f0444e119c7ad022a7a1cd5bc153472c8ed",
      "alternativeSignatures": [
        "867001d871648777be53eee5134a29693823d0c842475176bc0a738f448ab9d1"
      ],
      "target": "target/release/deps/lepton_jpeg_python.dll",
      "memberOf": [
        "default"
      ],
      "tool": "binskim",
      "ruleId": "BA2007",
      "createdDate": "2026-01-27 06:17:01Z",
      "expirationDate": "2026-07-16 06:19:42Z",
      "justification": "This error is baselined with an expiration date of 180 days from 2026-01-27 06:19:42Z"
    },
    "149befb9280c911d3f0b709dd16c3b495937aacddf5e0ab770addce09e6f7ab6": {
      "signature": "149befb9280c911d3f0b709dd16c3b495937aacddf5e0ab770addce09e6f7ab6",
      "alternativeSignatures": [
        "91f1eab664b17bcc61e3585f933ef033800549b55f938091f80a75c3e0e6958d"
      ],
      "target": "target/release/deps/lepton_jpeg_util.exe",
      "memberOf": [
        "default"
      ],
      "tool": "binskim",
      "ruleId": "BA2007",
      "createdDate": "2026-01-27 06:17:01Z",
      "expirationDate": "2026-07-16 06:19:42Z",
      "justification": "This error is baselined with an expiration date of 180 days from 2026-01-27 06:19:42Z"
    },
    "903069c0ae93b67c9b06d4d760af5b12c07ce30db727be83f46507875fd1856c": {
      "signature": "903069c0ae93b67c9b06d4d760af5b12c07ce30db727be83f46507875fd1856c",
      "alternativeSignatures": [
        "8357b47447b55a4058ac2d9b7cbc881f412384528a90e66a88509fea88f974ff"
      ],
      "target": "target/release/deps/pyo3_macros-0883c55e16fc46dd.dll",
      "memberOf": [
        "default"
      ],
      "tool": "binskim",
      "ruleId": "BA2007",
      "createdDate": "2026-01-27 06:17:01Z",
      "expirationDate": "2026-07-16 06:19:42Z",
      "justification": "This error is baselined with an expiration date of 180 days from 2026-01-27 06:19:42Z"
    },
    "ef9d7c4fdf143066f64d95af451456f5547eba3e89156ba0cbda5dc92b9eed0c": {
      "signature": "ef9d7c4fdf143066f64d95af451456f5547eba3e89156ba0cbda5dc92b9eed0c",
      "alternativeSignatures": [
        "954a842978f48d0a377246cec186dcf2afd17423cae579f197e6d7e081657935"
      ],
      "target": "target/release/deps/pyo3_macros-ead3c9c3859ec605.dll",
      "memberOf": [
        "default"
      ],
      "tool": "binskim",
      "ruleId": "BA2007",
      "createdDate": "2026-01-27 06:17:01Z",
      "expirationDate": "2026-07-16 06:19:42Z",
      "justification": "This error is baselined with an expiration date of 180 days from 2026-01-27 06:19:42Z"
    },
    "544e0bdd76e7c149e17e870fb0b32aa8d35af936a96e12f73891061d25b64485": {
      "signature": "544e0bdd76e7c149e17e870fb0b32aa8d35af936a96e12f73891061d25b64485",
      "alternativeSignatures": [
        "e6f1ebffea3eb268804be426a462e535249f01dc46a9ef43dd1f2c7aa899bb96"
      ],
      "target": "target/release/deps/rustversion-60c5a5f87e0f5de5.dll",
      "memberOf": [
        "default"
      ],
      "tool": "binskim",
      "ruleId": "BA2007",
      "createdDate": "2026-01-27 06:17:01Z",
      "expirationDate": "2026-07-16 06:19:42Z",
      "justification": "This error is baselined with an expiration date of 180 days from 2026-01-27 06:19:42Z"
    },
    "3bd252ba75ede9998cae2e3283370157bef89d7b2de12562cb8ad3472f1ee620": {
      "signature": "3bd252ba75ede9998cae2e3283370157bef89d7b2de12562cb8ad3472f1ee620",
      "alternativeSignatures": [
        "ba5c335ba47831bfb30973ec0e423518ed2b198f8d8d691fd3f856a886a924c6"
      ],
      "target": "target/release/deps/rustversion-ae37400f19b3f17f.dll",
      "memberOf": [
        "default"
      ],
      "tool": "binskim",
      "ruleId": "BA2007",
      "createdDate": "2026-01-27 06:17:01Z",
      "expirationDate": "2026-07-16 06:19:42Z",
      "justification": "This error is baselined with an expiration date of 180 days from 2026-01-27 06:19:42Z"
    },
    "4cf9241015129625b09af906ce68f0424cafdf3661358bd4081e29d8e945024b": {
      "signature": "4cf9241015129625b09af906ce68f0424cafdf3661358bd4081e29d8e945024b",
      "alternativeSignatures": [
        "901556be5d52b05739e219d3911c9ca20b435456fe22d3e7d1fe994e18faa043"
      ],
      "target": "target/release/deps/time_macros-17f74972c8766b75.dll",
      "memberOf": [
        "default"
      ],
      "tool": "binskim",
      "ruleId": "BA2007",
      "createdDate": "2026-01-27 06:17:01Z",
      "expirationDate": "2026-07-16 06:19:42Z",
      "justification": "This error is baselined with an expiration date of 180 days from 2026-01-27 06:19:42Z"
    },
    "3ad2f2c6ad81bbb12175b4e299f09319a2c7e26e2e91386e8cdf2bb7f66bb6b4": {
      "signature": "3ad2f2c6ad81bbb12175b4e299f09319a2c7e26e2e91386e8cdf2bb7f66bb6b4",
      "alternativeSignatures": [
        "f55823c74a027e8a3e03b58709e244f185e585d3729015841df513f514964799"
      ],
      "target": "target/release/deps/time_macros-6674f0d9cd1fe1c0.dll",
      "memberOf": [
        "default"
      ],
      "tool": "binskim",
      "ruleId": "BA2007",
      "createdDate": "2026-01-27 06:17:01Z",
      "expirationDate": "2026-07-16 06:19:42Z",
      "justification": "This error is baselined with an expiration date of 180 days from 2026-01-27 06:19:42Z"
    },
    "51a37a91abff68ac93176d40a7ebe4254f8bd2d59b3f067eac8e63fbf344dc96": {
      "signature": "51a37a91abff68ac93176d40a7ebe4254f8bd2d59b3f067eac8e63fbf344dc96",
      "alternativeSignatures": [
        "2b069009ab0383fb0ff18c24cda15d0e3a2787753d56e1d062739be4d1f24ad6"
      ],
      "target": "target/release/deps/windows_implement-131e219ba8366c19.dll",
      "memberOf": [
        "default"
      ],
      "tool": "binskim",
      "ruleId": "BA2007",
      "createdDate": "2026-01-27 06:17:01Z",
      "expirationDate": "2026-07-16 06:19:42Z",
      "justification": "This error is baselined with an expiration date of 180 days from 2026-01-27 06:19:42Z"
    },
    "d08a22af13bbb9c4df920b594dc86a5191f4ab28301a3fda5a8c99b19909d39f": {
      "signature": "d08a22af13bbb9c4df920b594dc86a5191f4ab28301a3fda5a8c99b19909d39f",
      "alternativeSignatures": [
        "556ad3740e06bf41c0476ceabdabef20085c886ff93eafa7dac49a3f34bf216b"
      ],
      "target": "target/release/deps/windows_implement-9fc3e36e619e31c4.dll",
      "memberOf": [
        "default"
      ],
      "tool": "binskim",
      "ruleId": "BA2007",
      "createdDate": "2026-01-27 06:17:01Z",
      "expirationDate": "2026-07-16 06:19:42Z",
      "justification": "This error is baselined with an expiration date of 180 days from 2026-01-27 06:19:42Z"
    },
    "7c4b27fd4d70c9af21f0770b95de543c7c5b1c43b091d4e97628aa611981c936": {
      "signature": "7c4b27fd4d70c9af21f0770b95de543c7c5b1c43b091d4e97628aa611981c936",
      "alternativeSignatures": [
        "ec50c72748b2290ad8ad44fba6be6392c1a97b8b7a82cf40cfa02fbf8616a19d"
      ],
      "target": "target/release/deps/windows_interface-3fdabd2e927b6dbf.dll",
      "memberOf": [
        "default"
      ],
      "tool": "binskim",
      "ruleId": "BA2007",
      "createdDate": "2026-01-27 06:17:01Z",
      "expirationDate": "2026-07-16 06:19:42Z",
      "justification": "This error is baselined with an expiration date of 180 days from 2026-01-27 06:19:42Z"
    },
    "3f2ca4517e5e7715572fca9ef646b6c5738b01f2d1eddda8da452bcb35ace81b": {
      "signature": "3f2ca4517e5e7715572fca9ef646b6c5738b01f2d1eddda8da452bcb35ace81b",
      "alternativeSignatures": [
        "c7424dad4782f04900eea0009b9db79dc37ea5b18cb982fafba9135e2bb47aef"
      ],
      "target": "target/release/deps/windows_interface-d6a6711b2754ade3.dll",
      "memberOf": [
        "default"
      ],
      "tool": "binskim",
      "ruleId": "BA2007",
      "createdDate": "2026-01-27 06:17:01Z",
      "expirationDate": "2026-07-16 06:19:42Z",
      "justification": "This error is baselined with an expiration date of 180 days from 2026-01-27 06:19:42Z"
    }
  }
}

================================================
FILE: .config/nextest.toml
================================================
[profile.ci.junit]
path = "junit.xml"

================================================
FILE: .github/workflows/publish.yml
================================================
name: Publish Crate

permissions:
  contents: read

on:
  push:
    tags:
      - "v*.*.*"  # Triggers only for version tag pushes

jobs:
  publish:
    runs-on: ubuntu-latest

    steps:
    - name: Checkout code with full history
      uses: actions/checkout@v3
      with:
        fetch-depth: 0  # Needed to compare commits and access tag history

    - name: Ensure tag is at tip of main
      id: verify_tag_commit
      run: |
        echo "🔍 Verifying tag points to main branch tip..."
        git fetch origin main

        TAG_COMMIT=$(git rev-parse ${{ github.ref }})
        MAIN_COMMIT=$(git rev-parse origin/main)

        echo "Tag commit:  $TAG_COMMIT"
        echo "Main commit: $MAIN_COMMIT"

        if [ "$TAG_COMMIT" != "$MAIN_COMMIT" ]; then
          echo "❌ Tag is not at tip of main. Aborting."
          exit 1
        fi
        echo "✅ Tag is at tip of main."

    - name: Extract tag version
      id: tag_version
      run: |
        echo "TAG_VERSION=${GITHUB_REF#refs/tags/v}" >> "$GITHUB_OUTPUT"

    - name: Read version from Cargo.toml
      id: cargo_version
      run: |
        CARGO_VERSION=$(grep '^version\s*=' lib/Cargo.toml | head -1 | sed -E 's/version\s*=\s*"([^"]+)"/\1/')
        echo "CARGO_VERSION=$CARGO_VERSION" >> "$GITHUB_OUTPUT"

    - name: Check tag version matches Cargo.toml
      run: |
        echo "🔍 Comparing tag and Cargo.toml versions..."
        echo "Tag:          ${{ steps.tag_version.outputs.TAG_VERSION }}"
        echo "Cargo.toml:   ${{ steps.cargo_version.outputs.CARGO_VERSION }}"

        if [ "${{ steps.tag_version.outputs.TAG_VERSION }}" != "${{ steps.cargo_version.outputs.CARGO_VERSION }}" ]; then
          echo "❌ Version mismatch: tag does not match Cargo.toml"
          exit 1
        fi
        echo "✅ Tag version matches Cargo.toml."

    - name: Set up Rust
      uses: dtolnay/rust-toolchain@stable
      with:
        toolchain: stable

    - name: Publish to crates.io
      env:
        CARGO_REGISTRY_TOKEN: ${{ secrets.CRATE_PUBLISH }}
      run: cargo publish --verbose --package lepton_jpeg


================================================
FILE: .github/workflows/publishwheels.yml
================================================
name: Publish Wheels

permissions:
  contents: read
  
on:
  push:
    tags:
      - "v*.*.*"  # Triggers only for version tag pushes

jobs:
  build:
    name: Build and upload wheels
    runs-on: ${{ matrix.os }}
    strategy:
      matrix:
        os: [ubuntu-latest, macos-latest, windows-latest]
        python-version: ["3.8", "3.9", "3.10", "3.11", "3.12", "3.13", "3.14"]

    steps:
      - name: Set up Rust
        uses: dtolnay/rust-toolchain@stable
        with:
          toolchain: stable    
      - uses: actions/checkout@v4
      - uses: actions/setup-python@v5
        with:
          python-version: ${{ matrix.python-version }}

      - name: Create virtual environment
        shell: bash
        run: |
          python -m venv .venv
          if [ -f ".venv/bin/activate" ]; then
              source .venv/bin/activate
          else
              source .venv/Scripts/activate
          fi          
          python -m pip install --upgrade pip maturin

      - name: Build wheel
        shell: bash
        run: |
          if [ -f ".venv/bin/activate" ]; then
              source .venv/bin/activate
          else
              source .venv/Scripts/activate
          fi

          cd python
          maturin build --release

      - name: Upload to PyPI
        shell: bash
        run: |
          if [ -f ".venv/bin/activate" ]; then
              source .venv/bin/activate
          else
              source .venv/Scripts/activate
          fi

          cd python
          maturin upload --skip-existing $GITHUB_WORKSPACE/target/wheels/*
        env:
          MATURIN_PYPI_TOKEN: ${{ secrets.PYPI_API_TOKEN }}  

================================================
FILE: .github/workflows/rust.yml
================================================
name: Rust

permissions:
  contents: read
  checks: write
  pull-requests: write 

on:
  push:
    branches: ["main"]
  pull_request:

env:
  CARGO_TERM_COLOR: always

jobs:
  build:
    runs-on: windows-latest

    steps:
      - uses: actions/checkout@v3
      - uses: dtolnay/rust-toolchain@stable
        with:
          toolchain: stable
          targets: wasm32-wasip1,aarch64-unknown-linux-musl,x86_64-pc-windows-msvc,x86_64-unknown-linux-gnu
          components: rustfmt,clippy

      # Install nextest
      - uses: taiki-e/install-action@v2
        with:
          tool: nextest

      - name: Check formatting
        run: cargo fmt --check --all
      - name: Build default target
        run: cargo build --locked --workspace
      - name: Build wasm32-wasip1
        run: cargo build --locked --target wasm32-wasip1 --manifest-path lib/Cargo.toml
      - name: Build aarch64-unknown-linux-musl
        run: cargo build --locked --target aarch64-unknown-linux-musl --manifest-path lib/Cargo.toml
      - name: Build x86_64-pc-windows-msvc
        run: cargo build --locked --target x86_64-pc-windows-msvc --lib --workspace
      - name: Build x86_64-pc-windows-msvc release
        run: cargo build --locked --target x86_64-pc-windows-msvc --lib --workspace --release
      - name: Test python interface build
        run: | 
          cd python
          cargo build --locked
          python -m venv .env
          .env\Scripts\activate
          pip install maturin pytest
          maturin develop --locked
          pytest --junitxml=results.xml

      # Run tests with nextest and output JUnit results
      - name: Run tests
        run: |
          cargo nextest run --workspace --profile ci

      # Upload test results so GitHub shows pass/fail per test
      - name: Upload test results
        uses: EnricoMi/publish-unit-test-result-action/windows@v2
        if: always() # Upload even if tests fail
        with:
          files: |
             target/nextest/ci/junit.xml
             python/results.xml
        

================================================
FILE: .gitignore
================================================
# Generated by Cargo
# will have compiled files and executables
/target/

# These are backup files generated by rustfmt
**/*.rs.bk

# Visual Studio
.vs/

# Don't check in the lock file for fuzz, since it is auto-generated
fuzz/Cargo.lock

# ignore python virtual env
.env/
__pycache__

================================================
FILE: .vscode/launch.json
================================================
{
    // Use IntelliSense to learn about possible attributes.
    // Hover to view descriptions of existing attributes.
    // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
    "version": "0.2.0",
    "configurations": [
        {
            "name": "(Windows) Launch",
            "type": "cppvsdbg",
            "request": "launch",
            "program": "${workspaceFolder}/target/debug/lepton_jpeg_util.exe",
            "args": ["-dump", "${workspaceFolder}\\images\\slrcity.jpg"],
            "stopAtEntry": false,
            "cwd": "${fileDirname}",
            "environment": [],
            "console": "externalTerminal",
        },
        {
            "name": "Debug unit test",
            "type": "cppvsdbg",
            "request": "launch",
            "program": "${workspaceFolder}/target/debug/deps/end_to_end-5c5d5b533f217bea.exe",
            "args": [ "verify_extern_16bit_math_retry" ],
            "stopAtEntry": false,
            "cwd": "${workspaceFolder}",
            "environment": [],
            "console": "externalTerminal",
            "preLaunchTask": "rust: cargo test norun",
        },
    ]
}

================================================
FILE: .vscode/tasks.json
================================================
{
    "version": "2.0.0",
    "tasks": [
        {
            "type": "cargo",
            "command": "test",
            "args": [
                "--no-run"
            ],
            "problemMatcher": [
                "$rustc"
            ],
            "group": "test",
            "label": "rust: cargo test norun"
        },
        {
            "type": "cargo",
            "command": "build",
            "problemMatcher": [
                "$rustc"
            ],
            "args": [
                "--all"
            ],
            "group": "build",
            "label": "rust: cargo build"
        }
    ]
}

================================================
FILE: CODE_OF_CONDUCT.md
================================================
# Microsoft Open Source Code of Conduct

This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/).

Resources:

- [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/)
- [Microsoft Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/)
- Contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with questions or concerns


================================================
FILE: CONTRIBUTING.md
================================================
# Contributing

This project welcomes contributions and suggestions. Most contributions require you to
agree to a Contributor License Agreement (CLA) declaring that you have the right to,
and actually do, grant us the rights to use your contribution. For details, visit
https://cla.microsoft.com.

When you submit a pull request, a CLA-bot will automatically determine whether you need
to provide a CLA and decorate the PR appropriately (e.g., label, comment). Simply follow the
instructions provided by the bot. You will only need to do this once across all repositories using our CLA.

This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/).
For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/)
or contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments.

================================================
FILE: Cargo.toml
================================================
[package]
name = "lepton_jpeg_root"
edition = "2024"
authors = ["Kristof Roomp <kristofr@microsoft.com>"]

[workspace.package]
version = "0.5.8"
edition = "2024"

[profile.release]
debug = true
lto = true

[workspace]
members = ["lib", "dll", "util", "python"]

# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

[dev-dependencies]
lepton_jpeg = { path = "lib", features = ["micro_benchmark"] }
rstest = "0.22"
rand = "0.8"
rand_chacha = "0.3"
siphasher = "1"
criterion = "0.7"

[[bench]]
name = "benches"
harness = false

================================================
FILE: DESIGN.md
================================================
# Design

## Overall approach

This library is designed to encode/decode JPEGs in a compressed file format that typically compresses files by about 20%. The overall approach is as follows:

- Split JPEG into metadata/headers (stored as a binary array) and scan data (which is stored as a set of arrays of 8x8 16-bit coefficients per color channel)
- The headers/metadata are compressed via Zlib and stored at the beginning of the lepton compressed files
- The scan data is Huffman decoded, while verifying that it was encoded canonically (this is important since we canonically encode so that it is binary identical)
- The scan data is encoded using the VP8 CABAC, with coefficients binarized using [Exponential-Golomb coding](https://en.wikipedia.org/wiki/Exponential-Golomb_coding). The bins for the CABAC encoder are determined by a fairly complex predictor model for:
  - DC (the top left corner coefficient)
  - The top and left edges (which are correlated to the previous blocks)
  - The 7x7  block (the remaining 49 coefficients that are not the DC or the edges)
  - It is vital that the model is identically and deterministically updated during encoding and decoding, since any discrepancy will rapidly cause the encoder and decoder to get out of sync and fail to decode the image
- In order to reduce response time, the scan data is partitioned into up to 8 horizontal sections, each of which can be encoded/decoded on a separate thread.
- Progressive JPEGs are handled slightly differently since they cannot be partitioned during the JPEG encoding step, since each progressive scan requires access to the entire image data.
- As a last verification, the entire process is run in reverse to ensure that we can recreate the binary-identical JPEG

## Layers

The main layers of the library are as follows:

- `lepton_format.rs` implements reading and writing Lepton format files and launching partitioned decoder/encoder threads
- `lepton_encoder.rs` / `lepton_decoder.rs` perform the actual scan encoding/decoding using `model.rs` to track the bin probabilities in `branch.rs`
- JPEGs are read and written by `jpeg_header.rs, jpeg_read.rs, jpeg_write.rs`, `jpeg_position_state.rs` which are invoked by `lepton_format.rs`
- `bit_reader.rs / bit_writer.rs` are used by the Huffman encoding/decoding for reading and writing JPEG format scan data
- `vpx_bool_reader.rs / vpx_bool_writer.rs` are the CABAC encoder/decoder, which use an arithmetic-encoded binary stream in which the probability of each bin is calculated in `branch.rs`.
- `idct.rs` performs an inverse DCT of the JPEG coefficients as part of predicting the pixel values of neighbouring blocks
- `thread_handoff.rs` is used to partition the JPEG scan data so that multiple threads can process the same image.

================================================
FILE: LICENSE.txt
================================================

                                 Apache License
                           Version 2.0, January 2004
                        http://www.apache.org/licenses/

   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION

   1. Definitions.

      "License" shall mean the terms and conditions for use, reproduction,
      and distribution as defined by Sections 1 through 9 of this document.

      "Licensor" shall mean the copyright owner or entity authorized by
      the copyright owner that is granting the License.

      "Legal Entity" shall mean the union of the acting entity and all
      other entities that control, are controlled by, or are under common
      control with that entity. For the purposes of this definition,
      "control" means (i) the power, direct or indirect, to cause the
      direction or management of such entity, whether by contract or
      otherwise, or (ii) ownership of fifty percent (50%) or more of the
      outstanding shares, or (iii) beneficial ownership of such entity.

      "You" (or "Your") shall mean an individual or Legal Entity
      exercising permissions granted by this License.

      "Source" form shall mean the preferred form for making modifications,
      including but not limited to software source code, documentation
      source, and configuration files.

      "Object" form shall mean any form resulting from mechanical
      transformation or translation of a Source form, including but
      not limited to compiled object code, generated documentation,
      and conversions to other media types.

      "Work" shall mean the work of authorship, whether in Source or
      Object form, made available under the License, as indicated by a
      copyright notice that is included in or attached to the work
      (an example is provided in the Appendix below).

      "Derivative Works" shall mean any work, whether in Source or Object
      form, that is based on (or derived from) the Work and for which the
      editorial revisions, annotations, elaborations, or other modifications
      represent, as a whole, an original work of authorship. For the purposes
      of this License, Derivative Works shall not include works that remain
      separable from, or merely link (or bind by name) to the interfaces of,
      the Work and Derivative Works thereof.

      "Contribution" shall mean any work of authorship, including
      the original version of the Work and any modifications or additions
      to that Work or Derivative Works thereof, that is intentionally
      submitted to Licensor for inclusion in the Work by the copyright owner
      or by an individual or Legal Entity authorized to submit on behalf of
      the copyright owner. For the purposes of this definition, "submitted"
      means any form of electronic, verbal, or written communication sent
      to the Licensor or its representatives, including but not limited to
      communication on electronic mailing lists, source code control systems,
      and issue tracking systems that are managed by, or on behalf of, the
      Licensor for the purpose of discussing and improving the Work, but
      excluding communication that is conspicuously marked or otherwise
      designated in writing by the copyright owner as "Not a Contribution."

      "Contributor" shall mean Licensor and any individual or Legal Entity
      on behalf of whom a Contribution has been received by Licensor and
      subsequently incorporated within the Work.

   2. Grant of Copyright License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      copyright license to reproduce, prepare Derivative Works of,
      publicly display, publicly perform, sublicense, and distribute the
      Work and such Derivative Works in Source or Object form.

   3. Grant of Patent License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      (except as stated in this section) patent license to make, have made,
      use, offer to sell, sell, import, and otherwise transfer the Work,
      where such license applies only to those patent claims licensable
      by such Contributor that are necessarily infringed by their
      Contribution(s) alone or by combination of their Contribution(s)
      with the Work to which such Contribution(s) was submitted. If You
      institute patent litigation against any entity (including a
      cross-claim or counterclaim in a lawsuit) alleging that the Work
      or a Contribution incorporated within the Work constitutes direct
      or contributory patent infringement, then any patent licenses
      granted to You under this License for that Work shall terminate
      as of the date such litigation is filed.

   4. Redistribution. You may reproduce and distribute copies of the
      Work or Derivative Works thereof in any medium, with or without
      modifications, and in Source or Object form, provided that You
      meet the following conditions:

      (a) You must give any other recipients of the Work or
          Derivative Works a copy of this License; and

      (b) You must cause any modified files to carry prominent notices
          stating that You changed the files; and

      (c) You must retain, in the Source form of any Derivative Works
          that You distribute, all copyright, patent, trademark, and
          attribution notices from the Source form of the Work,
          excluding those notices that do not pertain to any part of
          the Derivative Works; and

      (d) If the Work includes a "NOTICE" text file as part of its
          distribution, then any Derivative Works that You distribute must
          include a readable copy of the attribution notices contained
          within such NOTICE file, excluding those notices that do not
          pertain to any part of the Derivative Works, in at least one
          of the following places: within a NOTICE text file distributed
          as part of the Derivative Works; within the Source form or
          documentation, if provided along with the Derivative Works; or,
          within a display generated by the Derivative Works, if and
          wherever such third-party notices normally appear. The contents
          of the NOTICE file are for informational purposes only and
          do not modify the License. You may add Your own attribution
          notices within Derivative Works that You distribute, alongside
          or as an addendum to the NOTICE text from the Work, provided
          that such additional attribution notices cannot be construed
          as modifying the License.

      You may add Your own copyright statement to Your modifications and
      may provide additional or different license terms and conditions
      for use, reproduction, or distribution of Your modifications, or
      for any such Derivative Works as a whole, provided Your use,
      reproduction, and distribution of the Work otherwise complies with
      the conditions stated in this License.

   5. Submission of Contributions. Unless You explicitly state otherwise,
      any Contribution intentionally submitted for inclusion in the Work
      by You to the Licensor shall be under the terms and conditions of
      this License, without any additional terms or conditions.
      Notwithstanding the above, nothing herein shall supersede or modify
      the terms of any separate license agreement you may have executed
      with Licensor regarding such Contributions.

   6. Trademarks. This License does not grant permission to use the trade
      names, trademarks, service marks, or product names of the Licensor,
      except as required for reasonable and customary use in describing the
      origin of the Work and reproducing the content of the NOTICE file.

   7. Disclaimer of Warranty. Unless required by applicable law or
      agreed to in writing, Licensor provides the Work (and each
      Contributor provides its Contributions) on an "AS IS" BASIS,
      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
      implied, including, without limitation, any warranties or conditions
      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
      PARTICULAR PURPOSE. You are solely responsible for determining the
      appropriateness of using or redistributing the Work and assume any
      risks associated with Your exercise of permissions under this License.

   8. Limitation of Liability. In no event and under no legal theory,
      whether in tort (including negligence), contract, or otherwise,
      unless required by applicable law (such as deliberate and grossly
      negligent acts) or agreed to in writing, shall any Contributor be
      liable to You for damages, including any direct, indirect, special,
      incidental, or consequential damages of any character arising as a
      result of this License or out of the use or inability to use the
      Work (including but not limited to damages for loss of goodwill,
      work stoppage, computer failure or malfunction, or any and all
      other commercial damages or losses), even if such Contributor
      has been advised of the possibility of such damages.

   9. Accepting Warranty or Additional Liability. While redistributing
      the Work or Derivative Works thereof, You may choose to offer,
      and charge a fee for, acceptance of support, warranty, indemnity,
      or other liability obligations and/or rights consistent with this
      License. However, in accepting such obligations, You may act only
      on Your own behalf and on Your sole responsibility, not on behalf
      of any other Contributor, and only if You agree to indemnify,
      defend, and hold each Contributor harmless for any liability
      incurred by, or claims asserted against, such Contributor by reason
      of your accepting any such warranty or additional liability.

   END OF TERMS AND CONDITIONS

   APPENDIX: How to apply the Apache License to your work.

      To apply the Apache License to your work, attach the following
      boilerplate notice, with the fields enclosed by brackets "[]"
      replaced with your own identifying information. (Don't include
      the brackets!)  The text should be enclosed in the appropriate
      comment syntax for the file format. We also recommend that a
      file or class name and description of purpose be included on the
      same "printed page" as the copyright notice for easier
      identification within third-party archives.

   Copyright [yyyy] [name of copyright owner]

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.


================================================
FILE: NOTICE.txt
================================================
Lepton JPEG compression Rust Port Copyright (c) Microsoft Corporation

NOTICES AND INFORMATION
Do Not Translate or Localize

This software incorporates material from third parties.
Microsoft makes certain open source code available at https://3rdpartysource.microsoft.com,
or you may send a check or money order for US $5.00, including the product name,
the open source component name, platform, and version number, to:

Source Code Compliance Team
Microsoft Corporation
One Microsoft Way
Redmond, WA 98052
USA

Notwithstanding any other terms, you may reverse engineer this software to the extent
required to debug changes to any libraries licensed under the GNU Lesser General Public License.

-------------------

This software includes parts of the Dropbox Lepton project (https://github.com/dropbox/lepton). 
Lepton is a tool and file format for losslessly compressing JPEGs by an average of 22%. Lepton is
licensed under Apache License 2.0, you can find a copy of this license at https://github.com/dropbox/lepton/blob/master/LICENSE

-------------------

This software includes parts of the Libwebp project https://github.com/webmproject/libwebp
licensed under BSD 3-Clause License, you can find a copy of this license at https://github.com/webmproject/libwebp/blob/main/COPYING

----------------

This software includes parts of the uncmpJPG project, which is a library for lossless JPEG decompression
licensed under BSD 2-Clause License, you can find a copy of this license at  http://packjpg.encode.su/?page_id=178


================================================
FILE: README.md
================================================
# Lepton JPEG Compression in Rust 
[![Rust Community](https://img.shields.io/badge/Rust_Community%20-Join_us-brightgreen?style=plastic&logo=rust)](https://www.rust-lang.org/community)

This is a port of the C++ Lepton JPEG compression tool that was released by DropBox [dropbox/lepton](https://github.com/dropbox/lepton). We developed a port of the library to Rust, which has basically the same performance characteristics with the advantage of all the safety features that Rust has to offer, due to the work involved in performing an exhaustive security check on the C++ code and the fact that DropBox has deprecated the codebase.

With precise bit-by-bit recovery of the original JPEG, the Lepton compression library is designed for lossless compression of baseline and progressive JPEGs up to 22%. JPEG storage in a cloud storage system is the main application case. Even metadata headers and invalid content are kept in good condition.

## How to Use This Library

### Rust

The library is published on crates.io as [lepton_jpeg](https://crates.io/crates/lepton_jpeg).

### Python 

The library is published on PyPI as *lepton_jpeg_python*.

```
pip install lepton_jpeg_python
```

``` Python
import lepton_jpeg_python

with open("../images/slrcity.jpg", "rb") as f:
    jpg_data = f.read()

compressed = lepton_jpeg_python.compress_bytes(jpg_data, config)
decompressed = lepton_jpeg_python.decompress_bytes(compressed, config)

assert jpg_data == decompressed
```

### Building From Source

- [Rust 1.89 or Above](https://www.rust-lang.org/tools/install)

``` bash
git clone https://github.com/microsoft/lepton_jpeg_rust
cd lepton_jpeg_rust
cargo build
cargo test
cargo build --release
```

Some operations of this library are vectorized such as the IDCT using the [Wide](https://crates.io/crates/wide) crate, so you can get a significant boost if you enable +AVX2.

### Executable

Building the Rust project generates a `lepton_jpeg_util.exe` wrapper that is built as part of the project. It can be used to compress/decompress and also to verify the test end-to-end on a given JPEG. If the input file has a `.jpg` extension, it will encode. If the input file has a `.lep` extension, it will decode back to the original `.jpg`.

It supports the following options:

`lepton_jpeg_util.exe [options] <inputfile> [<outputfile>]`

| Option                  | Description                                                  |
| ----------------------- | ------------------------------------------------------------ |
| `-threads:n`            | Runs with a maximum of n threads. For encoding, this limits the amount of parallelism that can be gotten out of the decoder. |
| `-dump`                 | Dumps the contents of a JPG or LEP file, with the `-all` option, it will also dump the coefficient image blocks. |
| `-noprogressive`        | Will cause an error if we encounter a progressive file rather than trying to encode it. |
| `-acceptdqtswithzeros`  | Accept images with DQTs with zeros (may cause divide-by-zero). |
| `-iter:n`               | Runs N iterations of the operation. Useful when we are running inside a profiler. |
| `-max-width:n`          | Limit the maximum image width to n pixels, instead of the default 16386. Fails with an error if limit is exceeded. |
| `-max-height:n`         | Limit the maximum image height to n pixels, instead of the default 16386. Fails with an error if limit is exceeded. |

## Contributing

There are many ways in which you can participate in this project, for example:

* [Submit bugs and feature requests](https://github.com/microsoft/lepton_jpeg_rust/issues), and help us verify as they are checked in
* Review [source code changes](https://github.com/microsoft/lepton_jpeg_rust/pulls) or submit your own features as pull requests.
* The library uses only **stable features**, so if you want to take advantage of SIMD features such as AVX2, use the Wide crate (see the idct.rs as an example) rather than intrinsics. 

## Code of Conduct

This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments.

## License

Copyright (c) Microsoft Corporation. All rights reserved.

Licensed under the [Apache 2.0](LICENSE.txt) license.


================================================
FILE: SECURITY.md
================================================
<!-- BEGIN MICROSOFT SECURITY.MD V0.0.8 BLOCK -->

## Security

Microsoft takes the security of our software products and services seriously, which includes all source code repositories managed through our GitHub organizations, which include [Microsoft](https://github.com/microsoft), [Azure](https://github.com/Azure), [DotNet](https://github.com/dotnet), [AspNet](https://github.com/aspnet), [Xamarin](https://github.com/xamarin), and [our GitHub organizations](https://opensource.microsoft.com/).

If you believe you have found a security vulnerability in any Microsoft-owned repository that meets [Microsoft's definition of a security vulnerability](https://aka.ms/opensource/security/definition), please report it to us as described below.

## Reporting Security Issues

**Please do not report security vulnerabilities through public GitHub issues.**

Instead, please report them to the Microsoft Security Response Center (MSRC) at [https://msrc.microsoft.com/create-report](https://aka.ms/opensource/security/create-report).

If you prefer to submit without logging in, send email to [secure@microsoft.com](mailto:secure@microsoft.com).  If possible, encrypt your message with our PGP key; please download it from the [Microsoft Security Response Center PGP Key page](https://aka.ms/opensource/security/pgpkey).

You should receive a response within 24 hours. If for some reason you do not, please follow up via email to ensure we received your original message. Additional information can be found at [microsoft.com/msrc](https://aka.ms/opensource/security/msrc). 

Please include the requested information listed below (as much as you can provide) to help us better understand the nature and scope of the possible issue:

  * Type of issue (e.g. buffer overflow, SQL injection, cross-site scripting, etc.)
  * Full paths of source file(s) related to the manifestation of the issue
  * The location of the affected source code (tag/branch/commit or direct URL)
  * Any special configuration required to reproduce the issue
  * Step-by-step instructions to reproduce the issue
  * Proof-of-concept or exploit code (if possible)
  * Impact of the issue, including how an attacker might exploit the issue

This information will help us triage your report more quickly.

If you are reporting for a bug bounty, more complete reports can contribute to a higher bounty award. Please visit our [Microsoft Bug Bounty Program](https://aka.ms/opensource/security/bounty) page for more details about our active programs.

## Preferred Languages

We prefer all communications to be in English.

## Policy

Microsoft follows the principle of [Coordinated Vulnerability Disclosure](https://aka.ms/opensource/security/cvd).

<!-- END MICROSOFT SECURITY.MD BLOCK -->


================================================
FILE: azure-pipelines.yml
================================================
trigger:
  branches:
    include:
    - main
  tags:
    include:
    - v*.*.*

pr:
  branches:
    include:
    - main

resources:
  repositories:
  - repository: odspPipelines
    type: git
    name: EFun/ODSPTemplates
    ref: refs/heads/main

variables:
- name: toolchainFeed
  value: https://onedrive.pkgs.visualstudio.com/b52099a6-3b13-4b08-9270-a07884a10e3d/_packaging/RustTools/nuget/v3/index.json
- name: cratesIoFeed
  value: sparse+https://onedrive.pkgs.visualstudio.com/b52099a6-3b13-4b08-9270-a07884a10e3d/_packaging/RustCratesIO/Cargo/index/

extends:
  template: v1/OdspPipeline.yml@odspPipelines
  parameters:
    settings:
      template: Official
    sdl:
      clippy:
        enabled: false
    stages:

    ## For more details on the Rust build workflow, see https://eng.ms/docs/cloud-ai-platform/devdiv/one-engineering-system-1es/1es-docs/1es-pipeline-templates/features/buildworkflows/rust
    - stage: BuildStage
      displayName: 🏗️ Cargo build
      pool:
        name: 1ESHostedAgents_Windows2022_v2
        os: windows
      jobs:
          - job: BuildJob
            displayName: 🏗️ Cargo build
            templateContext:
              type: buildJob
              workflow: Rust
              rust:
                rustToolchain:
                  version: ms-prod-1.90
                  toolchainFeed: $(toolchainFeed)
                  cratesIoFeed: $(cratesIoFeed)
                target: x86_64-pc-windows-msvc
                command: custom
              sdl:
                componentgovernance:
                  enabled: true
                  failOnAlert: true
              postBuildSteps:
                - script: |
                    cargo build --workspace --locked 2>&1
                    cargo test --no-run --workspace --locked 2>&1

                  displayName: "Build debug"

                - script: |
                    cargo install junit-test
                    junit-test
                    copy junit.xml $(System.DefaultWorkingDirectory)\TEST-rust.xml
                    rd /s /q target\debug
                  displayName: 'Test debug'

                - task: PublishTestResults@2
                  displayName: 'Publish Test Results **/TEST-*.xml'
                  inputs:
                    mergeTestResults: true

                - script: |
                    set CL=/Qspectre /sdl /Zi /W3
                    set RUSTFLAGS=-Ccontrol-flow-guard -Ctarget-feature=+crt-static,+avx2,+lzcnt -Clink-args=/DYNAMICBASE -Clink-args=/CETCOMPAT
                    cargo build --workspace --locked --release 2>&1
                    copy target\release\lepton_jpeg.dll target\release\lepton_jpeg_avx2.dll
                    copy target\release\lepton_jpeg.pdb target\release\lepton_jpeg_avx2.pdb
                    copy target\release\lepton_jpeg_util.exe target\release\lepton_jpeg_util_avx2.exe
                    copy target\release\lepton_jpeg_util.pdb target\release\lepton_jpeg_util_avx2.pdb
                    set RUSTFLAGS=-Ccontrol-flow-guard -Ctarget-feature=+crt-static -Clink-args=/DYNAMICBASE -Clink-args=/CETCOMPAT
                    cargo build --workspace --locked --release 2>&1
                    rd /s /q target\release\build

                  displayName: 'Build Release'

                - task: UseDotNet@2
                  inputs:
                    packageType: 'sdk'
                    version: '6.x'

                - task: EsrpCodeSigning@5
                  inputs:
                    ConnectedServiceName: 'ESRP CodeSigningV2-OneDrive Service'
                    AppRegistrationClientId: 'bd3fbc52-4cf5-4cca-a25d-94160e5ed309'
                    AppRegistrationTenantId: 'cdc5aeea-15c5-4db6-b079-fcadd2505dc2'
                    AuthAKVName: 'ODSP-ESRP'
                    AuthCertName: 'ODSP-ESRP-Auth-V2'
                    AuthSignCertName: 'CodeSigningCertificate'
                    FolderPath: '$(Build.SourcesDirectory)'
                    Pattern: '
                      target\release\lepton_jpeg.dll,
                      target\release\lepton_jpeg_avx2.dll,
                      target\release\lepton_jpeg_util.exe,
                      target\release\lepton_jpeg_util_avx2.exe'
                    signConfigType: 'inlineSignParams'
                    inlineOperation: |
                      [
                      {
                        "KeyCode": "CP-401405",
                        "OperationCode": "SigntoolSign",
                        "ToolName": "sign",
                        "ToolVersion": "1.0",
                        "Parameters": {
                        "OpusName": "Microsoft",
                        "OpusInfo": "https://www.microsoft.com",
                        "FileDigest": "/fd SHA256",
                        "PageHash": "/NPH",
                        "TimeStamp": "/tr \"http://rfc3161.gtm.corp.microsoft.com/TSS/HttpTspServer\" /td sha256"
                        }
                      },
                      {
                        "KeyCode": "CP-401405",
                        "OperationCode": "SigntoolVerify",
                        "ToolName": "sign",
                        "ToolVersion": "1.0",
                        "Parameters": {}
                      }
                      ]
                    SessionTimeout: '60'
                    MaxConcurrency: '50'
                    MaxRetryAttempts: '5'
                    PendingAnalysisWaitTimeoutMinutes: '5'

                - task: CopyFiles@2
                  displayName: 'Copy Rust output files to: $(Build.ArtifactStagingDirectory) copy'
                  inputs:
                    SourceFolder: '$(Build.SourcesDirectory)'
                    Contents: |
                      target\debug\?(*.dll|*.exe|*.pdb)
                      target\release\?(*.dll|*.exe|*.pdb)

                    TargetFolder: '$(Build.ArtifactStagingDirectory)'

                - task: PublishSymbols@2
                  displayName: 'Publish symbols copy'
                  inputs:
                    SymbolsFolder: '$(Build.ArtifactStagingDirectory)'
                    SearchPattern: '**\*.pdb'
                    SymbolServerType: TeamServices

                - task: NuGetCommand@2
                  displayName: 'NuGet pack'
                  inputs:
                    command: pack
                    packagesToPack: package/Lepton.Jpeg.Rust.nuspec

                - task: 1ES.PublishNuGet@1
                  displayName: 'NuGet push'
                  condition: and(succeeded(), startsWith(variables['Build.SourceBranch'], 'refs/tags/v'))
                  inputs:
                    packageParentPath: '$(Pipeline.Workspace)'
                    packagesToPush: '$(Build.ArtifactStagingDirectory)\*.nupkg'
                    publishVstsFeed: 'b87285d9-99ab-48db-a000-cb0cc8a2a1b5'
                    allowPackageConflicts: true


================================================
FILE: benches/benches.rs
================================================
use std::{io::Cursor, time::Duration};

use criterion::{Criterion, criterion_group, criterion_main};
use lepton_jpeg::{EnabledFeatures, SingleThreadPool};

/// Loads `images/<filename><ext>` (relative to the workspace root) fully into memory.
///
/// Panics if the file cannot be opened or read, which is acceptable for a benchmark fixture.
fn read_file(filename: &str, ext: &str) -> Vec<u8> {
    let mut path = std::path::PathBuf::from(env!("WORKSPACE_ROOT"));
    path.push("images");
    path.push(format!("{filename}{ext}"));

    std::fs::read(path).unwrap()
}

/// Registers the whole-file benchmarks (`Lepton encode` / `Lepton decode`) in the
/// `end_to_end` group, using the `iphone` sample image and a single-threaded pool.
fn end_to_end_benches(c: &mut Criterion) {
    let thread_pool = SingleThreadPool::default();

    let mut group = c.benchmark_group("end_to_end");
    group
        .sampling_mode(criterion::SamplingMode::Flat)
        .warm_up_time(Duration::from_secs(1))
        .measurement_time(Duration::from_secs(10));

    let jpeg = read_file("iphone", ".jpg");
    let lep = read_file("iphone", ".lep");

    // JPEG -> Lepton; the result is returned from the closure so criterion consumes it
    group.bench_function("Lepton encode", |bencher| {
        bencher.iter(|| {
            let mut encoded = Vec::with_capacity(jpeg.len());
            lepton_jpeg::encode_lepton(
                &mut Cursor::new(&jpeg),
                &mut Cursor::new(&mut encoded),
                &EnabledFeatures::compat_lepton_vector_write(),
                &thread_pool,
            )
        })
    });

    // Lepton -> JPEG
    group.bench_function("Lepton decode", |bencher| {
        bencher.iter(|| {
            let mut decoded = Vec::with_capacity(lep.len());
            lepton_jpeg::decode_lepton(
                &mut Cursor::new(&lep),
                &mut Cursor::new(&mut decoded),
                &EnabledFeatures::compat_lepton_vector_write(),
                &thread_pool,
            )
        })
    });

    group.finish();
}

criterion_group!(group1, end_to_end_benches);

fn micro_benchmarks(c: &mut Criterion) {
    use lepton_jpeg::micro_benchmark::{
        benchmark_idct, benchmark_read_block, benchmark_read_jpeg, benchmark_roundtrip_coefficient,
        benchmark_write_block, benchmark_write_jpeg,
    };

    c.bench_function("jpeg read", |b| b.iter(benchmark_read_jpeg()));

    c.bench_function("jpeg write", |b| b.iter(benchmark_write_jpeg()));

    c.bench_function("roundtrip coefficient write", |b| {
        b.iter(benchmark_roundtrip_coefficient())
    });

    c.bench_function("idct benchmark", |b| b.iter(benchmark_idct()));

    c.bench_function("jpeg read block", |b| b.iter(benchmark_read_block()));

    c.bench_function("jpeg write block", |b| b.iter(benchmark_write_block()));
}

criterion_group!(group2, micro_benchmarks);

criterion_main!(group1, group2);


================================================
FILE: dll/Cargo.toml
================================================
# Package that wraps the lepton_jpeg library in a C-callable dynamic library.
[package]
name = "lepton_jpeg_dll"
# version is inherited from the workspace manifest
version.workspace = true
edition = "2024"
authors = ["Kristof Roomp <kristofr@microsoft.com>"]

[dependencies]
lepton_jpeg = { path = "../lib" }
rayon = "1"
msvc_spectre_libs = "0.1.3"

[dev-dependencies]
rstest = "0.22"

[lib]
# built as a C-compatible dynamic library
crate-type = ["cdylib"]
# the library target is named lepton_jpeg, not lepton_jpeg_dll like the package
name = "lepton_jpeg"

================================================
FILE: dll/src/lib.rs
================================================
/*---------------------------------------------------------------------------------------------
 *  Copyright (c) Microsoft Corporation. All rights reserved.
 *  Licensed under the Apache License, Version 2.0. See LICENSE.txt in the project root for license information.
 *  This software incorporates material from third parties. See NOTICE.txt for details.
 *--------------------------------------------------------------------------------------------*/

#![forbid(trivial_numeric_casts)]

use std::{
    collections::VecDeque,
    io::Cursor,
    sync::{
        LazyLock,
        atomic::{AtomicU32, Ordering},
    },
};

use lepton_jpeg::{
    DEFAULT_THREAD_POOL, EnabledFeatures, ExitCode, LeptonFileReader, LeptonThreadPool,
    SingleThreadPool, ThreadPoolHolder, catch_unwind_result, decode_lepton, encode_lepton,
    get_git_version,
};

/// copies a string into a limited length zero terminated utf8 buffer
/// Copies `str` into `target_error_string` as a zero terminated UTF-8 string, truncating it
/// if the buffer is too small.
///
/// * An empty target buffer is left untouched.
/// * The text is cut at the first interior NUL (a C reader would stop there anyway), so
///   this never panics on unusual messages. That matters because it runs on the FFI
///   error path.
/// * Truncation always happens on a character boundary, so the buffer never ends with
///   half of a multi-byte UTF-8 sequence.
/// * Bytes after the terminating NUL are not modified.
fn copy_cstring_utf8_to_buffer(str: &str, target_error_string: &mut [u8]) {
    // one byte is always reserved for the terminator
    let Some(capacity) = target_error_string.len().checked_sub(1) else {
        return;
    };

    // everything before the first NUL (the whole string if there is none)
    let text = str.split('\0').next().unwrap_or("");

    // copy as much as fits, backing up so a multi-byte character is never split
    let mut copy_len = text.len().min(capacity);
    while !text.is_char_boundary(copy_len) {
        copy_len -= 1;
    }

    target_error_string[..copy_len].copy_from_slice(&text.as_bytes()[..copy_len]);

    // always null terminated
    target_error_string[copy_len] = 0;
}

/// Thread pool implementation backed by a rayon pool.
struct RayonThreadPool {
    /// The rayon pool; `LazyLock` defers building it until it is first dereferenced.
    pool: LazyLock<rayon::ThreadPool>,
}

// Exposes the rayon pool through the LeptonThreadPool interface.
impl LeptonThreadPool for RayonThreadPool {
    /// Returns the current value of `NUM_THREADS`.
    ///
    /// NOTE(review): this reads the atomic rather than asking the pool, so it may not match
    /// the pool's actual size if `set_num_threads` is called after the pool was built.
    fn max_parallelism(&self) -> usize {
        NUM_THREADS.load(Ordering::SeqCst) as usize
    }
    /// Hands the closure to the rayon pool via `spawn` (building the pool on first use).
    fn run(&self, f: Box<dyn FnOnce() + Send + 'static>) {
        self.pool.spawn(f);
    }
}

/// Number of threads the rayon pool is built with. Starts at 8 and can be changed through
/// `set_num_threads`.
static NUM_THREADS: AtomicU32 = AtomicU32::new(8);

/// Process-wide rayon-backed pool. The pool is built lazily and reads `NUM_THREADS` at that
/// moment, so later changes to `NUM_THREADS` do not resize it.
static RAYON_THREAD_POOL: RayonThreadPool = RayonThreadPool {
    pool: LazyLock::new(|| {
        rayon::ThreadPoolBuilder::new()
            .num_threads(NUM_THREADS.load(Ordering::SeqCst) as usize) // default to 8 threads, can be adjusted
            .build()
            .unwrap()
    }),
};

/// C ABI interface for setting the number of threads to use for compression and decompression
/// when the rayon thread pool is selected.
///
/// This can only be called before any compression or decompression is done, as we cannot
/// change the number of threads in the threadpool once it is created.
///
/// The function is exported unmangled so that it is actually reachable from C, like the
/// other entry points of this DLL.
///
/// # Safety
///
/// The function only performs an atomic store; it is `unsafe` purely for signature
/// compatibility and has no additional requirements.
#[unsafe(no_mangle)]
pub unsafe extern "C" fn set_num_threads(num_threads: u32) {
    NUM_THREADS.store(num_threads, Ordering::SeqCst);
}

/// C ABI interface for compressing image, exposed from DLL.
///
/// Original entry point: forwards to [`WrapperCompressImage3`] with no flags and no error
/// string buffer, and throws away the reported CPU usage.
///
/// # Safety
///
/// The pointer/size pairs and `result_size` are passed straight to
/// [`WrapperCompressImage3`] and must satisfy its requirements.
#[unsafe(no_mangle)]
#[allow(non_snake_case, unsafe_op_in_unsafe_fn)]
pub unsafe extern "C" fn WrapperCompressImage(
    input_buffer: *const u8,
    input_buffer_size: u64,
    output_buffer: *mut u8,
    output_buffer_size: u64,
    number_of_threads: i32,
    result_size: *mut u64,
) -> i32 {
    // the callee stores the cpu usage on success, so it needs somewhere to write to
    let mut discarded_cpu_usage = 0u64;
    let no_error_buffer = std::ptr::null_mut();

    WrapperCompressImage3(
        input_buffer,
        input_buffer_size,
        output_buffer,
        output_buffer_size,
        number_of_threads as u32,
        result_size,
        &mut discarded_cpu_usage,
        0,
        no_error_buffer,
        0,
    )
}

/// C ABI interface for compressing image, exposed from DLL
///
/// Second version of the interface: adds `cpu_usage` and `flags`. It forwards everything to
/// `WrapperCompressImage3` and passes a null, zero-length error string buffer, so no error
/// text is returned.
///
/// # Safety
///
/// All pointers are passed unchanged to `WrapperCompressImage3`, which reads
/// `input_buffer_size` bytes from `input_buffer`, may write `output_buffer_size` bytes to
/// `output_buffer`, and writes through `result_size` and `cpu_usage` on success.
#[unsafe(no_mangle)]
#[allow(non_snake_case, unsafe_op_in_unsafe_fn)]
pub unsafe extern "C" fn WrapperCompressImage2(
    input_buffer: *const u8,
    input_buffer_size: u64,
    output_buffer: *mut u8,
    output_buffer_size: u64,
    number_of_threads: u32,
    result_size: *mut u64,
    cpu_usage: *mut u64,
    flags: u32,
) -> i32 {
    WrapperCompressImage3(
        input_buffer,
        input_buffer_size,
        output_buffer,
        output_buffer_size,
        number_of_threads,
        result_size,
        cpu_usage,
        flags,
        // no error string buffer
        std::ptr::null_mut(),
        0,
    )
}

/// C ABI interface for compressing image, exposed from DLL
///
/// Compresses the JPEG in `input_buffer` into Lepton format in `output_buffer`.
///
/// * `number_of_threads` - if non-zero, overrides the maximum number of partitions.
/// * `result_size` - receives the number of bytes written to `output_buffer` on success.
/// * `cpu_usage` - receives the worker CPU time in milliseconds on success.
/// * `flags` - `USE_RAYON_THREAD_POOL` or `USE_SINGLE_THREAD_POOL` select the thread pool
///   (rayon wins if both are set); otherwise the default pool is used.
/// * `error_string` / `error_string_buffer_len` - optional buffer that receives a zero
///   terminated UTF-8 error message on failure. It is skipped if the pointer is null or
///   the length is zero.
///
/// Returns 0 on success, otherwise the integer error code of the failure.
///
/// # Safety
///
/// `input_buffer` must be readable for `input_buffer_size` bytes and `output_buffer`
/// writable for `output_buffer_size` bytes. `result_size` and `cpu_usage` must be valid
/// for writes. A non-null `error_string` must be writable for `error_string_buffer_len`
/// bytes.
#[unsafe(no_mangle)]
#[allow(non_snake_case, unsafe_op_in_unsafe_fn)]
pub unsafe extern "C" fn WrapperCompressImage3(
    input_buffer: *const u8,
    input_buffer_size: u64,
    output_buffer: *mut u8,
    output_buffer_size: u64,
    number_of_threads: u32,
    result_size: *mut u64,
    cpu_usage: *mut u64,
    flags: u32,
    error_string: *mut std::os::raw::c_uchar,
    error_string_buffer_len: u64,
) -> i32 {
    match catch_unwind_result(|| {
        let input = std::slice::from_raw_parts(input_buffer, input_buffer_size as usize);

        let output = std::slice::from_raw_parts_mut(output_buffer, output_buffer_size as usize);

        let mut reader = Cursor::new(input);
        let mut writer = Cursor::new(output);

        let mut features = EnabledFeatures::compat_lepton_vector_write();
        if number_of_threads > 0 {
            features.max_partitions = number_of_threads;
        }

        let thread_pool: &dyn LeptonThreadPool = if flags & USE_RAYON_THREAD_POOL != 0 {
            &RAYON_THREAD_POOL
        } else if flags & USE_SINGLE_THREAD_POOL != 0 {
            &SingleThreadPool::default()
        } else {
            &DEFAULT_THREAD_POOL
        };

        let metrics = encode_lepton(&mut reader, &mut writer, &features, thread_pool)?;

        *result_size = writer.position().into();
        *cpu_usage = metrics.get_cpu_time_worker_time().as_millis() as u64;

        Ok(())
    }) {
        Ok(()) => 0,
        Err(e) => {
            // the error buffer is optional: building a slice from a null pointer would be
            // undefined behavior, so require both a pointer and a length
            if !error_string.is_null() && error_string_buffer_len > 0 {
                copy_cstring_utf8_to_buffer(
                    e.message(),
                    std::slice::from_raw_parts_mut(error_string, error_string_buffer_len as usize),
                );
            }

            e.exit_code().as_integer_error_code()
        }
    }
}

/// C ABI interface for decompressing image, exposed from DLL.
///
/// Original entry point: forwards to [`WrapperDecompressImage4`] with no flags and no error
/// string buffer, and throws away the reported CPU usage.
///
/// # Safety
///
/// The pointer/size pairs and `result_size` are passed straight to
/// [`WrapperDecompressImage4`] and must satisfy its requirements.
#[unsafe(no_mangle)]
#[allow(non_snake_case, unsafe_op_in_unsafe_fn)]
pub unsafe extern "C" fn WrapperDecompressImage(
    input_buffer: *const u8,
    input_buffer_size: u64,
    output_buffer: *mut u8,
    output_buffer_size: u64,
    number_of_threads: i32,
    result_size: *mut u64,
) -> i32 {
    // the callee stores the cpu usage on success, so it needs somewhere to write to
    let mut discarded_cpu_usage = 0u64;
    let no_error_buffer = std::ptr::null_mut();

    WrapperDecompressImage4(
        input_buffer,
        input_buffer_size,
        output_buffer,
        output_buffer_size,
        number_of_threads as u32,
        result_size,
        &mut discarded_cpu_usage,
        0,
        no_error_buffer,
        0,
    )
}

/// C ABI interface for decompressing image, exposed from DLL.
///
/// The `use_16bit_dc_estimate` argument should be set to true only for images that were
/// compressed by the C++ version of Lepton (see the comments in
/// [`WrapperDecompressImage4`]). It is translated into the
/// `DECOMPRESS_USE_16BIT_DC_ESTIMATE` flag; no error string buffer is supplied and the
/// reported CPU usage is discarded.
///
/// # Safety
///
/// The pointer/size pairs and `result_size` are passed straight to
/// [`WrapperDecompressImage4`] and must satisfy its requirements.
#[unsafe(no_mangle)]
#[allow(non_snake_case, unsafe_op_in_unsafe_fn)]
pub unsafe extern "C" fn WrapperDecompressImageEx(
    input_buffer: *const u8,
    input_buffer_size: u64,
    output_buffer: *mut u8,
    output_buffer_size: u64,
    number_of_threads: i32,
    result_size: *mut u64,
    use_16bit_dc_estimate: bool,
) -> i32 {
    let flags = if use_16bit_dc_estimate {
        DECOMPRESS_USE_16BIT_DC_ESTIMATE
    } else {
        0
    };

    // the callee stores the cpu usage on success, so it needs somewhere to write to
    let mut discarded_cpu_usage = 0u64;

    WrapperDecompressImage4(
        input_buffer,
        input_buffer_size,
        output_buffer,
        output_buffer_size,
        number_of_threads as u32,
        result_size,
        &mut discarded_cpu_usage,
        flags,
        std::ptr::null_mut(),
        0,
    )
}

/// C ABI interface for decompressing image, exposed from DLL.
///
/// Third version of the interface: takes `cpu_usage` and `flags`. It forwards everything to
/// `WrapperDecompressImage4` and passes a null, zero-length error string buffer, so no
/// error text is returned.
///
/// # Safety
///
/// All pointers are passed unchanged to `WrapperDecompressImage4`, which reads
/// `input_buffer_size` bytes from `input_buffer`, may write `output_buffer_size` bytes to
/// `output_buffer`, and writes through `result_size` and `cpu_usage` on success.
#[unsafe(no_mangle)]
#[allow(non_snake_case, unsafe_op_in_unsafe_fn)]
pub unsafe extern "C" fn WrapperDecompressImage3(
    input_buffer: *const u8,
    input_buffer_size: u64,
    output_buffer: *mut u8,
    output_buffer_size: u64,
    number_of_threads: u32,
    result_size: *mut u64,
    cpu_usage: *mut u64,
    flags: u32,
) -> i32 {
    WrapperDecompressImage4(
        input_buffer,
        input_buffer_size,
        output_buffer,
        output_buffer_size,
        number_of_threads,
        result_size,
        cpu_usage,
        flags,
        // no error string buffer
        std::ptr::null_mut(),
        0,
    )
}

/// C ABI interface for decompressing image, exposed from DLL.
///
/// Decompresses the Lepton file in `input_buffer` into the JPEG in `output_buffer`.
///
/// * `number_of_threads` - if non-zero, overrides the maximum number of partitions.
/// * `result_size` - receives the number of bytes written to `output_buffer` on success.
/// * `cpu_usage` - receives the worker CPU time in milliseconds on success.
/// * `flags` - `DECOMPRESS_USE_16BIT_DC_ESTIMATE` enables the 16 bit back-compat mode;
///   `USE_RAYON_THREAD_POOL` or `USE_SINGLE_THREAD_POOL` select the thread pool (rayon wins
///   if both are set), otherwise the default pool is used.
/// * `error_string` / `error_string_buffer_len` - optional buffer that receives a zero
///   terminated UTF-8 error message on failure. It is skipped if the pointer is null or
///   the length is zero.
///
/// Returns 0 on success, otherwise the integer error code of the failure.
///
/// # Safety
///
/// `input_buffer` must be readable for `input_buffer_size` bytes and `output_buffer`
/// writable for `output_buffer_size` bytes. `result_size` and `cpu_usage` must be valid
/// for writes. A non-null `error_string` must be writable for `error_string_buffer_len`
/// bytes.
#[unsafe(no_mangle)]
#[allow(non_snake_case, unsafe_op_in_unsafe_fn)]
pub unsafe extern "C" fn WrapperDecompressImage4(
    input_buffer: *const u8,
    input_buffer_size: u64,
    output_buffer: *mut u8,
    output_buffer_size: u64,
    number_of_threads: u32,
    result_size: *mut u64,
    cpu_usage: *mut u64,
    flags: u32,
    error_string: *mut std::os::raw::c_uchar,
    error_string_buffer_len: u64,
) -> i32 {
    match catch_unwind_result(|| {
        // For back-compat with C++ version we allow decompression of images with zeros in DQT tables

        // C++ version has a bug where it uses 16 bit math in the SIMD path and 32 bit math in the scalar path
        // depending on the compiler options. If use_16bit_dc_estimate=true, the decompression uses a back-compat
        // mode that considers it. The caller should set use_16bit_dc_estimate to true only for images that were
        // compressed by C++ version with relevant compiler options.

        // this is a bit of a mess since for a while we were encoding a mix of 16 and 32 bit math
        // (hence the two parameters in features).

        let mut enabled_features = EnabledFeatures {
            use_16bit_dc_estimate: (flags & DECOMPRESS_USE_16BIT_DC_ESTIMATE != 0),
            ..EnabledFeatures::compat_lepton_vector_read()
        };

        if number_of_threads > 0 {
            enabled_features.max_partitions = number_of_threads;
        }

        let thread_pool: &dyn LeptonThreadPool = if flags & USE_RAYON_THREAD_POOL != 0 {
            &RAYON_THREAD_POOL
        } else if flags & USE_SINGLE_THREAD_POOL != 0 {
            &SingleThreadPool::default()
        } else {
            &DEFAULT_THREAD_POOL
        };

        // runs at most twice: the second pass only happens for the 16 bit retry below
        loop {
            let input = std::slice::from_raw_parts(input_buffer, input_buffer_size as usize);
            let output = std::slice::from_raw_parts_mut(output_buffer, output_buffer_size as usize);

            let mut reader = Cursor::new(input);
            let mut writer = Cursor::new(output);

            match decode_lepton(&mut reader, &mut writer, &mut enabled_features, thread_pool) {
                Ok(metrics) => {
                    *result_size = writer.position().into();
                    *cpu_usage = metrics.get_cpu_time_worker_time().as_millis() as u64;
                    return Ok(());
                }
                Err(e) => {
                    // The retry logic below runs if the caller did not pass use_16bit_dc_estimate=true, but the decompression
                    // encountered StreamInconsistent failure which is commonly caused by the C++ 16 bit bug. In this case
                    // we retry the decompression with use_16bit_dc_estimate=true.
                    // Note that it's preferable for the caller to pass use_16bit_dc_estimate properly and not to rely on this
                    // retry logic, that may miss some cases leading to bad (corrupted) decompression results.
                    if e.exit_code() == ExitCode::StreamInconsistent
                        && !enabled_features.use_16bit_dc_estimate
                    {
                        enabled_features.use_16bit_dc_estimate = true;
                        continue;
                    }

                    return Err(e.into());
                }
            }
        }
    }) {
        Ok(()) => 0,
        Err(e) => {
            // the error buffer is optional: building a slice from a null pointer would be
            // undefined behavior, so require both a pointer and a length
            if !error_string.is_null() && error_string_buffer_len > 0 {
                copy_cstring_utf8_to_buffer(
                    e.message(),
                    std::slice::from_raw_parts_mut(error_string, error_string_buffer_len as usize),
                );
            }
            e.exit_code().as_integer_error_code()
        }
    }
}

static PACKAGE_VERSION: &str = env!("CARGO_PKG_VERSION");

/// Returns the version as `<package version>-<git version>`.
pub fn get_version_string() -> String {
    let git_version = get_git_version();
    format!("{PACKAGE_VERSION}-{git_version}")
}

/// C ABI interface returning the package version and the git version as C strings.
///
/// Both out parameters receive pointers to zero terminated strings that stay valid for the
/// lifetime of the process and must not be freed by the caller.
///
/// Rust string slices are not zero terminated, so handing out `str::as_ptr()` as a
/// `*const c_char` would make a C caller read past the end of the text. Zero terminated
/// copies are therefore built once and cached. If a version string ever contained an
/// interior NUL, an empty string is returned for it instead of panicking across the FFI
/// boundary.
///
/// # Safety
///
/// `package` and `git` must be valid, writable pointer slots.
#[unsafe(no_mangle)]
pub unsafe extern "C" fn get_version(
    package: &mut *const std::os::raw::c_char,
    git: &mut *const std::os::raw::c_char,
) {
    static PACKAGE_C: std::sync::LazyLock<std::ffi::CString> = std::sync::LazyLock::new(|| {
        std::ffi::CString::new(PACKAGE_VERSION.as_bytes()).unwrap_or_default()
    });
    static GIT_C: std::sync::LazyLock<std::ffi::CString> = std::sync::LazyLock::new(|| {
        std::ffi::CString::new(get_git_version().as_bytes()).unwrap_or_default()
    });

    *git = GIT_C.as_ptr();
    *package = PACKAGE_C.as_ptr();
}

// wraps unmanaged context for decompression and tries to ensure that it is valid
// when passed in from C# or C++ code and that it is freed only once.
//
// Of course, there aren't any guarantees since passing raw pointers around is inherently unsafe,
// but we do our best to catch common mistakes and point the blame in the right direction.
struct DecompressionContext<'a> {
    // set to MAGIC_DECOMRESSION_CONTEXT while the context is alive and overwritten when it
    // is freed, so stale or foreign pointers are likely to fail the check
    magic: u32,
    // the streaming Lepton reader that does the actual decompression
    internal: LeptonFileReader<'a>,
    // scratch queue handed to process_limited_buffer on every call
    // NOTE(review): presumably holds output that did not fit in the caller's buffer - confirm
    extra_data: VecDeque<u8>,
}

// marker value stored in DecompressionContext::magic for a live context
const MAGIC_DECOMRESSION_CONTEXT: u32 = 0xdec0de00;

impl<'a> DecompressionContext<'a> {
    /// casts c pointer to a reference, verifying that it is not null and that the magic
    /// number is OK so we can catch some common mistakes early. This is no guarantee, but
    /// helps crash early in many cases.
    ///
    /// # Safety
    ///
    /// `ptr` must be null or point to memory that is readable as a `DecompressionContext`.
    /// To get a usable reference it must have been returned by [`Self::new`] and not yet
    /// passed to [`Self::free`].
    ///
    /// # Panics
    ///
    /// Panics if `ptr` is null or the magic number does not match.
    unsafe fn from_pointer(ptr: *mut std::ffi::c_void) -> &'a mut Self {
        unsafe {
            let context = ptr as *mut DecompressionContext;
            assert!(!context.is_null(), "null context passed in");
            assert_eq!(
                (*context).magic,
                MAGIC_DECOMRESSION_CONTEXT,
                "invalid context passed in"
            );
            &mut *context
        }
    }

    /// allocates a new context and returns a pointer to it
    ///
    /// The returned pointer owns the context and has to be released with [`Self::free`].
    unsafe fn new(internal: LeptonFileReader<'a>) -> *mut std::ffi::c_void {
        let context = Box::new(Self {
            magic: MAGIC_DECOMRESSION_CONTEXT,
            internal,
            extra_data: VecDeque::new(),
        });

        Box::into_raw(context) as *mut std::ffi::c_void
    }

    /// frees a context that was created by [`Self::new`]
    ///
    /// The pointer is validated *before* ownership is taken: if the check fails we must
    /// not run the destructor or deallocate memory that we do not own while unwinding.
    ///
    /// # Safety
    ///
    /// `ptr` must be null or point to memory that is readable as a `DecompressionContext`.
    ///
    /// # Panics
    ///
    /// Panics if `ptr` is null or the magic number does not match (e.g. double free).
    unsafe fn free(ptr: *mut std::ffi::c_void) {
        unsafe {
            let context = ptr as *mut DecompressionContext;
            assert!(!context.is_null(), "null context passed in");
            assert_eq!(
                (*context).magic,
                MAGIC_DECOMRESSION_CONTEXT,
                "invalid context passed in"
            );

            // invalidate magic to catch double free. A volatile write is used so the store
            // is not optimized away as dead just because the memory is released right after.
            std::ptr::addr_of_mut!((*context).magic).write_volatile(0xdeaddead);

            // take ownership back and release the context
            drop(Box::from_raw(context));
        }
    }
}

// Bit flags accepted in the `flags` / `features` arguments of the C interface.

/// decompress with `use_16bit_dc_estimate` enabled
const DECOMPRESS_USE_16BIT_DC_ESTIMATE: u32 = 1;
/// run on the shared rayon thread pool (checked first, so it wins over the single thread flag)
const USE_RAYON_THREAD_POOL: u32 = 2;
/// run on a `SingleThreadPool`
const USE_SINGLE_THREAD_POOL: u32 = 4;

/// C ABI interface that creates a context for chunked decompression.
///
/// `features` is a combination of the `DECOMPRESS_USE_16BIT_DC_ESTIMATE`,
/// `USE_RAYON_THREAD_POOL` and `USE_SINGLE_THREAD_POOL` flags. The returned pointer has to
/// be released with `free_decompression_context`.
#[unsafe(no_mangle)]
pub unsafe extern "C" fn create_decompression_context(features: u32) -> *mut std::ffi::c_void {
    let is_set = |flag: u32| features & flag != 0;

    let enabled_features = EnabledFeatures {
        use_16bit_dc_estimate: is_set(DECOMPRESS_USE_16BIT_DC_ESTIMATE),
        ..EnabledFeatures::compat_lepton_vector_read()
    };

    // rayon takes precedence over the single threaded pool; default pool otherwise
    let thread_pool: ThreadPoolHolder = if is_set(USE_RAYON_THREAD_POOL) {
        ThreadPoolHolder::Dyn(&RAYON_THREAD_POOL)
    } else if is_set(USE_SINGLE_THREAD_POOL) {
        ThreadPoolHolder::Owned(Box::new(SingleThreadPool::default()))
    } else {
        ThreadPoolHolder::Dyn(&DEFAULT_THREAD_POOL)
    };

    let reader = LeptonFileReader::new(enabled_features, thread_pool);
    unsafe { DecompressionContext::new(reader) }
}

/// C ABI interface returning the worker CPU time, in milliseconds, recorded so far in the
/// metrics of the given decompression context.
///
/// The context pointer is validated through `DecompressionContext::from_pointer`.
#[unsafe(no_mangle)]
pub unsafe extern "C" fn get_decompression_cpu(context: *mut std::ffi::c_void) -> u64 {
    let decompression = unsafe { DecompressionContext::from_pointer(context) };

    let millis = decompression
        .internal
        .metrics()
        .get_cpu_time_worker_time()
        .as_millis();

    millis as u64
}

/// C ABI interface that releases a context created by `create_decompression_context`.
///
/// The pointer must not be used again after this call.
#[unsafe(no_mangle)]
pub unsafe extern "C" fn free_decompression_context(context: *mut std::ffi::c_void) {
    unsafe {
        DecompressionContext::free(context);
    }
}

/// partially decompresses an image from a Lepton file.
///
/// Feeds `input_buffer` to the context and copies as much output as fits into
/// `output_buffer`; `result_size` receives the number of bytes written to it.
/// `input_complete` tells the decoder that no further input will follow.
///
/// Returns -1 if more data is needed or if there is more data available, or 0 if done successfully.
/// Returns > 0 if there is an error, in which case a zero terminated UTF-8 message is
/// written to `error_string` (skipped if that pointer is null or the length is zero).
///
/// # Safety
///
/// `context` must come from `create_decompression_context` and not have been freed.
/// `input_buffer` must be readable for `input_buffer_size` bytes and `output_buffer`
/// writable for `output_buffer_size` bytes; either pointer may be null when its size is
/// zero. `result_size` must be valid for writes. A non-null `error_string` must be
/// writable for `error_string_buffer_len` bytes.
#[unsafe(no_mangle)]
#[allow(unsafe_op_in_unsafe_fn)]
pub unsafe extern "C" fn decompress_image(
    context: *mut std::ffi::c_void,
    input_buffer: *const u8,
    input_buffer_size: u64,
    input_complete: bool,
    output_buffer: *mut u8,
    output_buffer_size: u64,
    result_size: *mut u64,
    error_string: *mut std::os::raw::c_uchar,
    error_string_buffer_len: u64,
) -> i32 {
    match catch_unwind_result(|| {
        let context = DecompressionContext::from_pointer(context);

        // callers commonly pass a null pointer together with a zero size (e.g. when only
        // signalling input_complete); slices must never be built from a null pointer
        let input: &[u8] = if input_buffer_size == 0 {
            &[]
        } else {
            std::slice::from_raw_parts(input_buffer, input_buffer_size as usize)
        };
        let output: &mut [u8] = if output_buffer_size == 0 {
            &mut []
        } else {
            std::slice::from_raw_parts_mut(output_buffer, output_buffer_size as usize)
        };

        let (done, size) = context.internal.process_limited_buffer(
            input,
            input_complete,
            output,
            &mut context.extra_data,
        )?;

        *result_size = size as u64;
        Ok(done)
    }) {
        Ok(done) => {
            if done {
                0
            } else {
                -1
            }
        }
        Err(e) => {
            // the error buffer is optional: building a slice from a null pointer would be
            // undefined behavior, so require both a pointer and a length
            if !error_string.is_null() && error_string_buffer_len > 0 {
                copy_cstring_utf8_to_buffer(
                    e.message(),
                    std::slice::from_raw_parts_mut(error_string, error_string_buffer_len as usize),
                );
            }
            e.exit_code().as_integer_error_code()
        }
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    use rstest::rstest;

    /// Loads `images/<filename><ext>` (relative to the workspace root) fully into memory,
    /// panicking if the file cannot be opened or read.
    fn read_file(filename: &str, ext: &str) -> Vec<u8> {
        let mut path = std::path::PathBuf::from(env!("WORKSPACE_ROOT"));
        path.push("images");
        path.push(format!("{filename}{ext}"));

        std::fs::read(path).unwrap()
    }

    /// checks utf8 encoding, zero termination and truncation of the error string copy
    #[test]
    fn test_copy_cstring_utf8_to_buffer() {
        // a non-ascii character ends up as its two byte utf8 encoding
        let mut utf8_target = [0u8; 10];
        copy_cstring_utf8_to_buffer("h\u{00E1}llo", &mut utf8_target);
        assert_eq!(&utf8_target, b"h\xc3\xa1llo\0\0\0\0");

        // text that is too long is cut so that the terminator still fits
        let mut short_target = [0u8; 10];
        copy_cstring_utf8_to_buffer("helloeveryone", &mut short_target);
        assert_eq!(&short_target, b"helloever\0");
    }

    /// round-trips a JPEG through the original version of the external interface, which
    /// just delegates to the newest one
    #[test]
    fn extern_interface() {
        let input = read_file("slrcity", ".jpg");
        let scratch_len = input.len() + 10000;

        let mut compressed = vec![0u8; scratch_len];
        let mut result_size = 0u64;

        let retval = unsafe {
            WrapperCompressImage(
                input.as_ptr(),
                input.len() as u64,
                compressed.as_mut_ptr(),
                compressed.len() as u64,
                8,
                &mut result_size,
            )
        };
        assert_eq!(retval, 0);

        let mut original = vec![0u8; scratch_len];
        let mut original_size = 0u64;

        let retval = unsafe {
            WrapperDecompressImageEx(
                compressed.as_ptr(),
                result_size,
                original.as_mut_ptr(),
                original.len() as u64,
                8,
                &mut original_size,
                false,
            )
        };
        assert_eq!(retval, 0);

        assert_eq!(input.len() as u64, original_size);
        assert_eq!(input[..], original[..(original_size as usize)]);
    }

    /// round-trips a JPEG through version 2 of the external interface (default thread pool)
    #[test]
    fn extern_interface_2() {
        let input = read_file("slrcity", ".jpg");
        let scratch_len = input.len() + 10000;

        let mut compressed = vec![0u8; scratch_len];
        let mut result_size = 0u64;
        let mut cpu_usage = 0u64;

        let retval = unsafe {
            WrapperCompressImage2(
                input.as_ptr(),
                input.len() as u64,
                compressed.as_mut_ptr(),
                compressed.len() as u64,
                8,
                &mut result_size,
                &mut cpu_usage,
                0,
            )
        };
        assert_eq!(retval, 0);

        let mut original = vec![0u8; scratch_len];
        let mut original_size = 0u64;

        let retval = unsafe {
            WrapperDecompressImage3(
                compressed.as_ptr(),
                result_size,
                original.as_mut_ptr(),
                original.len() as u64,
                8,
                &mut original_size,
                &mut cpu_usage,
                0,
            )
        };
        assert_eq!(retval, 0);

        assert_eq!(input.len() as u64, original_size);
        assert_eq!(input[..], original[..(original_size as usize)]);
    }

    /// round-trips a JPEG through version 2 of the external interface using the single
    /// thread pool
    #[test]
    fn extern_interface_2_single_thread() {
        let input = read_file("slrcity", ".jpg");
        let scratch_len = input.len() + 10000;

        let mut compressed = vec![0u8; scratch_len];
        let mut result_size = 0u64;
        let mut cpu_usage = 0u64;

        let retval = unsafe {
            WrapperCompressImage2(
                input.as_ptr(),
                input.len() as u64,
                compressed.as_mut_ptr(),
                compressed.len() as u64,
                8,
                &mut result_size,
                &mut cpu_usage,
                USE_SINGLE_THREAD_POOL,
            )
        };
        assert_eq!(retval, 0);

        let mut original = vec![0u8; scratch_len];
        let mut original_size = 0u64;

        let retval = unsafe {
            WrapperDecompressImage3(
                compressed.as_ptr(),
                result_size,
                original.as_mut_ptr(),
                original.len() as u64,
                8,
                &mut original_size,
                &mut cpu_usage,
                USE_SINGLE_THREAD_POOL,
            )
        };
        assert_eq!(retval, 0);

        assert_eq!(input.len() as u64, original_size);
        assert_eq!(input[..], original[..(original_size as usize)]);
    }

    /// exercises version 3 of the external interface with the single thread pool: first
    /// the error string reporting for invalid input, then a full round trip
    #[test]
    fn extern_interface_3_single_thread() {
        let input = read_file("slrcity", ".jpg");
        let scratch_len = input.len() + 10000;

        let mut compressed = vec![0u8; scratch_len];
        let mut result_size = 0u64;
        let mut cpu_usage = 0u64;
        let mut error_string = [0u8; 1024];

        // claim that the input is empty: compression has to fail and explain why
        let retval = unsafe {
            WrapperCompressImage3(
                input.as_ptr(),
                0,
                compressed.as_mut_ptr(),
                compressed.len() as u64,
                8,
                &mut result_size,
                &mut cpu_usage,
                USE_SINGLE_THREAD_POOL,
                error_string.as_mut_ptr(),
                error_string.len() as u64,
            )
        };
        assert_ne!(retval, 0);

        // the buffer started zeroed, so stripping the trailing zeros leaves the message
        let error_str = std::str::from_utf8(&error_string)
            .unwrap()
            .trim_end_matches('\0');
        assert!(error_str.contains("jpeg must start with with 0xff 0xd8"));

        // now the real thing
        let retval = unsafe {
            WrapperCompressImage3(
                input.as_ptr(),
                input.len() as u64,
                compressed.as_mut_ptr(),
                compressed.len() as u64,
                8,
                &mut result_size,
                &mut cpu_usage,
                USE_SINGLE_THREAD_POOL,
                error_string.as_mut_ptr(),
                error_string.len() as u64,
            )
        };
        assert_eq!(retval, 0);

        let mut original = vec![0u8; scratch_len];
        let mut original_size = 0u64;

        let retval = unsafe {
            WrapperDecompressImage4(
                compressed.as_ptr(),
                result_size,
                original.as_mut_ptr(),
                original.len() as u64,
                8,
                &mut original_size,
                &mut cpu_usage,
                USE_SINGLE_THREAD_POOL,
                error_string.as_mut_ptr(),
                error_string.len() as u64,
            )
        };
        assert_eq!(retval, 0);

        assert_eq!(input.len() as u64, original_size);
        assert_eq!(input[..], original[..(original_size as usize)]);
    }

    /// tests the chunked decompression interface by feeding the file through deliberately
    /// tiny, oddly sized input and output buffers
    #[rstest]
    fn extern_interface_decompress_chunked(
        #[values(DECOMPRESS_USE_16BIT_DC_ESTIMATE,DECOMPRESS_USE_16BIT_DC_ESTIMATE|USE_RAYON_THREAD_POOL)]
        flags: u32,
    ) {
        use std::io::Read;

        let mut lepton_stream = Cursor::new(read_file("slrcity", ".lep"));
        let mut output = Vec::new();

        let mut input_buffer = [0u8; 7];
        let mut output_buffer = [0u8; 13];
        let mut error_string = [0u8; 1024];

        unsafe {
            let context = create_decompression_context(flags);

            loop {
                // a zero length read means the input is exhausted
                let amount_read = lepton_stream.read(&mut input_buffer).unwrap();

                let mut result_size: u64 = 0;
                let status = decompress_image(
                    context,
                    input_buffer.as_ptr(),
                    amount_read as u64,
                    amount_read == 0,
                    output_buffer.as_mut_ptr(),
                    output_buffer.len() as u64,
                    &mut result_size,
                    error_string.as_mut_ptr(),
                    error_string.len() as u64,
                );

                output.extend_from_slice(&output_buffer[..result_size as usize]);

                match status {
                    0 => break,
                    // need more data
                    -1 => continue,
                    other => panic!("unexpected error {0}", other),
                }
            }

            free_decompression_context(context);
        }

        let expected = read_file("slrcity", ".jpg");
        assert_eq!(expected.len(), output.len());
        assert!(expected[..] == output[..]);
    }

    /// compression of unsupported JPEGs has to fail with the matching exit code
    #[rstest]
    fn verify_extern_interface_rejects_compression_of_unsupported_jpegs(
        #[values(
            ("zeros_in_dqt_tables", ExitCode::UnsupportedJpegWithZeroIdct0),
            ("nonoptimalprogressive", ExitCode::UnsupportedJpeg)
        )]
        file: (&str, ExitCode),
    ) {
        let (name, expected_exit_code) = file;
        let input = read_file(name, ".jpg");

        let mut compressed = vec![0u8; input.len() + 10000];
        let mut result_size = 0u64;
        let mut cpu_usage = 0u64;

        let retval = unsafe {
            WrapperCompressImage2(
                input.as_ptr(),
                input.len() as u64,
                compressed.as_mut_ptr(),
                compressed.len() as u64,
                8,
                &mut result_size,
                &mut cpu_usage,
                0,
            )
        };

        assert_eq!(retval, expected_exit_code.as_integer_error_code());
    }

    /// Compression of images with zeros in DQT tables is rejected since it may lead to
    /// divide-by-zero, but files with this characteristic that were compressed previously
    /// still have to decompress for back-compat.
    #[rstest]
    fn verify_extern_interface_supports_decompression_with_zeros_in_dqt_tables(
        #[values("zeros_in_dqt_tables")] file: &str,
    ) {
        let compressed = read_file(file, ".lep");
        let original = read_file(file, ".jpg");

        let mut decompressed = vec![0u8; original.len() + 10000];
        let mut decompressed_size = 0u64;
        let mut cpu_usage = 0u64;

        let retval = unsafe {
            WrapperDecompressImage3(
                compressed.as_ptr(),
                compressed.len() as u64,
                decompressed.as_mut_ptr(),
                decompressed.len() as u64,
                8,
                &mut decompressed_size,
                &mut cpu_usage,
                0,
            )
        };
        assert_eq!(retval, 0);

        assert_eq!(original.len() as u64, decompressed_size);
        assert_eq!(original[..], decompressed[..(decompressed_size as usize)]);
    }

    /// Decodes existing Lepton files through the external interface with
    /// `DECOMPRESS_USE_16BIT_DC_ESTIMATE` set (C++ backward compatibility) and checks
    /// that the output is byte-for-byte the original jpeg.
    /// Used to detect unexpected divergences in coding format.
    #[rstest]
    fn verify_decode_external_interface_with_use_16bit_dc_estimate(
        #[values(
        "mathoverflow_16",
        "android",
        "androidcrop",
        "androidcropoptions",
        "androidprogressive",
        "androidprogressive_garbage",
        "androidtrail",
        "colorswap",
        "gray2sf",
        "grayscale",
        "hq",
        "iphone",
        "iphonecity",
        "iphonecity_with_16KGarbage",
        "iphonecity_with_1MGarbage",
        "iphonecrop",
        "iphonecrop2",
        "iphoneprogressive",
        "iphoneprogressive2",
        "progressive_late_dht", // image has huffman tables that come very late which causes a verification failure 
        "out_of_order_dqt",     // image with quantization table dqt that comes after image definition SOF
        "narrowrst",
        "nofsync",
        "slrcity",
        "slrhills",
        "slrindoor",
        "tiny",
        "trailingrst",
        "trailingrst2",
        "trunc",
        "eof_and_trailingrst",    // the lepton format has a wrongly set unexpected eof and trailing rst
        "eof_and_trailinghdrdata" // the lepton format has a wrongly set unexpected eof and trailing header data
    )]
        file: &str,
    ) {
        println!("decoding {0:?}", file);

        let lepton_data = read_file(file, ".lep");

        // the 16 bit variant of mathoverflow shares its jpeg with the plain test case
        let jpg_file_name = if file == "mathoverflow_16" {
            "mathoverflow"
        } else {
            file
        };
        let expected_jpeg = read_file(jpg_file_name, ".jpg");

        // output buffer with some slack beyond the expected size
        let mut output = vec![0u8; expected_jpeg.len() + 10000];
        let mut output_size = 0u64;
        let mut cpu_usage = 0u64;

        // SAFETY: every pointer refers to a live local and is paired with that buffer's length.
        let retval = unsafe {
            WrapperDecompressImage3(
                lepton_data.as_ptr(),
                lepton_data.len() as u64,
                output.as_mut_ptr(),
                output.len() as u64,
                8,
                &mut output_size,
                &mut cpu_usage,
                DECOMPRESS_USE_16BIT_DC_ESTIMATE,
            )
        };

        assert_eq!(retval, 0);
        assert_eq!(expected_jpeg.len() as u64, output_size);
        assert_eq!(expected_jpeg[..], output[..output_size as usize]);
    }

    /// Decodes an image that was encoded with 16 bit math while passing no flags to the
    /// external interface, which exercises the retry logic for 16 bit math encoded images.
    #[test]
    fn verify_extern_16bit_math_retry() {
        let lepton_data = read_file("mathoverflow_16", ".lep");
        let expected_jpeg = read_file("mathoverflow", ".jpg");

        // output buffer with some slack beyond the expected size
        let mut output = vec![0u8; expected_jpeg.len() + 10000];
        let mut output_size = 0u64;
        let mut cpu_usage = 0u64;

        // SAFETY: every pointer refers to a live local and is paired with that buffer's length.
        let retval = unsafe {
            WrapperDecompressImage3(
                lepton_data.as_ptr(),
                lepton_data.len() as u64,
                output.as_mut_ptr(),
                output.len() as u64,
                8,
                &mut output_size,
                &mut cpu_usage,
                0,
            )
        };

        assert_eq!(retval, 0);
        assert_eq!(expected_jpeg.len() as u64, output_size);
        assert_eq!(expected_jpeg[..], output[..output_size as usize]);
    }
}


================================================
FILE: fuzz/.cargo/config.toml
================================================
[target.x86_64-unknown-linux-gnu]
rustflags = ["-Ctarget_cpu=native"]

================================================
FILE: fuzz/.gitignore
================================================
target
corpus
artifacts
coverage
*.log

================================================
FILE: fuzz/Cargo.toml
================================================
[package]
name = "lepton_jpeg-fuzz"
version = "0.0.0"
publish = false
edition = "2024"

[package.metadata]
cargo-fuzz = true

[dependencies]
libfuzzer-sys = "0.4"

[dependencies.lepton_jpeg]
path = "../lib"

# Prevent this from interfering with workspaces
[workspace]
members = ["."]

[profile.release]
debug = 1

[[bin]]
name = "fuzz_target_1"
path = "fuzz_targets/fuzz_target_1.rs"
test = false
doc = false


================================================
FILE: fuzz/fuzz_targets/fuzz_target_1.rs
================================================
#![no_main]

use std::io::Cursor;

use lepton_jpeg::{decode_lepton, encode_lepton, EnabledFeatures, DEFAULT_THREAD_POOL};

use libfuzzer_sys::fuzz_target;

fuzz_target!(|data: &[u8]| {
    // The two lowest bits of the input length pick the feature combination:
    // bit 0 turns on the 16 bit math paths, bit 1 turns on acceptance of invalid DHTs.
    let use_16bit = data.len() % 2 != 0;
    let accept_invalid_dht = (data.len() / 2) % 2 != 0;

    // keep the jpeg dimensions small otherwise the fuzzer gets really slow
    let features = EnabledFeatures {
        progressive: true,
        reject_dqts_with_zeros: true,
        max_jpeg_height: 1024,
        max_jpeg_width: 1024,
        use_16bit_dc_estimate: use_16bit,
        use_16bit_adv_predict: use_16bit,
        accept_invalid_dht,
        ..EnabledFeatures::compat_lepton_vector_write()
    };

    let mut output = Vec::new();
    let encoded = encode_lepton(
        &mut Cursor::new(&data),
        &mut Cursor::new(&mut output),
        &features,
        &DEFAULT_THREAD_POOL,
    );

    // Only inputs that the encoder accepted are fed back through the decoder;
    // the decode result itself is deliberately ignored.
    if encoded.is_ok() {
        let mut original = Vec::new();
        let _ = decode_lepton(
            &mut Cursor::new(&output),
            &mut original,
            &features,
            &DEFAULT_THREAD_POOL,
        );
    }
});


================================================
FILE: fuzz/rust-toolchain.toml
================================================
[toolchain]
channel = "nightly"

================================================
FILE: images/hq.lep
================================================
[File too large to display: 18.5 MB]

================================================
FILE: images/iphonecity_with_16KGarbage.jpgoutput
================================================


================================================
FILE: images/t.lepoutput
================================================


================================================
FILE: lib/Cargo.toml
================================================
[package]
name = "lepton_jpeg"
version.workspace = true
edition = "2024"
authors = ["Kristof Roomp <kristofr@microsoft.com>"]

# requires scoped threads, IsTerminal, let chains
rust-version = "1.89"
description = "Rust port of the Lepton lossless JPEG compression library"
readme = "../README.md"
repository = "https://github.com/microsoft/lepton_jpeg_rust"
license = "Apache-2.0"

categories = ["multimedia::images", "multimedia::encoding", "compression"]

# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

[features]
default = []
compression_stats = []
detailed_tracing = []

# used to expose internal functions for micro benchmarking
micro_benchmark = []

[dependencies]
bytemuck = "1"
byteorder = "1.4"
flate2 = "1.0"
default-boxed = "0.2"
wide = "0.8"
log = "0.4"
git-version = "0.3"

[target.'cfg(target_os = "windows")'.dependencies]
cpu-time = "1.0"
thread-priority = "1.0"

[target.'cfg(target_os = "linux")'.dependencies]
thread-priority = "1.0"

[dev-dependencies]
rand = "0.8"
rand_chacha = "0.3"
siphasher = "1"

[lib]
crate-type = ["lib"]


================================================
FILE: lib/src/consts.rs
================================================
/*---------------------------------------------------------------------------------------------
 *  Copyright (c) Microsoft Corporation. All rights reserved.
 *  Licensed under the Apache License, Version 2.0. See LICENSE.txt in the project root for license information.
 *  This software incorporates material from third parties. See NOTICE.txt for details.
 *--------------------------------------------------------------------------------------------*/

use crate::jpeg::jpeg_code;

/// Status values reported while decoding JPEG scan data.
///
/// NOTE(review): the code that produces and consumes these values is outside this file;
/// the variant notes below are read off the variant names and should be confirmed there.
#[derive(PartialEq, Debug)]
pub enum JpegDecodeStatus {
    /// presumably: decoding of the current scan is still ongoing
    DecodeInProgress,
    /// presumably: the current restart interval ended
    RestartIntervalExpired,
    /// presumably: the whole scan has been decoded
    ScanCompleted,
}

/// Kind of JPEG that is being processed.
///
/// NOTE(review): presumably `Sequential` and `Progressive` correspond to the baseline and
/// progressive type bytes of the Lepton header (`LEPTON_HEADER_BASELINE_JPEG_TYPE` /
/// `LEPTON_HEADER_PROGRESSIVE_JPEG_TYPE`) — confirm against the header reader/writer.
#[derive(PartialEq, Debug, Copy, Clone)]
pub enum JpegType {
    /// type has not been determined (yet)
    Unknown,
    Sequential,
    Progressive,
}

// NOTE(review): presumably the number of distinct block types (color channels) that get
// their own model state — confirm against the users of this constant
pub const COLOR_CHANNEL_NUM_BLOCK_TYPES: usize = 3;

/// Indexed by the raster (row-major) position of a coefficient inside an 8x8 block,
/// gives the position of that coefficient in JPEG zigzag order.
pub const RASTER_TO_ZIGZAG: [u8; 64] = [
    0, 1, 5, 6, 14, 15, 27, 28, 2, 4, 7, 13, 16, 26, 29, 42, 3, 8, 12, 17, 25, 30, 41, 43, 9, 11,
    18, 24, 31, 40, 44, 53, 10, 19, 23, 32, 39, 45, 52, 54, 20, 22, 33, 38, 46, 51, 55, 60, 21, 34,
    37, 47, 50, 56, 59, 61, 35, 36, 48, 49, 57, 58, 62, 63,
];

// non-transposed variant, kept for reference only (see ZIGZAG_TO_TRANSPOSED below)
// pub const ZIGZAG_TO_RASTER: [u8; 64] = [
//     0, 1, 8, 16, 9, 2, 3, 10, 17, 24, 32, 25, 18, 11, 4, 5, 12, 19, 26, 33, 40, 48, 41, 34, 27, 20,
//     13, 6, 7, 14, 21, 28, 35, 42, 49, 56, 57, 50, 43, 36, 29, 22, 15, 23, 30, 37, 44, 51, 58, 59,
//     52, 45, 38, 31, 39, 46, 53, 60, 61, 54, 47, 55, 62, 63,
// ];

/// Indexed by zigzag position, gives the index of that coefficient inside a transposed
/// (column-major) 8x8 block, i.e. the entries of the commented-out `ZIGZAG_TO_RASTER`
/// with row and column swapped.
pub const ZIGZAG_TO_TRANSPOSED: [u8; 64] = [
    0, 8, 1, 2, 9, 16, 24, 17, 10, 3, 4, 11, 18, 25, 32, 40, 33, 26, 19, 12, 5, 6, 13, 20, 27, 34,
    41, 48, 56, 49, 42, 35, 28, 21, 14, 7, 15, 22, 29, 36, 43, 50, 57, 58, 51, 44, 37, 30, 23, 31,
    38, 45, 52, 59, 60, 53, 46, 39, 47, 54, 61, 62, 55, 63,
];

// non-transposed variant, kept for reference only (see UNZIGZAG_49_TR below)
// pub const UNZIGZAG_49: [u8; 49] = [
//     9, 10, 17, 25, 18, 11, 12, 19, 26, 33, 41, 34, 27, 20, 13, 14, 21, 28, 35, 42, 49, 57, 50, 43,
//     36, 29, 22, 15, 23, 30, 37, 44, 51, 58, 59, 52, 45, 38, 31, 39, 46, 53, 60, 61, 54, 47, 55, 62,
//     63,
// ];

/// The 49 coefficients of the inner 7x7 area of a block (everything outside the first
/// row and first column) listed in zigzag order, each given as its index inside a
/// transposed 8x8 block.
pub const UNZIGZAG_49_TR: [u8; 49] = [
    9, 17, 10, 11, 18, 25, 33, 26, 19, 12, 13, 20, 27, 34, 41, 49, 42, 35, 28, 21, 14, 15, 22, 29,
    36, 43, 50, 57, 58, 51, 44, 37, 30, 23, 31, 38, 45, 52, 59, 60, 53, 46, 39, 47, 54, 61, 62, 55,
    63,
];

// precalculated int base values for 8x8 IDCT scaled by 8192
// DC coef is zeroed intentionally
pub const ICOS_BASED_8192_SCALED: [i32; 8] = [0, 11363, 10703, 9633, 8192, 6436, 4433, 2260];

// same magnitudes as ICOS_BASED_8192_SCALED but with alternating sign (plus, minus, ...);
// here the first entry is 8192 instead of zero
pub const ICOS_BASED_8192_SCALED_PM: [i32; 8] =
    [8192, -11363, 10703, -9633, 8192, -6436, 4433, -2260];

// NOTE(review): 14 upper bounds, presumably one per edge coefficient (7 + 7) — the
// meaning of the individual values is not visible here, confirm against the users
pub const FREQ_MAX: [u16; 14] = [
    931, 985, 968, 1020, 968, 1020, 1020, 932, 985, 967, 1020, 969, 1020, 1020,
];

// used to get prediction branches basing on nonzero-number predictor `num_non_zeros_context`
// (maps the values 0..=25 onto the 9 bins 0..=8)
pub const NON_ZERO_TO_BIN: [u8; 26] = [
    0, 1, 2, 3, 4, 4, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8, 8,
];

// used to get prediction branches basing on current `num_non_zeros_left_7x7`, 0th element is not used
// (maps the values 1..=49 onto the 9 bins 0..=8)
pub const NON_ZERO_TO_BIN_7X7: [u8; 50] = [
    0, 0, 1, 2, 3, 3, 4, 4, 4, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
    8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
];

// NOTE(review): meaning not visible in this file — confirm against the users of this constant
pub const RESIDUAL_NOISE_FLOOR: usize = 7;

pub const LEPTON_VERSION: u8 = 1; // Lepton version, same as used by Lepton C++ since we support the same format

// NOTE(review): going by the name, the number of input bytes per encoding thread that is
// used to decide the thread count for small files — confirm against the encoder
pub const SMALL_FILE_BYTES_PER_ENCDOING_THREAD: usize = 125000;
pub const MAX_THREADS_SUPPORTED_BY_LEPTON_FORMAT: usize = 16; // Number of threads minus 1 should fit in 4 bits

//pub const SingleFFByte : [u8;1] = [ 0xFF ];
pub const EOI: [u8; 2] = [0xFF, jpeg_code::EOI]; // EOI segment
pub const SOI: [u8; 2] = [0xFF, jpeg_code::SOI]; // SOI segment
pub const LEPTON_FILE_HEADER: [u8; 2] = [0xcf, 0x84]; // the tau symbol for a tau lepton in utf-8

// single byte tags that identify the type of the jpeg inside the Lepton header
pub const LEPTON_HEADER_BASELINE_JPEG_TYPE: [u8; 1] = [b'Z'];
pub const LEPTON_HEADER_PROGRESSIVE_JPEG_TYPE: [u8; 1] = [b'X'];

// ASCII tags used inside the Lepton header; what follows each tag is defined by the
// header reader/writer, which is outside this file
pub const LEPTON_HEADER_MARKER: [u8; 3] = *b"HDR";
pub const LEPTON_HEADER_PAD_MARKER: [u8; 3] = *b"P0D";
pub const LEPTON_HEADER_JPG_RESTARTS_MARKER: [u8; 3] = *b"CRS";
pub const LEPTON_HEADER_JPG_RESTART_ERRORS_MARKER: [u8; 3] = *b"FRS";
pub const LEPTON_HEADER_LUMA_SPLIT_MARKER: [u8; 2] = *b"HH";
pub const LEPTON_HEADER_EARLY_EOF_MARKER: [u8; 3] = *b"EEE";
pub const LEPTON_HEADER_PREFIX_GARBAGE_MARKER: [u8; 3] = *b"PGR";
pub const LEPTON_HEADER_GARBAGE_MARKER: [u8; 3] = *b"GRB";
pub const LEPTON_HEADER_COMPLETION_MARKER: [u8; 3] = *b"CMP";
//pub const ChunkedLeptonHeaderSizeMarker : [u8;3] = *b"SIZ" ;
//pub const ChunkedLeptonHeaderJpgHeaderDataRangeMarker : [u8;3] = *b"JHR";


================================================
FILE: lib/src/enabled_features.rs
================================================
/// Switches and limits that control what the encoder/decoder will accept.
/// Turn features off if they could cause backward compatibility issues.
#[derive(Debug, Clone)]
pub struct EnabledFeatures {
    /// whether progressive images may be read
    pub progressive: bool,

    /// whether images with zeros in their DQTs are rejected (zeros may cause divide-by-zero)
    pub reject_dqts_with_zeros: bool,

    /// widest jpeg that is accepted
    pub max_jpeg_width: u32,

    /// tallest jpeg that is accepted
    pub max_jpeg_height: u32,

    /// The C++ version sadly has a bug where its SIMD path uses 16 bit math while its
    /// scalar path uses 32 bit math; this selects the 16 bit behavior for the DC estimate.
    pub use_16bit_dc_estimate: bool,

    /// The C++ version sadly has a bug where its SIMD path uses 16 bit math while its
    /// scalar path uses 32 bit math; this selects the 16 bit behavior for the advanced prediction.
    pub use_16bit_adv_predict: bool,

    /// whether JPEG files with invalid DHT tables are accepted
    pub accept_invalid_dht: bool,

    /// number of partitions used for encoding
    pub max_partitions: u32,

    /// upper bound on the number of threads used for encoding/decoding
    pub max_processor_threads: u32,

    /// largest jpeg file that is accepted
    pub max_jpeg_file_size: u32,

    /// Stop reading once the end of the valid JPEG file is reached. Useful when
    /// the stream carries other data after the EOI marker.
    ///
    /// As a consequence truncated JPEG files can no longer be handled, since by
    /// definition they have no EOI marker; a ShortRead error is returned for them.
    pub stop_reading_at_eoi: bool,
}

impl EnabledFeatures {
    /// Settings for encoding output that stays compatible with the C++ lepton compiled with SIMD.
    ///
    /// Identical to [`Self::compat_lepton_vector_read`] except that it is stricter about
    /// the input: DQTs with zeros and invalid DHTs are refused and the dimensions are capped.
    #[allow(dead_code)]
    pub fn compat_lepton_vector_write() -> Self {
        Self {
            reject_dqts_with_zeros: true,
            max_jpeg_height: 16386,
            max_jpeg_width: 16386,
            accept_invalid_dht: false,
            ..Self::compat_lepton_vector_read()
        }
    }

    /// Most permissive settings for decoding images written by the C++ lepton that was
    /// built with the scalar compile options.
    ///
    /// Identical to [`Self::compat_lepton_vector_read`] with both 16 bit math flags cleared.
    #[allow(dead_code)]
    pub fn compat_lepton_scalar_read() -> Self {
        Self {
            use_16bit_dc_estimate: false,
            use_16bit_adv_predict: false,
            ..Self::compat_lepton_vector_read()
        }
    }

    /// Most permissive settings for decoding images written by the C++ lepton that was
    /// built with the vector (SSE2/AVX2) compile options.
    #[allow(dead_code)]
    pub fn compat_lepton_vector_read() -> Self {
        Self {
            progressive: true,
            reject_dqts_with_zeros: false,
            max_jpeg_width: u32::MAX,
            max_jpeg_height: u32::MAX,
            use_16bit_dc_estimate: true,
            use_16bit_adv_predict: true,
            accept_invalid_dht: true,
            max_partitions: 8,
            max_processor_threads: 8,
            max_jpeg_file_size: 128 * 1024 * 1024,
            stop_reading_at_eoi: false,
        }
    }
}


================================================
FILE: lib/src/helpers.rs
================================================
/*---------------------------------------------------------------------------------------------
 *  Copyright (c) Microsoft Corporation. All rights reserved.
 *  Licensed under the Apache License, Version 2.0. See LICENSE.txt in the project root for license information.
 *  This software incorporates material from third parties. See NOTICE.txt for details.
 *--------------------------------------------------------------------------------------------*/

use std::panic::{AssertUnwindSafe, catch_unwind};

use crate::lepton_error::{ExitCode, LeptonError};

/// Runs `f` and turns a panic raised inside it into a [`LeptonError`] with
/// [`ExitCode::AssertionFailure`], so callers only ever see a `Result`.
///
/// A panic payload that is a `&str` or a `String` becomes the error message;
/// any other payload is reported as "unknown panic". A regular result of `f`
/// is passed through untouched.
pub fn catch_unwind_result<R>(
    f: impl FnOnce() -> Result<R, LeptonError>,
) -> Result<R, LeptonError> {
    let payload = match catch_unwind(AssertUnwindSafe(f)) {
        Ok(result) => return result,
        Err(payload) => payload,
    };

    let message: &str = if let Some(text) = payload.downcast_ref::<&str>() {
        *text
    } else if let Some(text) = payload.downcast_ref::<String>() {
        text.as_str()
    } else {
        "unknown panic"
    };

    Err(LeptonError::new(ExitCode::AssertionFailure, message))
}

/// Number of bits needed to represent `v`, i.e. the position of its highest set bit
/// counted from one (0 for `v == 0`).
#[inline(always)]
pub const fn u16_bit_length(v: u16) -> u8 {
    (u16::BITS - v.leading_zeros()) as u8
}

/// Number of bits needed to represent `v`, i.e. the position of its highest set bit
/// counted from one (0 for `v == 0`).
#[inline(always)]
pub const fn u32_bit_length(v: u32) -> u8 {
    (u32::BITS - v.leading_zeros()) as u8
}

/// Returns `true` when `buffer` starts with the bytes of `marker`.
///
/// An empty marker matches every buffer. A marker that is longer than the buffer
/// can never be a prefix of it, so that case yields `false` (rather than indexing
/// past the end of the buffer).
pub fn buffer_prefix_matches_marker<const BS: usize, const MS: usize>(
    buffer: [u8; BS],
    marker: [u8; MS],
) -> bool {
    buffer.starts_with(&marker)
}

/// Returns true if any of the 8 bytes of `v` is 0xff.
///
/// Branch-free bit trick: adding 1 to every byte clears the top bit of a byte only
/// if that byte was 0xff (or if a carry arrived from a lower byte, which itself
/// requires a 0xff further down), so a byte whose top bit is set before the addition
/// and clear afterwards signals a 0xff.
#[inline(always)]
pub fn has_ff(v: u64) -> bool {
    const TOP_BIT_OF_EACH_BYTE: u64 = 0x8080808080808080;
    const ONE_IN_EACH_BYTE: u64 = 0x0101010101010101;

    let incremented = v.wrapping_add(ONE_IN_EACH_BYTE);

    (v & !incremented & TOP_BIT_OF_EACH_BYTE) != 0
}

/// Interprets `value` as an `s` bit code: when bit `s - 1` of `value` is set the result
/// is `value` itself, otherwise it is `value - 2^s + 1`. For `s == 0` the value is
/// returned unchanged.
#[inline(always)]
pub const fn devli(s: u8, value: u16) -> i16 {
    let limit: u16 = 1 << s;
    let top_bit_set = value & (limit >> 1) != 0;

    let bits = if top_bit_set {
        value
    } else {
        // branch-free form of `value - limit + 1`, since `!limit == -limit - 1`
        value.wrapping_add(2).wrapping_add(!limit)
    };

    bits as i16
}

/// Compares the optimized `devli` against the straightforward definition for every
/// value of every bit length below 15, to make sure the optimization did not change
/// the behavior.
#[test]
fn devli_test() {
    for s in 0u8..15 {
        for value in 0..(1 << s) {
            let expected = if s != 0 && value < (1 << (s as u16 - 1)) {
                value as i16 + (-1 << s as i16) + 1
            } else {
                value as i16
            };

            assert_eq!(devli(s, value), expected);
        }
    }
}

/// Combines two bytes into a 16 bit value, with `v1` as the high byte and `v2` as the low byte.
#[inline(always)]
pub const fn b_short(v1: u8, v2: u8) -> u16 {
    u16::from_be_bytes([v1, v2])
}

/// Returns the `n` lowest (rightmost) bits of `c`. `n` is expected to be in `1..=8`.
#[inline(always)]
pub const fn rbits(c: u8, n: usize) -> u8 {
    let low_mask: u8 = 0xFF >> (8 - n);
    c & low_mask
}

/// Returns the `n` highest (leftmost) bits of `c`, shifted down to the bottom of the
/// byte. `n` is expected to be in `1..=8`.
#[inline(always)]
pub const fn lbits(c: u8, n: usize) -> u8 {
    let dropped = 8 - n;
    c >> dropped
}

/// Returns bit number `n` of `c` (bit 0 being the least significant one) as 0 or 1.
#[inline(always)]
pub const fn bitn(c: u16, n: u16) -> u8 {
    (c >> n) as u8 & 1
}

/// Maps the sign of `val` to an index: 0 for zero, 1 for positive and 2 for negative values.
#[inline(always)]
pub fn calc_sign_index(val: i16) -> usize {
    match val.cmp(&0) {
        std::cmp::Ordering::Equal => 0,
        std::cmp::Ordering::Greater => 1,
        std::cmp::Ordering::Less => 2,
    }
}

/// Tells whether `v` would have to reallocate in order to take `additional` more elements.
///
/// The comparison is phrased in such a way that the optimizer understands that a
/// subsequent push or extend will not need to grow the vector when this returned false.
#[inline(always)]
pub fn needs_to_grow<T>(v: &Vec<T>, additional: usize) -> bool {
    let spare = v.capacity().wrapping_sub(v.len());
    spare < additional
}

/// Creates a ChaCha12 random number generator from a fixed seed, so that tests
/// which need random data stay reproducible.
#[cfg(test)]
pub fn get_rand_from_seed(seed: [u8; 32]) -> rand_chacha::ChaCha12Rng {
    use rand_chacha::rand_core::SeedableRng;

    rand_chacha::ChaCha12Rng::from_seed(seed)
}

/// Loads `<WORKSPACE_ROOT>/images/<filename><ext>` completely into memory, for testing
/// or benchmarking purposes. Panics if the file cannot be read.
#[cfg(any(test, feature = "micro_benchmark"))]
pub fn read_file(filename: &str, ext: &str) -> Vec<u8> {
    let path = std::path::Path::new(env!("WORKSPACE_ROOT"))
        .join("images")
        .join(format!("{filename}{ext}"));

    std::fs::read(path).unwrap()
}

/*
better way to update arithmetic encoding without using special division

const fn k16bit_length(v : u32) -> u32
{
    const LEN_TABLE256 : [i8;256] =
    [
            0, 1, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4,
            5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
            6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
            6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
            7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
            7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
            7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
            7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
            8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
            8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
            8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
            8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
            8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
            8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
            8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
            8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
    ];

    return if (v & 0xff00) != 0 { 8 + LEN_TABLE256[(v >> 8) as usize] } else { LEN_TABLE256[v as usize] } as u32;
}

const LOG_MAX_NUMERATOR : i32= 18;

const fn calc_divisors() -> [u32;1026]
{
    let mut intermed = [0u32;1026];

    let mut d : u32  = 1;

    while d < 1026
    {
        intermed[d as usize] = ((((1 << k16bit_length(d)) - d) << LOG_MAX_NUMERATOR) / d) + 1;
        d += 1;
    }

    return intermed;
}

const DIVISORS : [u32;1026] = calc_divisors();

#[inline(always)]
pub fn fast_divide18bit_by_10bit(num : u32, denom : u16) -> u32
{
    //debug_assert_eq!(LOG2_LENGTHS[denom as usize], (16 - denom.leading_zeros() - 1) as u8, "log2{0}", denom);

    let tmp = ((DIVISORS[denom as usize] as u64 * num as u64) >> LOG_MAX_NUMERATOR) as u32;
    let r = (tmp + ((num - tmp) >> 1)) >> (16 - denom.leading_zeros() - 1);

    debug_assert_eq!(r, num/(denom as u32));
    return r;
}

*/


================================================
FILE: lib/src/jpeg/bit_reader.rs
================================================
/*---------------------------------------------------------------------------------------------
 *  Copyright (c) Microsoft Corporation. All rights reserved.
 *  Licensed under the Apache License, Version 2.0. See LICENSE.txt in the project root for license information.
 *  This software incorporates material from third parties. See NOTICE.txt for details.
 *--------------------------------------------------------------------------------------------*/

use std::io::BufRead;

use super::jpeg_code;
use crate::helpers::has_ff;
use crate::lepton_error::{ExitCode, err_exit_code};
use crate::{LeptonError, StreamPosition};

// Implementation of a bit reader on top of a JPEG data stream as read by a reader
pub struct BitReader<R> {
    /// underlying reader that supplies the bytes of the stream
    inner: R,
    /// bit register: newly read bytes are shifted in at the bottom, so the unread bits
    /// are the lowest `bits_left` bits
    bits: u64,
    /// number of bits in `bits` that have not been handed out yet
    bits_left: u32,
    /// number of reset codes verified so far; its lowest 3 bits select the RSTn code
    /// that is expected next
    cpos: u32,
    /// set once the underlying reader ran out of data (zero bits are supplied from then on)
    eof: bool,
    /// set when the stream ended directly after a 0xff, i.e. in the middle of an escape
    truncated_ff: bool,
    /// number of bytes that are already loaded into `bits` but have not been consumed
    /// from `inner` yet
    read_ahead_bytes: u32,
}

impl<R: BufRead + StreamPosition> BitReader<R> {
    /// Returns the current position in the stream, which corresponds to the byte that
    /// still has unread bits in it.
    ///
    /// When no partially read byte is pending (or the end of the stream was hit) this is
    /// simply the position of the underlying reader. Otherwise the pending byte is
    /// subtracted again: one byte normally, two bytes when the pending byte is a 0xff
    /// that was read together with its escape byte.
    pub fn stream_position(&mut self) -> u64 {
        self.undo_read_ahead();

        let pos = self.inner.position();

        let byte_pending = self.bits_left > 0 && !self.eof;
        if !byte_pending {
            return pos;
        }

        // a 0xff at the end of a truncated stream had no escape byte following it
        let escaped_ff = self.bits as u8 == 0xff && !self.truncated_ff;
        if escaped_ff { pos - 2 } else { pos - 1 }
    }

    /// Creates a bit reader with an empty bit register on top of `inner`.
    pub fn new(inner: R) -> Self {
        Self {
            inner,
            bits: 0,
            bits_left: 0,
            cpos: 0,
            eof: false,
            truncated_ff: false,
            read_ahead_bytes: 0,
        }
    }
}

impl<R: BufRead> BitReader<R> {
    /// Reads the next `bits_to_read` bits from the stream and returns them in the low
    /// bits of the result. Reading zero bits returns 0 without touching the stream.
    #[inline(always)]
    pub fn read(&mut self, bits_to_read: u32) -> std::io::Result<u16> {
        if bits_to_read == 0 {
            return Ok(0);
        }

        if self.bits_left < bits_to_read {
            self.fill_register(bits_to_read)?;
        }

        // the unread bits sit at the bottom of the register, the oldest ones on top
        let remaining = self.bits_left - bits_to_read;
        let mask = (1u64 << bits_to_read) - 1;
        let value = (self.bits >> remaining) & mask;

        self.bits_left = remaining;
        Ok(value as u16)
    }

    /// Looks at the upcoming bits without consuming them.
    ///
    /// Returns the unread bits of the register left-aligned in a byte, together with
    /// the number of unread bits in the register; the caller has to use that count to
    /// know how many of the returned bits are valid.
    #[inline(always)]
    pub fn peek(&self) -> (u8, u32) {
        let left_aligned = self.bits.wrapping_shl(64 - self.bits_left);
        let top_byte = (left_aligned >> 56) as u8;

        (top_byte, self.bits_left)
    }

    /// Marks `bits` bits of the register as read, without returning them.
    ///
    /// No refill happens here, so `bits` must not exceed the number of unread bits
    /// currently in the register (as reported by `peek`).
    #[inline(always)]
    pub fn advance(&mut self, bits: u32) {
        self.bits_left -= bits;
    }

    /// Loads more bytes from the underlying reader into the bit register.
    ///
    /// Fast path: when at least 8 bytes are buffered and none of them is 0xff, as many
    /// whole bytes as fit are copied into the register in one go. Only the bytes that
    /// are needed for `bits_to_read` are consumed from the reader right away; the rest
    /// is remembered in `read_ahead_bytes` and consumed on the next call (or given back
    /// by `undo_read_ahead`).
    ///
    /// Everything else (short buffer, 0xff escapes, end of stream) is delegated to
    /// `fill_register_slow`.
    #[inline(always)]
    pub fn fill_register(&mut self, bits_to_read: u32) -> Result<(), std::io::Error> {
        // first consume the read_ahead bytes that we have now consumed
        // (otherwise we wouldn't have been called)
        self.inner.consume(self.read_ahead_bytes as usize);

        let fb = self.inner.fill_buf()?;

        // if we have 8 bytes and there is no 0xff in them, then we can just read the bits directly as big endian
        let mut v;
        if fb.len() < 8 || {
            v = u64::from_le_bytes(fb[..8].try_into().unwrap());
            has_ff(v)
        } {
            self.read_ahead_bytes = 0;
            return self.fill_register_slow(bits_to_read);
        }

        // Reverse the byte order so that the first byte of the stream ends up in the
        // most significant position. This has to be an unconditional swap: `v` is a
        // value that was assembled as little endian, and `to_be()` would leave it
        // untouched on big endian hosts.
        v = v.swap_bytes();

        // only fill 63 bits not 64 to avoid having to special case
        // of self.bits << 64 which is a nop
        let bytes_to_read = (63 - self.bits_left) / 8;

        self.bits = self.bits << (bytes_to_read * 8) | v >> (64 - bytes_to_read * 8);
        self.bits_left += bytes_to_read * 8;
        self.read_ahead_bytes = (self.bits_left - bits_to_read) / 8;

        self.inner
            .consume((bytes_to_read - self.read_ahead_bytes) as usize);

        Ok(())
    }

    /// Byte-by-byte refill of the bit register, used whenever the fast path of
    /// `fill_register` does not apply. Always loads at least one byte and keeps going
    /// until at least `bits_to_read` bits are available.
    ///
    /// Handles the 0xff escape (0xff followed by 0x00 stands for a plain 0xff byte) and
    /// the end of the stream (zero bits are supplied and `eof` is set). A 0xff that is
    /// followed by anything other than 0x00 results in an `InvalidResetCode` error.
    #[cold]
    fn fill_register_slow(&mut self, bits_to_read: u32) -> Result<(), std::io::Error> {
        loop {
            let next_byte = self.inner.fill_buf()?.first().copied();

            let incoming: u64 = match next_byte {
                None => {
                    // in case of a truncated file, we treat the rest of the file as zeros, but the
                    // bits that were ok still get returned so that we get the partial last byte right
                    // the caller periodically checks for EOF to see if it should stop encoding
                    self.eof = true;
                    0
                }
                Some(0xff) => {
                    self.inner.consume(1);

                    // 0xff is an escape code, if the next byte is zero, then it is just a normal 0xff
                    let mut follow = [0u8];
                    if self.inner.read(&mut follow)? == 0 {
                        // Truncation in the middle of an escape: since everything past the end
                        // is assumed to be 0, a file ending in 0xff is treated as if it ended in
                        // an escaped 0xff. Not marked as eof yet, since these 8 bits are still
                        // to be read.
                        self.truncated_ff = true;
                    } else if follow[0] != 0 {
                        // this was not an escaped 0xff which is the only thing we accept at this part of the decoding.
                        //
                        // verify_reset_code should have gotten called in all instances where there should be a reset code,
                        // or at the end of the file we should have stopped decoding before we hit the end of file marker.
                        //
                        // Since we have no way of encoding these cases in our bitstream, we exit.
                        return Err(LeptonError::new(
                            ExitCode::InvalidResetCode,
                            format!(
                                "invalid reset {0:x} {1:x} code found in stream",
                                0xff, follow[0]
                            ),
                        )
                        .into());
                    }

                    0xff
                }
                Some(b) => {
                    self.inner.consume(1);
                    u64::from(b)
                }
            };

            self.bits = (self.bits << 8) | incoming;
            self.bits_left += 8;

            // otherwise continue, since more bits (possibly zero bits past the end) are needed
            if self.bits_left >= bits_to_read {
                return Ok(());
            }
        }
    }

    /// Tells whether the underlying stream ran out of data while the register was refilled.
    pub fn is_eof(&mut self) -> bool {
        self.eof
    }

    /// Consumes the unread bits of the current byte and verifies that this image uses
    /// fill bits consistently, i.e. either only 1s or only 0s.
    ///
    /// `pad_bit` carries the fill pattern seen so far: `None` while it is still unknown,
    /// `Some(0)` for zeros and `Some(0xff)` for ones. It is set the first time fill bits
    /// are encountered; afterwards any deviation results in an `InvalidPadding` error.
    /// Nothing is checked when no bits are pending or the end of the stream was reached.
    pub fn read_and_verify_fill_bits(
        &mut self,
        pad_bit: &mut Option<u8>,
    ) -> Result<(), LeptonError> {
        self.undo_read_ahead();

        if self.bits_left == 0 || self.eof {
            return Ok(());
        }

        // the bits that are left need to be either all 1s or all 0s
        let num_bits_to_read = self.bits_left;
        let actual = self.read(num_bits_to_read)?;
        let all_one: u16 = (1 << num_bits_to_read) - 1;

        if let Some(x) = *pad_bit {
            // if we already saw a padding, then it should match
            let expected = u16::from(x) & all_one;
            if actual != expected {
                return err_exit_code(
                    ExitCode::InvalidPadding,
                    format!(
                        "padding of {0} bits should be set to 1 actual={1:b} expected={2:b}",
                        num_bits_to_read, actual, expected
                    ),
                );
            }
        } else if actual == 0 {
            *pad_bit = Some(0);
        } else if actual == all_one {
            *pad_bit = Some(0xff);
        } else {
            return err_exit_code(
                ExitCode::InvalidPadding,
                format!(
                    "inconsistent pad bits num_bits={0} pattern={1:b}",
                    num_bits_to_read, actual
                ),
            );
        }

        Ok(())
    }

    /// Reads the next two bytes of the stream and verifies that they are the reset code
    /// that is due (0xff followed by RST0 plus the number of reset codes seen so far
    /// modulo 8). Afterwards the bit register starts from scratch.
    ///
    /// Returns an `InvalidResetCode` error if anything else is found.
    pub fn verify_reset_code(&mut self) -> Result<(), LeptonError> {
        // we reached the end of a MCU, so we need to find a reset code and the huffman codes start get padded out, but the spec
        // doesn't specify whether the padding should be 1s or 0s, so we ensure that at least the file is consistent so that we
        // can recode it again just by remembering the pad bit.
        self.undo_read_ahead();

        let mut found = [0u8; 2];
        self.inner.read_exact(&mut found)?;

        let expected = [0xff, jpeg_code::RST0 + (self.cpos as u8 & 7)];
        if found != expected {
            return err_exit_code(
                ExitCode::InvalidResetCode,
                format!(
                    "invalid reset code {0:x} {1:x} found in stream",
                    found[0], found[1]
                ),
            );
        }

        // start from scratch after RST
        self.bits_left = 0;
        self.bits = 0;
        self.cpos += 1;

        Ok(())
    }

    /// Reports how far into the current byte the reader is, together with the part of that
    /// byte that has already been consumed.
    ///
    /// Returns `(bits_already_read, byte_being_read)`:
    /// - `bits_already_read`: number of bits (0..=7) already read from the current byte
    /// - `byte_being_read`: the current byte with only its already-read (most significant)
    ///   bits kept and every bit that has not been read yet cleared
    ///
    /// When the reader sits on a byte boundary the result is `(0, 0)`.
    pub fn overhang(&mut self) -> (u8, u8) {
        self.undo_read_ahead();

        // position inside the current byte
        let consumed = ((64 - self.bits_left) & 7) as u8;

        // keep the top `consumed` bits of the byte and clear the rest
        let keep_top = !(0xffu8 >> consumed);

        (consumed, (self.bits as u8) & keep_top)
    }

    /// "Puts back" bytes that were read ahead into the bit register but not consumed yet.
    ///
    /// This avoids special-casing the read-ahead in many of the other operations that are
    /// not speed sensitive.
    ///
    /// After calling this method, `read_ahead_bytes` is guaranteed to be 0 and the only bits
    /// left in the register are part of the current byte.
    pub fn undo_read_ahead(&mut self) {
        // drop whole unconsumed read-ahead bytes from the register, one byte at a time
        while self.read_ahead_bytes > 0 && self.bits_left >= 8 {
            self.read_ahead_bytes -= 1;
            self.bits >>= 8;
            self.bits_left -= 8;
        }

        // read-ahead bytes that could not be handed back (presumably because some of their
        // bits were already used) are consumed from the underlying reader instead
        let still_ahead = self.read_ahead_bytes;
        if still_ahead > 0 {
            self.inner.consume(still_ahead as usize);
            self.read_ahead_bytes = 0;
        }
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use std::io::Cursor;

    // Reads a simple bit pattern that contains an escaped 0xff (0xff followed by 0x00),
    // checking the decoded value, the overhang and the stream position after each step.
    #[test]
    fn read_simple() {
        let arr = [0x12u8, 0x34, 0x45, 0x67, 0x89, 0xff, 00, 0xee];

        let mut b = BitReader::new(Cursor::new(&arr));

        // half way through the first byte: 4 bits read, high nibble of 0x12 is kept
        assert_eq!(1, b.read(4).unwrap());
        assert_eq!((4, 0x10), b.overhang());
        assert_eq!(0, b.stream_position());

        assert_eq!(2, b.read(4).unwrap());
        assert_eq!((0, 0), b.overhang()); // byte is aligned should be no overhang
        assert_eq!(1, b.stream_position());

        assert_eq!(3, b.read(4).unwrap());
        assert_eq!(4, b.read(4).unwrap());
        assert_eq!(4, b.read(4).unwrap());
        assert_eq!(0x56, b.read(8).unwrap()); // 8 bits straddling 0x45 and 0x67
        assert_eq!(0x78, b.read(8).unwrap());

        assert_eq!(0x9f, b.read(8).unwrap());
        assert_eq!((4, 0xf0), b.overhang());
        assert_eq!(5, b.stream_position()); // should be at the beginning of the escape code

        assert_eq!(0xfe, b.read(8).unwrap());
        assert_eq!((4, 0xe0), b.overhang());
        assert_eq!(7, b.stream_position()); // now we are after the escape code

        assert_eq!(0xe, b.read(4).unwrap());
        assert_eq!((0, 0), b.overhang());
        assert_eq!(8, b.stream_position()); // now we read everything and should be at the end of the stream

        // read an empty byte past the end of the stream: should be zero and trigger EOF
        assert_eq!(0, b.read(8).unwrap());
        assert_eq!(true, b.is_eof());
        assert_eq!(8, b.stream_position()); // still at the same position
    }

    // What happens when a file has 0xff as the last character (it is treated as an escaped 0xff).
    #[test]
    fn read_truncate_ff() {
        let arr = [0x12u8, 0xff];

        let mut b = BitReader::new(Cursor::new(&arr));

        assert_eq!(0, b.stream_position());

        assert_eq!(0x1, b.read(4).unwrap());
        assert_eq!(0, b.stream_position());

        assert_eq!(0x2f, b.read(8).unwrap());
        assert_eq!((4, 0xf0), b.overhang());
        assert_eq!(1, b.stream_position());

        // 4 bits left, not EOF yet
        assert_eq!(false, b.is_eof());

        assert_eq!(0xf, b.read(4).unwrap());
        assert_eq!(false, b.is_eof()); // every bit is consumed, but EOF is only reported once a read goes past the end
        assert_eq!(2, b.stream_position());

        // reading past the end yields zero bits and sets EOF
        assert_eq!(0, b.read(4).unwrap());
        assert_eq!(true, b.is_eof());
        assert_eq!(2, b.stream_position());
    }
}


================================================
FILE: lib/src/jpeg/bit_writer.rs
================================================
/*---------------------------------------------------------------------------------------------
 *  Copyright (c) Microsoft Corporation. All rights reserved.
 *  Licensed under the Apache License, Version 2.0. See LICENSE.txt in the project root for license information.
 *  This software incorporates material from third parties. See NOTICE.txt for details.
 *--------------------------------------------------------------------------------------------*/

use std::mem;

use crate::helpers::has_ff;

/// Writes variable-width bit fields for coding JPEG scan data.
///
/// Bits are collected most-significant-bit first in a 64 bit register and moved to the
/// byte buffer as whole bytes. Every `0xff` byte that leaves the register is escaped as
/// `[0xff, 0x00]`.
pub struct BitWriter {
    /// bytes that have already left the register (0xff bytes are already escaped)
    data_buffer: Vec<u8>,
    /// pending bits, filled from the most significant bit downwards
    fill_register: u64,
    /// number of bits in `fill_register` that are still free (64 means the register is empty)
    current_bit: u32,
}

// use to write varying sized bits for coding JPEG. Escapes 0xff -> [0xff,0]
impl BitWriter {
    /// Creates a writer that appends to `data_buffer`. Existing contents and capacity of
    /// the vector are kept.
    pub fn new(data_buffer: Vec<u8>) -> Self {
        BitWriter {
            data_buffer,
            fill_register: 0,
            current_bit: 64,
        }
    }

    /// Appends a single output byte, escaping `0xff` as `[0xff, 0x00]`.
    #[inline]
    fn push_escaped(data_buffer: &mut Vec<u8>, b: u8) {
        if b != 0xff {
            data_buffer.push(b);
        } else {
            // escape 0xff here to avoid multiple scans of the same data
            data_buffer.extend_from_slice(&[0xff, 0]);
        }
    }

    /// flushes whole bytes from the register into the data buffer
    fn flush_whole_bytes(&mut self) {
        // 56 or fewer free bits means at least one complete byte sits at the top of the register
        while self.current_bit <= 56 {
            let b = (self.fill_register >> 56) as u8;
            Self::push_escaped(&mut self.data_buffer, b);

            self.fill_register <<= 8;
            self.current_bit += 8;
        }
    }

    /// Appends `b` to the buffer as-is, without 0xff escaping.
    ///
    /// # Panics
    /// Panics if the register still holds bits (`current_bit != 64`), since the raw byte
    /// would otherwise end up in front of them.
    pub fn write_byte_unescaped(&mut self, b: u8) {
        assert!(self.current_bit == 64);
        self.data_buffer.push(b);
    }

    /// Writes the lowest `new_bits` bits of `val`, most significant bit first.
    ///
    /// `new_bits` may be anything from 0 to 32 and `val` must fit into that many bits
    /// (checked with a debug assertion). When the register overflows, its 64 bits are
    /// flushed to the buffer and the remaining low bits of `val` start a new register.
    #[inline(always)]
    pub fn write(&mut self, val: u32, new_bits: u32) {
        /// this is the slow path that is rarely called but generates a lot of code inlined
        /// so we move it out of the main function to keep the main function small with few branches.
        ///
        /// We also call this path when we are about to overflow the buffer to avoid having
        /// to inline the buffer growing logic, which is also much bigger than a simple insert.
        #[inline(never)]
        #[cold]
        fn write_ff_encoded(data_buffer: &mut Vec<u8>, fill_register: u64) {
            for b in fill_register.to_be_bytes() {
                BitWriter::push_escaped(data_buffer, b);
            }
        }

        // the check is done in 64 bits so that a full 32 bit write does not overflow the shift
        debug_assert!(
            new_bits <= 32 && (u64::from(val) >> new_bits) == 0,
            "value {0} should fit into the number of {1} bits provided",
            val,
            new_bits
        );

        // first see if everything fits in the current register
        if new_bits <= self.current_bit {
            // wrapping_shl supports the corner case where new_bits is zero and the register
            // is empty (shift by 64), we don't want to panic
            self.fill_register |= u64::from(val).wrapping_shl(self.current_bit - new_bits);
            self.current_bit -= new_bits;
        } else {
            // if not, fill up the register to the 64 bit boundary so we can flush it,
            // hopefully without any 0xff bytes
            let leftover_new_bits = new_bits - self.current_bit;
            let fill = self.fill_register | u64::from(val).wrapping_shr(leftover_new_bits);

            // low bits of val that did not fit; masked in 64 bits because leftover_new_bits
            // can be 32, which would overflow a 32 bit shift
            let leftover_val = u64::from(val) & ((1u64 << leftover_new_bits) - 1);

            // flush bytes slowly if we have any 0xff bytes or if we are about to overflow the buffer
            // (overflow check matches implementation in RawVec so that the optimizer can remove the buffer growing code)
            if has_ff(fill)
                || self
                    .data_buffer
                    .capacity()
                    .wrapping_sub(self.data_buffer.len())
                    < 8
            {
                write_ff_encoded(&mut self.data_buffer, fill);
            } else {
                self.data_buffer.extend_from_slice(&fill.to_be_bytes());
            }

            // leftover_new_bits is at least 1 here, so the shift is at most 63
            self.fill_register = leftover_val.wrapping_shl(64 - leftover_new_bits);
            self.current_bit = 64 - leftover_new_bits;
        }
    }

    /// Pads the output to the next byte boundary and flushes all whole bytes.
    ///
    /// The pad bits are taken from `fillbit` starting at its least significant bit, one
    /// bit of `fillbit` per pad bit written. Nothing is written if the output is already
    /// byte aligned.
    pub fn pad(&mut self, fillbit: u8) {
        // at most 7 pad bits are needed, so the mask never shifts out of the u8
        let mut offset: u8 = 1;
        while (self.current_bit & 7) != 0 {
            self.write(u32::from((fillbit & offset) != 0), 1);
            offset <<= 1;
        }

        self.flush_whole_bytes();

        debug_assert!(
            self.current_bit == 64,
            "there should be no remainder after padding"
        );
    }

    /// Flushes all whole bytes from the register (escaping 0xff) and returns the buffer,
    /// leaving an empty buffer behind. Bits that do not yet form a whole byte stay in
    /// the register.
    pub fn detach_buffer(&mut self) -> Vec<u8> {
        // flush any remaining whole bytes
        self.flush_whole_bytes();

        mem::take(&mut self.data_buffer)
    }

    /// Grows the buffer if needed so that its total capacity is at least `amount` bytes.
    ///
    /// Note that this is about total capacity, not about free space beyond what is
    /// already buffered.
    pub fn ensure_space(&mut self, amount: usize) {
        if self.data_buffer.capacity() < amount {
            // capacity >= len always holds, so amount > len here and the subtraction cannot underflow
            let len = self.data_buffer.len();
            self.data_buffer.reserve(amount - len);
        }
    }

    /// Clears the buffer and restarts the writer in the middle of a byte.
    ///
    /// `overhang_byte` becomes the top byte of the register and `num_bits` of it count as
    /// already written.
    pub fn reset_from_overhang_byte_and_num_bits(&mut self, overhang_byte: u8, num_bits: u32) {
        self.data_buffer.clear();

        self.fill_register = u64::from(overhang_byte) << 56;
        self.current_bit = 64 - num_bits;
    }

    /// Returns true if the register is empty, i.e. no bits are waiting to be flushed.
    pub fn has_no_remainder(&self) -> bool {
        self.current_bit == 64
    }

    /// Number of bytes currently held in the buffer (bits still in the register are not counted).
    pub fn amount_buffered(&self) -> usize {
        self.data_buffer.len()
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    use std::io::Cursor;

    use crate::helpers::u32_bit_length;
    use crate::jpeg::bit_reader::BitReader;

    // write a test pattern with an escape and see if it matches
    #[test]
    fn write_simple() {
        // expected output, including the [0xff, 0] escape sequence
        let arr = [0x12, 0x34, 0x45, 0x67, 0x89, 0xff, 00, 0xee];

        let mut b = BitWriter::new(Vec::with_capacity(1024));

        b.write(1, 4);
        b.write(2, 4);
        b.write(3, 4);
        b.write(4, 4);
        b.write(4, 4);
        b.write(0x56, 8);
        b.write(0x78, 8);
        b.write(0x9f, 8);
        b.write(0xfe, 8);
        b.write(0xe, 4);

        let w = b.detach_buffer();

        assert_eq!(w[..], arr);
    }

    // verify that the bits roundtrip correctly in a fairly simple scenario
    #[test]
    fn roundtrip_bits() {
        let buf;
        {
            // write every value in 1..2048 using exactly as many bits as the value needs
            let mut b = BitWriter::new(Vec::with_capacity(1024));
            for i in 1..2048 {
                b.write(i, u32_bit_length(i) as u32);
            }

            b.pad(0xff);

            buf = b.detach_buffer();
        }

        {
            // read the same sequence back and check the padding at the end
            let mut r = BitReader::new(Cursor::new(&buf));

            for i in 1..2048 {
                assert_eq!(i, r.read(u32_bit_length(i as u32) as u32).unwrap());
            }

            let mut pad = Some(0xff);
            r.read_and_verify_fill_bits(&mut pad).unwrap();
        }
    }

    /// verify that the bits roundtrip correctly with random bits
    #[test]
    fn roundtrip_randombits() {
        // one step of the generated test script
        #[derive(Copy, Clone)]
        enum Action {
            // write the value using the given number of bits
            Write(u16, u8),
            // pad to the next byte boundary using the given fill pattern
            Pad(u8),
        }

        use rand::Rng;

        const ITERATIONS: usize = 10000;

        // fixed seed so that the generated script is the same on every run
        let mut rng = crate::helpers::get_rand_from_seed([0u8; 32]);
        let mut test_data = Vec::with_capacity(ITERATIONS);

        for _ in 0..ITERATIONS {
            let bits = rng.gen_range(0..=16);

            // pick the kind of value: all zeros, all ones, or (half of the time) random
            let t = rng.gen_range(0..=3);
            let v = match t {
                0 => 0,
                1 => 0xffff,
                _ => rng.gen_range(0..=65535),
            };

            // trim the value so that it fits into the chosen number of bits
            let v = v & ((1 << bits) - 1);

            // about one in a hundred steps is a pad instead of a write
            if rng.gen_range(0..100) == 0 {
                test_data.push(Action::Pad(0xff));
            } else {
                test_data.push(Action::Write(v as u16, bits as u8));
            }
        }
        // always finish with a pad so that the output ends on a byte boundary
        test_data.push(Action::Pad(0xff));

        let buf;
        {
            // replay the script against the writer
            let mut b = BitWriter::new(Vec::with_capacity(1024));
            for &i in &test_data {
                match i {
                    Action::Write(v, bits) => b.write(v as u32, bits as u32),
                    Action::Pad(fill) => b.pad(fill),
                }
            }

            buf = b.detach_buffer();
        }

        {
            // replay the script against the reader and compare every step
            let mut r = BitReader::new(Cursor::new(&buf));

            for a in test_data {
                match a {
                    Action::Write(code, numbits) => {
                        // the code aligned to the top of a byte, which is what peek should show
                        let expected_peek_byte = if numbits < 8 {
                            (code << (8 - numbits)) as u8
                        } else {
                            (code >> (numbits - 8)) as u8
                        };

                        let (peekcode, peekbits) = r.peek();
                        // only compare the bits that are both available to peek and part of this code
                        let num_valid_bits = peekbits.min(8).min(u32::from(numbits));

                        // mask with the top num_valid_bits bits set
                        let mask = (0xff00 >> num_valid_bits) as u8;

                        assert_eq!(
                            expected_peek_byte & mask,
                            peekcode & mask,
                            "peek unexpected result"
                        );

                        assert_eq!(
                            code,
                            r.read(numbits as u32).unwrap(),
                            "read unexpected result"
                        );
                    }
                    Action::Pad(fill) => {
                        let mut pad = Some(fill);
                        r.read_and_verify_fill_bits(&mut pad).unwrap();
                    }
                }
            }
        }
    }
}


================================================
FILE: lib/src/jpeg/block_based_image.rs
================================================
/*---------------------------------------------------------------------------------------------
 *  Copyright (c) Microsoft Corporation. All rights reserved.
 *  Licensed under the Apache License, Version 2.0. See LICENSE.txt in the project root for license information.
 *  This software incorporates material from third parties. See NOTICE.txt for details.
 *--------------------------------------------------------------------------------------------*/

use crate::lepton_error::{Result, err_exit_code};
use bytemuck::{cast, cast_ref};
use log::info;
use wide::{CmpEq, i16x8};

use crate::ExitCode;
use crate::consts::ZIGZAG_TO_TRANSPOSED;

use super::jpeg_header::JpegHeader;

/// Holds the 8x8 coefficient blocks for a given component. Since we do multithreaded
/// encoding, an image may only hold a subset of the component's blocks (starting at
/// `dpos_offset`), but the partial images can be merged afterwards (see `merge`).
pub struct BlockBasedImage {
    /// width of the component in blocks (taken from the component's `bch`)
    block_width: u32,

    /// height of the whole component in blocks (taken from the component's `bcv`),
    /// even if this image only holds part of it
    original_height: u32,

    /// block position (dpos) within the component that `image[0]` corresponds to
    dpos_offset: u32,

    /// the blocks themselves, `image[i]` holds the block at dpos `dpos_offset + i`
    image: Vec<AlignedBlock>,
}

/// all-zero block that `get_block` hands out for positions that have not been written
static EMPTY: AlignedBlock = AlignedBlock { raw_data: [0; 64] };

impl BlockBasedImage {
    /// Creates an empty block image for `component` that covers the part of the image
    /// between the luma rows `luma_y_start` and `luma_y_end`.
    ///
    /// The component's total block count is scaled by the covered fraction of luma rows
    /// (rounded up) and that many blocks are reserved up front, so that writing blocks
    /// later never has to reallocate.
    ///
    /// Returns `ExitCode::OutOfMemory` if the reservation fails.
    pub fn new(
        jpeg_header: &JpegHeader,
        component: usize,
        luma_y_start: u32,
        luma_y_end: u32,
    ) -> Result<Self> {
        let info = &jpeg_header.cmp_info[component];
        let block_width = info.bch;
        let original_height = info.bcv;
        let max_size = block_width * original_height;

        // vertical block count of the first component, which the luma rows refer to
        let luma_bcv = jpeg_header.cmp_info[0].bcv;
        let covered_rows = u64::from(luma_y_end - luma_y_start);

        // share of the component's blocks that falls into the covered rows, rounded up
        let blocks_needed = (u64::from(max_size) * covered_rows + u64::from(luma_bcv - 1))
            / u64::from(luma_bcv);
        let image_capacity = usize::try_from(blocks_needed).unwrap();

        // position of the first covered block within the whole component
        let first_block = u64::from(max_size) * u64::from(luma_y_start) / u64::from(luma_bcv);
        let dpos_offset = u32::try_from(first_block).unwrap();

        let mut image = Vec::new();
        match image.try_reserve_exact(image_capacity) {
            Ok(()) => {}
            Err(e) => {
                // The uncompressed coefficient buffer is the most likely place to run out
                // of memory. Report it as an error, since the default out-of-memory
                // handler would kill the process.
                return err_exit_code(
                    ExitCode::OutOfMemory,
                    format!(
                        "failed to allocate block image of size {image_capacity} for component {component} with block width {block_width} and original height {original_height} (luma_y_start = {luma_y_start}, luma_y_end = {luma_y_end}) : {e}"
                    ),
                );
            }
        }

        Ok(Self {
            block_width,
            original_height,
            dpos_offset,
            image,
        })
    }

    /// Merges the block images that different threads produced for component `index`
    /// into a single image, as used by progressive decoding.
    ///
    /// The blocks are moved out of `images`, leaving the source images empty. The pieces
    /// must be in order and contiguous, and must agree on width and height.
    ///
    /// Returns `ExitCode::OutOfMemory` if the merged buffer cannot be allocated.
    ///
    /// # Panics
    /// Panics if `images` is empty, if a piece does not start where the previous one
    /// ended, or if the pieces disagree on `block_width` or `original_height`.
    pub fn merge(images: &mut Vec<Vec<BlockBasedImage>>, index: usize) -> Result<Self> {
        // total number of blocks, so that the capacity can be set exactly
        let total_size: usize = images
            .iter()
            .map(|per_thread| per_thread[index].image.len())
            .sum();

        let mut contents = Vec::new();
        match contents.try_reserve_exact(total_size) {
            Ok(()) => {}
            Err(e) => {
                // Same reasoning as in `new`: fail gracefully instead of letting the
                // default out-of-memory handler kill the process.
                return err_exit_code(
                    ExitCode::OutOfMemory,
                    format!("failed to allocate merged block image of size {total_size} : {e}"),
                );
            }
        }

        // (block_width, original_height) of the first piece, every other piece must match
        let mut dimensions: Option<(u32, u32)> = None;

        for per_thread in images.iter_mut() {
            let piece = &mut per_thread[index];

            assert!(
                piece.dpos_offset == contents.len() as u32,
                "previous content should match new content"
            );

            match dimensions {
                Some((width, height)) => {
                    assert_eq!(width, piece.block_width, "all block_width must match");
                    assert_eq!(
                        height, piece.original_height,
                        "all original_height must match"
                    );
                }
                None => dimensions = Some((piece.block_width, piece.original_height)),
            }

            contents.append(&mut piece.image);
        }

        let (block_width, original_height) = dimensions.unwrap();

        Ok(Self {
            block_width,
            original_height,
            dpos_offset: 0,
            image: contents,
        })
    }

    /// Logs the current length, capacity and dpos offset of the image (debugging aid).
    #[allow(dead_code)]
    pub fn dump(&self) {
        let size = self.image.len();
        let capacity = self.image.capacity();

        info!(
            "size = {0}, capacity = {1}, dpos_offset = {2}",
            size, capacity, self.dpos_offset
        );
    }

    /// Width of the component in blocks.
    pub fn get_block_width(&self) -> u32 {
        self.block_width
    }

    /// Height of the whole component in blocks.
    pub fn get_original_height(&self) -> u32 {
        self.original_height
    }

    /// Makes sure the image reaches up to and including `dpos`, filling any gap with blank
    /// blocks, and optionally stores `block_to_write` at that position.
    ///
    /// Returns a mutable reference to the block at `dpos`. If the block already exists and
    /// `block_to_write` is `None`, it is left untouched.
    ///
    /// # Panics
    /// Panics if `dpos` lies before `dpos_offset` or if the image would have to grow
    /// beyond the capacity that was reserved for it.
    #[inline(always)]
    pub fn fill_up_to_dpos(
        &mut self,
        dpos: u32,
        block_to_write: Option<AlignedBlock>,
    ) -> &mut AlignedBlock {
        // the very first write has to land exactly on dpos_offset
        if self.image.is_empty() {
            debug_assert!(self.dpos_offset == dpos);
        }

        // should never underflow otherwise we are writing to the wrong part of the image
        let relative_offset = (dpos as usize)
            .checked_sub(self.dpos_offset as usize)
            .unwrap();

        if relative_offset >= self.image.len() {
            // the image has to grow: pad the gap with zero blocks, then add the new one
            assert!(
                relative_offset < self.image.capacity(),
                "capacity should be set to the exact image size to avoid reallocations"
            );

            // optimizer realizes that this is memset
            self.image
                .resize_with(relative_offset, AlignedBlock::default);

            self.image.push(block_to_write.unwrap_or_default());
        } else if let Some(replacement) = block_to_write {
            // overwrite a block that was already written
            self.image[relative_offset] = replacement;
        }

        &mut self.image[relative_offset]
    }

    /// Stores `block_data` at `dpos`, growing the image with blank blocks if necessary.
    pub fn set_block_data(&mut self, dpos: u32, block_data: AlignedBlock) {
        self.fill_up_to_dpos(dpos, Some(block_data));
    }

    /// Returns the block at `dpos`, or an all-zero block if nothing has been written
    /// at that position yet.
    pub fn get_block(&self, dpos: u32) -> &AlignedBlock {
        let relative_offset = (dpos - self.dpos_offset) as usize;

        self.image.get(relative_offset).unwrap_or(&EMPTY)
    }

    /// Appends `block` directly after the last block of the image.
    ///
    /// # Panics
    /// Panics if the reserved capacity is already used up.
    #[inline(always)]
    pub fn append_block(&mut self, block: AlignedBlock) {
        let has_room = self.image.len() < self.image.capacity();
        assert!(has_room, "capacity should be set correctly");

        self.image.push(block);
    }

    /// Returns a mutable reference to the block at `dpos`, creating it (and any blank
    /// blocks before it) if it does not exist yet.
    #[inline(always)]
    pub fn get_block_mut(&mut self, dpos: u32) -> &mut AlignedBlock {
        self.fill_up_to_dpos(dpos, None)
    }
}

/// Block of 64 coefficients, aligned to 32 bytes so that it can be viewed as eight
/// `i16x8` rows (see `as_i16x8`).
///
/// The coefficients are kept in the "transposed" order that `zigzag_to_transposed`
/// produces: the DC coefficient is at index 0 (see `get_dc`), and the 7x7 block that
/// `get_count_of_non_zeros_7x7` looks at is made of lanes 1..8 of rows 1..8.
#[repr(C, align(32))]
pub struct AlignedBlock {
    /// the 64 coefficients in storage order
    raw_data: [i16; 64],
}

/// A block with every coefficient set to zero.
pub static EMPTY_BLOCK: AlignedBlock = AlignedBlock { raw_data: [0; 64] };

impl Default for AlignedBlock {
    /// Creates a block with every coefficient set to zero.
    fn default() -> Self {
        Self { raw_data: [0; 64] }
    }
}

impl AlignedBlock {
    /// Wraps 64 coefficients that are already in the block's storage order.
    #[inline(always)]
    pub fn new(block: [i16; 64]) -> Self {
        Self { raw_data: block }
    }

    /// Returns row `index` (0..8) of the block, viewed as eight vectors of eight coefficients.
    #[inline(always)]
    pub fn as_i16x8(&self, index: usize) -> i16x8 {
        let rows: &[i16x8; 8] = cast_ref(&self.raw_data);
        rows[index]
    }

    /// Returns a copy of the block with rows and columns swapped.
    #[allow(dead_code)]
    #[inline(always)]
    pub fn transpose(&self) -> AlignedBlock {
        let rows: [i16x8; 8] = cast(self.raw_data);
        Self::new(cast(i16x8::transpose(rows)))
    }

    /// Returns the DC coefficient, which is stored at index 0.
    #[inline(always)]
    pub fn get_dc(&self) -> i16 {
        self.raw_data[0]
    }

    /// Sets the DC coefficient, which is stored at index 0.
    #[inline(always)]
    pub fn set_dc(&mut self, value: i16) {
        self.raw_data[0] = value;
    }

    /// Builds a block from coefficients in zigzag order, rearranging them into the
    /// transposed storage order.
    #[inline(always)]
    pub fn zigzag_to_transposed(a: [i16; 64]) -> AlignedBlock {
        // entry i names the zigzag position whose coefficient ends up at storage index i
        let raw_data = [
            a[0], a[2], a[3], a[9], a[10], a[20], a[21], a[35], a[1], a[4], a[8], a[11], a[19],
            a[22], a[34], a[36], a[5], a[7], a[12], a[18], a[23], a[33], a[37], a[48], a[6],
            a[13], a[17], a[24], a[32], a[38], a[47], a[49], a[14], a[16], a[25], a[31], a[39],
            a[46], a[50], a[57], a[15], a[26], a[30], a[40], a[45], a[51], a[56], a[58], a[27],
            a[29], a[41], a[44], a[52], a[55], a[59], a[62], a[28], a[42], a[43], a[53], a[54],
            a[60], a[61], a[63],
        ];

        AlignedBlock { raw_data }
    }

    /// Returns a block holding the same coefficients rearranged from the transposed
    /// storage order back into zigzag order.
    #[inline(always)]
    pub fn zigzag_from_transposed(&self) -> AlignedBlock {
        let a = self.raw_data;

        // entry i names the storage index whose coefficient ends up at zigzag position i
        let raw_data = [
            a[0], a[8], a[1], a[2], a[9], a[16], a[24], a[17], a[10], a[3], a[4], a[11], a[18],
            a[25], a[32], a[40], a[33], a[26], a[19], a[12], a[5], a[6], a[13], a[20], a[27],
            a[34], a[41], a[48], a[56], a[49], a[42], a[35], a[28], a[21], a[14], a[7], a[15],
            a[22], a[29], a[36], a[43], a[50], a[57], a[58], a[51], a[44], a[37], a[30], a[23],
            a[31], a[38], a[45], a[52], a[59], a[60], a[53], a[46], a[39], a[47], a[54], a[61],
            a[62], a[55], a[63],
        ];

        AlignedBlock { raw_data }
    }

    /// Read access to the raw coefficients in storage order.
    #[inline(always)]
    pub fn get_block(&self) -> &[i16; 64] {
        &self.raw_data
    }

    /// Write access to the raw coefficients in storage order.
    #[inline(always)]
    pub fn get_block_mut(&mut self) -> &mut [i16; 64] {
        &mut self.raw_data
    }

    /// Sum of all coefficients, used as a cheap fingerprint while debugging.
    #[allow(dead_code)]
    pub fn get_hash(&self) -> i32 {
        self.raw_data.iter().map(|&c| c as i32).sum()
    }

    /// Counts the non-zero coefficients in the 7x7 part of the block, i.e. everything
    /// except row 0 and lane 0 of each row.
    #[inline(always)]
    pub fn get_count_of_non_zeros_7x7(&self) -> u8 {
        /// yields 1 in every lane (other than lane 0) that holds a non-zero value
        #[inline(always)]
        fn nonzero_flags(row: i16x8) -> i16x8 {
            let skip_lane_0 = i16x8::new([0, 1, 1, 1, 1, 1, 1, 1]);
            !row.simd_eq(i16x8::ZERO) & skip_lane_0
        }

        // accumulate per lane first and add the lanes up only once at the end
        let mut per_lane = i16x8::ZERO;
        for row in 1..8 {
            per_lane += nonzero_flags(self.as_i16x8(row));
        }

        per_lane.reduce_add() as u8
    }

    /// Returns the coefficient at storage index `index`.
    #[inline(always)]
    pub fn get_coefficient(&self, index: usize) -> i16 {
        self.raw_data[index]
    }

    /// Sets the coefficient at storage index `index`.
    #[inline(always)]
    pub fn set_coefficient(&mut self, index: usize, v: i16) {
        self.raw_data[index] = v;
    }

    /// Sets the coefficient that has zigzag position `index`.
    #[inline(always)]
    pub fn set_transposed_from_zigzag(&mut self, index: usize, v: i16) {
        let storage_index = usize::from(ZIGZAG_TO_TRANSPOSED[index]);
        self.raw_data[storage_index] = v;
    }

    /// Returns the coefficient that has zigzag position `index`.
    #[inline(always)]
    pub fn get_transposed_from_zigzag(&self, index: usize) -> i16 {
        let storage_index = usize::from(ZIGZAG_TO_TRANSPOSED[index]);
        self.raw_data[storage_index]
    }

    /// Gathers eight coefficients starting at `offset` that are `stride` apart.
    #[inline(always)]
    pub fn from_stride(&self, offset: usize, stride: usize) -> i16x8 {
        let at = |lane: usize| self.raw_data[offset + (lane * stride)];

        i16x8::new([at(0), at(1), at(2), at(3), at(4), at(5), at(6), at(7)])
    }
}


================================================
FILE: lib/src/jpeg/component_info.rs
================================================
/*---------------------------------------------------------------------------------------------
 *  Copyright (c) Microsoft Corporation. All rights reserved.
 *  Licensed under the Apache License, Version 2.0. See LICENSE.txt in the project root for license information.
 *  This software incorporates material from third parties. See NOTICE.txt for details.
 *--------------------------------------------------------------------------------------------*/

/// Table selections, sampling factors and block counts of a single image component.
#[derive(Debug, Clone)]
pub struct ComponentInfo {
    /// index of the quantization table
    pub q_table_index: u8,

    /// number of the huffman table used for DC
    pub huff_dc: u8,

    /// number of the huffman table used for AC
    pub huff_ac: u8,

    /// vertical sample factor
    pub sfv: u32,

    /// horizontal sample factor
    pub sfh: u32,

    /// number of blocks in an mcu
    pub mbs: u32,

    /// vertical block count (interleaved)
    pub bcv: u32,

    /// horizontal block count (interleaved)
    pub bch: u32,

    /// total block count (interleaved)
    pub bc: u32,

    /// vertical block count (non interleaved)
    pub ncv: u32,

    /// horizontal block count (non interleaved)
    pub nch: u32,

    /// total block count (non interleaved)
    pub nc: u32,

    /// statistical identity
    pub sid: u32,

    /// jpeg internal id
    pub jid: u8,
}

impl Default for ComponentInfo {
    /// Sets every field to the maximum value of its type (presumably marking the field
    /// as not populated yet).
    fn default() -> Self {
        Self {
            q_table_index: u8::MAX,
            huff_dc: u8::MAX,
            huff_ac: u8::MAX,
            sfv: u32::MAX,
            sfh: u32::MAX,
            mbs: u32::MAX,
            bcv: u32::MAX,
            bch: u32::MAX,
            bc: u32::MAX,
            ncv: u32::MAX,
            nch: u32::MAX,
            nc: u32::MAX,
            sid: u32::MAX,
            jid: u8::MAX,
        }
    }
}


================================================
FILE: lib/src/jpeg/jpeg_code.rs
================================================
/*---------------------------------------------------------------------------------------------
 *  Copyright (c) Microsoft Corporation. All rights reserved.
 *  Licensed under the Apache License, Version 2.0. See LICENSE.txt in the project root for license information.
 *  This software incorporates material from third parties. See NOTICE.txt for details.
 *--------------------------------------------------------------------------------------------*/

/// Start of Frame (size information), coding process: baseline DCT
pub const SOF0: u8 = 0xC0;

/// Start of Frame (size information), coding process: extended sequential DCT
pub const SOF1: u8 = 0xC1;

/// Start of Frame (size information), coding process: progressive DCT
pub const SOF2: u8 = 0xC2;

/// Define Huffman Table
pub const DHT: u8 = 0xC4;

/// Restart marker 0. A reset code in the scan data consists of `0xff` followed by
/// `RST0 + n`, where `n` cycles through `0..=7`.
pub const RST0: u8 = 0xD0;

/// Start of Image
pub const SOI: u8 = 0xD8;

/// End of Image, or End of File
pub const EOI: u8 = 0xD9;

/// Start of Scan
pub const SOS: u8 = 0xDA;

/// Define Quantization Table
pub const DQT: u8 = 0xDB;

/// Define Restart Interval
pub const DRI: u8 = 0xDD;


================================================
FILE: lib/src/jpeg/jpeg_header.rs
================================================
/*---------------------------------------------------------------------------------------------
 *  Copyright (c) Microsoft Corporation. All rights reserved.
 *  Licensed under the Apache License, Version 2.0. See LICENSE.txt in the project root for license information.
 *  This software incorporates material from third parties. See NOTICE.txt for details.
 *--------------------------------------------------------------------------------------------*/

/*
Copyright (c) 2006...2016, Matthias Stirner and HTW Aalen University
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:

1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.

2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

use std::fmt::Debug;
use std::io::{Cursor, Read, Write};
use std::num::NonZeroU32;

use crate::LeptonError;
use crate::consts::JpegType;
use crate::enabled_features::EnabledFeatures;
use crate::helpers::*;
use crate::lepton_error::{AddContext, ExitCode, Result, err_exit_code};

use super::component_info::ComponentInfo;
use super::jpeg_code;
use super::truncate_components::TruncateComponents;

/// Information required to partition the coding of the JPEG huffman encoded stream of a
/// scan at an arbitrary location in the stream.
///
/// Note that this only works for sequential JPEGs since progressive ones have multiple scans
/// that each process the entire image.
#[derive(Debug, Default, Clone)]
pub struct RestartSegmentCodingInfo {
    /// the byte that was only partially coded at the split point
    pub overhang_byte: u8,

    /// how many bits of `overhang_byte` had already been coded
    pub num_overhang_bits: u8,

    /// first luma block row of the segment
    pub luma_y_start: u32,

    /// luma block row just past the end of the segment
    pub luma_y_end: u32,

    /// presumably the last DC value of each component at the split point (used as the
    /// predictor when coding resumes) - confirm against the callers
    pub last_dc: [i16; 4],
}

impl RestartSegmentCodingInfo {
    /// Captures the coding state at `mcu` and derives the luma rows that belong to it.
    ///
    /// The MCU's row is `mcu / jf.mcuh`. Every MCU row spans `cmp_info[0].bcv / jf.mcuv`
    /// luma block rows, so the resulting range covers exactly one MCU row.
    pub fn new(
        overhang_byte: u8,
        num_overhang_bits: u8,
        last_dc: [i16; 4],
        mcu: u32,
        jf: &JpegHeader,
    ) -> Self {
        let mcu_row = mcu / jf.mcuh;
        let luma_rows_per_mcu_row = jf.cmp_info[0].bcv / jf.mcuv;

        let luma_y_start = luma_rows_per_mcu_row * mcu_row;
        let luma_y_end = luma_rows_per_mcu_row * (mcu_row + 1);

        RestartSegmentCodingInfo {
            overhang_byte,
            num_overhang_bits,
            luma_y_start,
            luma_y_end,
            last_dc,
        }
    }
}

/// Global information required to reconstruct the JPEG exactly the way that it was, especially
/// regarding information about possible truncation and RST markers.
#[derive(Default, Clone, Debug)]
pub struct ReconstructionInfo {
    /// The maximum component in a truncated progressive image.
    ///
    /// This is meant to be used for progressive images but is not yet implemented.
    pub max_cmp: u32,

    /// The maximum band in a truncated progressive image.
    ///
    /// This is meant to be used for progressive images but is not yet implemented.
    pub max_bpos: u32,

    /// The maximum bit in a truncated progressive image.
    ///
    /// This is meant to be used for progressive images but is not yet implemented.
    pub max_sah: u8,

    /// The maximum dpos in a truncated image (one entry per component slot).
    pub max_dpos: [u32; 4],

    /// True if we encountered EOF before the expected end of the image.
    pub early_eof_encountered: bool,

    /// The mask for padding out the bitstream when we get to the end of a reset block.
    pub pad_bit: Option<u8>,

    /// A list containing one entry for each scan segment. Each entry contains the number of restart intervals
    /// within the corresponding scan segment.
    ///
    /// TODO: We currently don't generate this value when we parse a JPEG (leaving rst_cnt_set as false), however when
    /// we read a Lepton file we will use this to determine whether we should generate restart markers in order
    /// to maintain backward compatibility for decoding Lepton files generated by the C++ version.
    ///
    /// This means that there might be some files that we could have encoded successfully that we don't, but since
    /// we are required to reverify anyway, this is not a problem (except for a minor efficiency issue).
    pub rst_cnt: Vec<u32>,

    /// True if rst_cnt contains a valid set of counts.
    pub rst_cnt_set: bool,

    /// Information about how to truncate the image if it was partially written.
    pub truncate_components: TruncateComponents,

    /// Trailing RST marking information.
    pub rst_err: Vec<u8>,

    /// Raw JPEG header to be written back to the file when it is recreated.
    pub raw_jpeg_header: Vec<u8>,

    /// Garbage data (the default value - an empty vector - means no garbage data).
    pub garbage_data: Vec<u8>,
}

/// Parses the next run of JPEG header segments from `reader` into `jpeg_header`, and
/// records the raw bytes that were consumed in `rinfo.raw_jpeg_header`.
///
/// Returns the result of `JpegHeader::parse`: `Ok(true)` if a scan follows, `Ok(false)`
/// if the end of the image was reached. In the `false` case the last two bytes that were
/// read are not recorded, and nothing is recorded if two bytes or fewer were read.
///
/// # Errors
///
/// Propagates any error reported by `JpegHeader::parse`; `rinfo` is left untouched then.
pub fn parse_jpeg_header<R: Read>(
    reader: &mut R,
    enabled_features: &EnabledFeatures,
    jpeg_header: &mut JpegHeader,
    rinfo: &mut ReconstructionInfo,
) -> Result<bool> {
    // The raw header in the lepton file can actually be spread across different sections
    // separated by the Start-of-Scan marker. Everything the parser reads goes through the
    // mirror, so we end up with a copy of whatever was consumed until we hit the SOS.
    let mut mirrored = Vec::new();
    let mut sink = Cursor::new(&mut mirrored);
    let mut tee = Mirror::new(reader, &mut sink);

    let scan_follows = jpeg_header.parse(&mut tee, enabled_features).context()?;

    if scan_follows {
        // not the end of the file, so the whole header is kept
        rinfo.raw_jpeg_header.append(&mut mirrored);
    } else if mirrored.len() > 2 {
        // More than 2 bytes means there was a trailing header, so keep that around as well,
        // but we don't want the EOI since that goes into the garbage data.
        let without_eoi = mirrored.len() - 2;
        rinfo
            .raw_jpeg_header
            .extend_from_slice(&mirrored[..without_eoi]);
    }

    Ok(scan_follows)
}

/// Internal utility that collects a copy of everything that is read through it, so the
/// header bytes we parse are available later.
struct Mirror<'a, R, W> {
    /// source the data is actually read from
    read: &'a mut R,
    /// receives a copy of every byte returned by `read`
    output: &'a mut W,
    /// running total of the bytes copied to `output`
    amount_written: usize,
}

impl<'a, R, W> Mirror<'a, R, W> {
    /// Wraps `read` so that all data read from it is also written to `output`.
    pub fn new(read: &'a mut R, output: &'a mut W) -> Self {
        Self {
            amount_written: 0,
            read,
            output,
        }
    }
}

impl<R: Read, W: Write> Read for Mirror<'_, R, W> {
    /// Reads from the wrapped reader, then writes exactly the bytes that were read to the
    /// output before reporting them to the caller.
    fn read(&mut self, buf: &mut [u8]) -> std::io::Result<usize> {
        self.read.read(buf).and_then(|count| {
            self.output.write_all(&buf[..count])?;
            self.amount_written += count;
            Ok(count)
        })
    }
}

/// Huffman codes of one table, indexed by symbol, plus lookup tables for fast encoding.
#[derive(Debug, Clone, Copy)]
pub(crate) struct HuffCodes {
    /// code value assigned to each symbol
    pub c_val: [u16; 256],
    /// code length in bits of each symbol (0 if the symbol has no code)
    pub c_len: [u16; 256],
    /// `c_len[i]` plus the low four bits of the symbol `i`
    pub c_len_plus_s: [u8; 256],
    /// `c_val[i]` shifted left by the low four bits of `i`; the entries at `i + 256`
    /// additionally have all the shifted-in bits set
    pub c_val_shift_s: [u32; 512],
    /// largest end-of-block run that the codes in this table are able to express
    pub max_eob_run: u16,
}

impl Default for HuffCodes {
    /// An empty table: no symbol has a code and every lookup entry is zero.
    fn default() -> Self {
        Self {
            c_val: [0u16; 256],
            c_len: [0u16; 256],
            c_len_plus_s: [0u8; 256],
            c_val_shift_s: [0u32; 512],
            max_eob_run: 0,
        }
    }
}

impl HuffCodes {
    /// Constructs from the format encoded by JPEG
    ///
    /// `segment` starts with a 16 byte table holding the number of codes for each bit
    /// length (1 through 16), followed by the symbols of those codes appended together.
    /// Code values are handed out in increasing order and are doubled every time the
    /// length grows by one bit.
    ///
    /// # Errors
    ///
    /// Fails if `ensure_space` rejects one of the reads from `segment`, or with
    /// `UnsupportedJpeg` if a length has more codes than fit into that many bits or the
    /// code value reaches 65535.
    pub fn construct_from_segment(segment: &[u8]) -> Result<Self> {
        const COUNTS_OFFSET: usize = 0;
        const SYMBOLS_OFFSET: usize = 16;

        let mut codes = HuffCodes::default();

        // position of the next symbol byte, and the code value it is going to receive
        let mut symbol_index: usize = 0;
        let mut next_code: u16 = 0;

        for length_index in 0..16usize {
            ensure_space(segment, COUNTS_OFFSET, length_index + 1).context()?;

            let code_len = (length_index + 1) as u16;
            let codes_of_this_length = segment[COUNTS_OFFSET + length_index];

            for _ in 0..codes_of_this_length {
                ensure_space(segment, SYMBOLS_OFFSET, symbol_index + 1).context()?;

                if u32::from(next_code) >= (1u32 << code_len) {
                    return err_exit_code(
                        ExitCode::UnsupportedJpeg,
                        "invalid huffman code layout, too many codes for a given length",
                    );
                }

                // the symbol byte selects which table entry receives this code
                let symbol = usize::from(segment[SYMBOLS_OFFSET + (symbol_index & 0xff)]);
                codes.c_len[symbol] = code_len;
                codes.c_val[symbol] = next_code;

                if next_code == 65535 {
                    return err_exit_code(ExitCode::UnsupportedJpeg, "huffman code too large");
                }

                symbol_index += 1;
                next_code += 1;
            }

            // codes of the next length are one bit longer
            next_code <<= 1;
        }

        codes.post_initialize();

        Ok(codes)
    }

    /// Code to run after initializing c_len and c_val
    ///
    /// Fills in the lookup tables used for fast encoding, since we already know the
    /// length and the value of the code when we write the code + bits to the bitstream,
    /// and works out `max_eob_run`.
    fn post_initialize(&mut self) {
        for symbol in 0..256usize {
            // the low four bits of the symbol are the number of bits appended to the code
            let extra_bits = symbol & 0xf;
            let shifted_code = u32::from(self.c_val[symbol]) << extra_bits;

            self.c_len_plus_s[symbol] = (self.c_len[symbol] + (extra_bits as u16)) as u8;
            self.c_val_shift_s[symbol] = shifted_code;

            // value for negative coefficients, which compensates for the sign bit
            self.c_val_shift_s[symbol + 256] = shifted_code | ((1u32 << extra_bits) - 1);
        }

        // Find out eobrun (runs of all zero blocks) max value. This is used when
        // encoding/decoding progressive files.
        //
        // G.1.2.2 of the spec specifies that there are 15 huffman codes
        // reserved for encoding long runs of up to 32767 empty blocks.
        // Here we figure out what the largest code that could possibly
        // be encoded by this table is so that we don't exceed it when
        // we reencode the file.
        self.max_eob_run = (0..=14u32)
            .rev()
            .find(|&run_bits| self.c_len[(run_bits << 4) as usize] > 0)
            .map_or(0, |run_bits| ((2u32 << run_bits) - 1) as u16);
    }
}

/// Decoding tree of one Huffman table together with a byte-wide shortcut table.
#[derive(Debug, Clone, Copy)]
pub(crate) struct HuffTree {
    /// Links of every tree node, indexed by the next bit of the code. `construct_hufftree`
    /// stores `symbol + 256` for a leaf and 0xffff for a link that no code uses.
    pub node: [[u16; 2]; 256],
    /// For each possible next byte of the stream: the decoded symbol and its code length,
    /// or `(0, 0xff)` if no code is resolved within those 8 bits.
    pub peek_code: [(u8, u8); 256],
}

impl Default for HuffTree {
    /// A tree with all links and all peek entries zeroed.
    fn default() -> Self {
        Self {
            node: [[0u16; 2]; 256],
            peek_code: [(0u8, 0u8); 256],
        }
    }
}

impl HuffTree {
    /// Constructs the huffman decoding tree, and the peek table, from the codes in `hc`.
    ///
    /// # Errors
    ///
    /// Returns `UnsupportedJpeg` if a code runs into a node index beyond the 256 available
    /// nodes, unless `accept_invalid_dht` is set, in which case such codes are skipped.
    pub fn construct_hufftree(hc: &HuffCodes, accept_invalid_dht: bool) -> Result<Self> {
        let mut tree = HuffTree::default();

        // initial value for next free place (node 0 is the root)
        let mut next_free: u16 = 1;

        // work through every code creating links between the nodes (represented through ints)
        for symbol in 0..256usize {
            let code_len = hc.c_len[symbol];
            let code = hc.c_val[symbol];

            // every code starts out at the root
            let mut current: usize = 0;

            // follow the path of the code, from its top bit down to bit 1, creating the
            // nodes that do not exist yet
            for bit in (1..code_len).rev() {
                if current > 0xff {
                    // we accept any .lep file that was encoded this way
                    if !accept_invalid_dht {
                        return err_exit_code(
                            ExitCode::UnsupportedJpeg,
                            "Huffman table out of space",
                        );
                    }
                    continue;
                }

                let branch = usize::from(bitn(code, bit) == 1);
                if tree.node[current][branch] == 0 {
                    tree.node[current][branch] = next_free;
                    next_free += 1;
                }

                current = usize::from(tree.node[current][branch]);
            }

            if current > 0xff {
                // we accept any .lep file that was encoded this way
                if !accept_invalid_dht {
                    return err_exit_code(ExitCode::UnsupportedJpeg, "Huffman table out of space");
                }
            } else if code_len > 0 {
                // last link is number of targetvalue + 256
                let branch = usize::from(bitn(code, 0) == 1);
                tree.node[current][branch] = (symbol + 256) as u16;
            }
        }

        // For every illegal code node, store 0xffff. We should never get there, but it will
        // avoid an infinite loop in the case of a bug.
        for links in tree.node.iter_mut() {
            for link in links.iter_mut() {
                if *link == 0 {
                    *link = 0xffff;
                }
            }
        }

        // Precalculate decoding by peeking into the stream. This lets us quickly decode
        // small codes without jumping through the node table.
        for peekbyte in 0..256usize {
            let mut node: u16 = 0;
            let mut len: u8 = 0;

            // walk down the tree using the bits of the byte, most significant bit first
            while len < 8 && node < 256 {
                let bit = (peekbyte >> (7 - len)) & 0x1;
                node = tree.node[usize::from(node)][bit];
                len += 1;
            }

            tree.peek_code[peekbyte] = if node >= 256 && node != 0xffff {
                ((node - 256) as u8, len)
            } else {
                // invalid code or code was too long to fit, so just say it requires 255 bits
                // so we will take the long path to decode it
                (0, 0xff)
            };
        }

        Ok(tree)
    }
}

/// JPEG information parsed out of segments found before the image segment
#[derive(Clone)]
pub struct JpegHeader {
    /// quantization tables 4 x 64
    pub q_tables: [[u16; 64]; 4],

    /// huffman codes (access via get_huff_xx_codes); the first index is 0 for the DC
    /// tables and 1 for the AC tables, the second is the table number
    h_codes: [[HuffCodes; 4]; 2],

    /// huffman decoding trees (access via get_huff_xx_tree), indexed like `h_codes`
    h_trees: [[HuffTree; 4]; 2],

    /// 1 if huffman table is set, indexed like `h_codes`
    ht_set: [[u8; 4]; 2],

    /// components
    pub cmp_info: [ComponentInfo; 4],

    /// component count
    pub cmpc: usize,

    /// width of image
    pub img_width: u32,

    /// height of image
    pub img_height: u32,

    /// coding process of the image (stays `Unknown` until a supported SOF segment is parsed)
    pub jpeg_type: JpegType,

    /// max horizontal sample factor
    ///
    /// NOTE(review): this is the maximum of `cmp_info[..].sfh`, and `parse` divides the image
    /// *height* by `8 * sfhm` - confirm which direction the name refers to.
    pub sfhm: u32,

    /// max vertical sample factor
    ///
    /// NOTE(review): this is the maximum of `cmp_info[..].sfv`, and `parse` divides the image
    /// *width* by `8 * sfvm` - confirm which direction the name refers to.
    pub sfvm: u32,

    /// number of MCU rows, i.e. mcus per column (`parse` derives this from the image height)
    pub mcuv: NonZeroU32,

    /// mcus per line (`parse` derives this from the image width)
    pub mcuh: NonZeroU32,

    /// count of mcus (`mcuv * mcuh`)
    pub mcuc: u32,

    /// restart interval
    pub rsti: u32,

    /// component count in current scan
    pub cs_cmpc: usize,

    /// component numbers in current scan (indexes into `cmp_info`)
    pub cs_cmp: [usize; 4],

    // variables: info about current scan
    /// begin - band of current scan ( inclusive )
    pub cs_from: u8,

    /// end - band of current scan ( inclusive )
    pub cs_to: u8,

    /// successive approximation bit pos high
    pub cs_sah: u8,

    /// successive approximation bit pos low
    pub cs_sal: u8,
}

impl std::fmt::Debug for JpegHeader {
    /// Custom debug implementation that leaves out the large arrays (quantization and
    /// huffman tables) and prints the remaining fields.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let mut out = f.debug_struct("JpegHeader");

        // image wide information
        out.field("cmp_info", &self.cmp_info);
        out.field("cmpc", &self.cmpc);
        out.field("img_width", &self.img_width);
        out.field("img_height", &self.img_height);
        out.field("jpeg_type", &self.jpeg_type);
        out.field("sfhm", &self.sfhm);
        out.field("sfvm", &self.sfvm);
        out.field("mcuv", &self.mcuv);
        out.field("mcuh", &self.mcuh);
        out.field("mcuc", &self.mcuc);
        out.field("rsti", &self.rsti);

        // information about the current scan
        out.field("cs_cmpc", &self.cs_cmpc);
        out.field("cs_cmp", &self.cs_cmp);
        out.field("cs_from", &self.cs_from);
        out.field("cs_to", &self.cs_to);
        out.field("cs_sah", &self.cs_sah);
        out.field("cs_sal", &self.cs_sal);

        out.finish()
    }
}

/// Outcome of parsing a single segment in `parse_next_segment`, used by `parse` to decide
/// whether it should keep reading segments.
enum ParseSegmentResult {
    /// neither of the cases below; `parse` goes on to the next segment
    Continue,
    /// the end-of-image marker was read, or the reader had no more data
    EOI,
    /// a start-of-scan segment was parsed; `parse` stops reading segments
    SOS,
}

impl Default for JpegHeader {
    /// An empty header: no tables set, no components, unknown JPEG type, and the MCU
    /// dimensions at their smallest non-zero value.
    fn default() -> Self {
        Self {
            // tables
            q_tables: [[0; 64]; 4],
            h_codes: [[HuffCodes::default(); 4]; 2],
            h_trees: [[HuffTree::default(); 4]; 2],
            ht_set: [[0; 4]; 2],

            // image wide information
            cmp_info: std::array::from_fn(|_| ComponentInfo::default()),
            cmpc: 0,
            img_width: 0,
            img_height: 0,
            jpeg_type: JpegType::Unknown,
            sfhm: 0,
            sfvm: 0,
            mcuv: NonZeroU32::MIN,
            mcuh: NonZeroU32::MIN,
            mcuc: 0,
            rsti: 0,

            // information about the current scan
            cs_cmpc: 0,
            cs_cmp: [0; 4],
            cs_from: 0,
            cs_to: 0,
            cs_sah: 0,
            cs_sal: 0,
        }
    }
}

impl JpegHeader {
    /// Returns true if this image consists of a single scan, which can be partitioned and
    /// decoded completely independently by separate threads. If this is not the case, then
    /// we need to decode the entire image in memory and then encode the JPEG sequentially.
    ///
    /// That holds for a sequential JPEG whose current scan contains all of its components.
    ///
    /// # Panics
    ///
    /// Panics if the JPEG type is still `Unknown`.
    pub fn is_single_scan(&self) -> bool {
        assert!(self.jpeg_type != JpegType::Unknown);

        if self.jpeg_type != JpegType::Sequential {
            return false;
        }

        self.cmpc == self.cs_cmpc
    }

    /// Huffman codes of the DC table that component `cmp` selects through its `huff_dc` index.
    #[inline(always)]
    pub(super) fn get_huff_dc_codes(&self, cmp: usize) -> &HuffCodes {
        let table = usize::from(self.cmp_info[cmp].huff_dc);
        let dc_codes = &self.h_codes[0];
        &dc_codes[table]
    }

    /// Huffman decoding tree of the DC table that component `cmp` selects through its
    /// `huff_dc` index.
    #[inline(always)]
    pub(super) fn get_huff_dc_tree(&self, cmp: usize) -> &HuffTree {
        let table = usize::from(self.cmp_info[cmp].huff_dc);
        let dc_trees = &self.h_trees[0];
        &dc_trees[table]
    }

    /// Huffman codes of the AC table that component `cmp` selects through its `huff_ac` index.
    #[inline(always)]
    pub(super) fn get_huff_ac_codes(&self, cmp: usize) -> &HuffCodes {
        let table = usize::from(self.cmp_info[cmp].huff_ac);
        let ac_codes = &self.h_codes[1];
        &ac_codes[table]
    }

    /// Huffman decoding tree of the AC table that component `cmp` selects through its
    /// `huff_ac` index.
    #[inline(always)]
    pub(super) fn get_huff_ac_tree(&self, cmp: usize) -> &HuffTree {
        let table = usize::from(self.cmp_info[cmp].huff_ac);
        let ac_trees = &self.h_trees[1];
        &ac_trees[table]
    }

    /// Parses JPEG segments and updates the appropriate header fields
    /// until we hit either an SOS (image data) or EOI (end of image).
    ///
    /// Once the SOS is reached, the values that depend on the whole header are calculated:
    /// the maximum sampling factors, the MCU counts, and the block counts and statistical
    /// id of every component.
    ///
    /// Returns false if we hit EOI, true if we have an image to process.
    ///
    /// # Errors
    ///
    /// Propagates the errors of `parse_next_segment`, and returns `UnsupportedJpeg` if the
    /// header is incomplete or one of the MCU dimensions works out to zero.
    pub fn parse<R: Read>(
        &mut self,
        reader: &mut R,
        enabled_features: &EnabledFeatures,
    ) -> Result<bool> {
        // consume segments until the scan starts or the image ends
        loop {
            let outcome = self
                .parse_next_segment(reader, enabled_features)
                .context()?;

            match outcome {
                ParseSegmentResult::SOS => break,
                ParseSegmentResult::EOI => return Ok(false),
                ParseSegmentResult::Continue => {}
            }
        }

        // check if information is complete
        if self.cmpc == 0 {
            return err_exit_code(
                ExitCode::UnsupportedJpeg,
                "header contains incomplete information",
            );
        }

        for cmp in 0..self.cmpc {
            let info = &self.cmp_info[cmp];

            let complete = info.sfv != 0
                && info.sfh != 0
                && self.q_tables[usize::from(info.q_table_index)][0] != 0
                && self.jpeg_type != JpegType::Unknown;

            if !complete {
                return err_exit_code(
                    ExitCode::UnsupportedJpeg,
                    "header contains incomplete information (components)",
                );
            }
        }

        // do all remaining component info calculations, starting with the largest
        // sampling factor in each direction
        for cmp in 0..self.cmpc {
            self.sfhm = self.sfhm.max(self.cmp_info[cmp].sfh);
            self.sfvm = self.sfvm.max(self.cmp_info[cmp].sfv);
        }

        let height = f64::from(self.img_height);
        let width = f64::from(self.img_width);
        let max_sfh = f64::from(self.sfhm);
        let max_sfv = f64::from(self.sfvm);

        let mcu_rows = (height / (8.0 * max_sfh)).ceil() as u32;
        self.mcuv = NonZeroU32::new(mcu_rows)
            .ok_or_else(|| LeptonError::new(ExitCode::UnsupportedJpeg, "mcuv is zero"))?;

        let mcu_columns = (width / (8.0 * max_sfv)).ceil() as u32;
        self.mcuh = NonZeroU32::new(mcu_columns)
            .ok_or_else(|| LeptonError::new(ExitCode::UnsupportedJpeg, "mcuh is zero"))?;

        self.mcuc = mcu_rows * mcu_columns;

        // block counts of every component
        for cmp in 0..self.cmpc {
            let info = &mut self.cmp_info[cmp];

            info.mbs = info.sfv * info.sfh;
            info.bcv = mcu_rows * info.sfh;
            info.bch = mcu_columns * info.sfv;
            info.bc = info.bcv * info.bch;
            info.ncv = (height * (f64::from(info.sfh) / (8.0 * max_sfh))).ceil() as u32;
            info.nch = (width * (f64::from(info.sfv) / (8.0 * max_sfv))).ceil() as u32;
            info.nc = info.ncv * info.nch;
        }

        // decide components' statistical ids: every component gets its own id as long as
        // there are at most three of them, otherwise all of them share id 0
        let separate_ids = self.cmpc <= 3;
        for cmp in 0..self.cmpc {
            self.cmp_info[cmp].sid = if separate_ids { cmp as u32 } else { 0 };
        }

        Ok(true)
    }

    /// Verifies that the huffman tables of the requested kinds are present for every
    /// component of the current scan.
    ///
    /// `dc_present` / `ac_present` select whether the DC / AC table of each component is
    /// checked.
    ///
    /// # Errors
    ///
    /// Returns `UnsupportedJpeg`, naming the first component that refers to a table that
    /// has not been set.
    pub fn verify_huffman_table(&self, dc_present: bool, ac_present: bool) -> Result<()> {
        for icsc in 0..self.cs_cmpc {
            let icmp = self.cs_cmp[icsc];

            let dc_missing =
                dc_present && self.ht_set[0][self.cmp_info[icmp].huff_dc as usize] == 0;
            if dc_missing {
                return err_exit_code(
                    ExitCode::UnsupportedJpeg,
                    format!("DC huffman table missing for component {0}", icmp),
                );
            }

            let ac_missing =
                ac_present && self.ht_set[1][self.cmp_info[icmp].huff_ac as usize] == 0;
            if ac_missing {
                return err_exit_code(
                    ExitCode::UnsupportedJpeg,
                    format!("AC huffman table missing for component {0}", icmp),
                );
            }
        }

        Ok(())
    }

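    // Parses a single segment. Returns SOS once a start-of-scan segment has been parsed and
    // EOI at the end-of-image marker or the end of the input; otherwise the caller should
    // continue parsing headers.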
    fn parse_next_segment<R: Read>(
        &mut self,
        reader: &mut R,
        enabled_features: &EnabledFeatures,
    ) -> Result<ParseSegmentResult> {
        let mut header = [0u8; 4];

        if reader.read(&mut header[0..1]).context()? == 0 {
            // didn't get an EOI
            return Ok(ParseSegmentResult::EOI);
        }

        if header[0] != 0xff {
            return err_exit_code(ExitCode::UnsupportedJpeg, "invalid header encountered");
        }

        reader.read_exact(&mut header[1..2]).context()?;
        if header[1] == jpeg_code::EOI {
            return Ok(ParseSegmentResult::EOI);
        }

        // now read the second two bytes so we can get the size of the segment
        reader.read_exact(&mut header[2..]).context()?;

        let mut segment_data = Vec::new();

        let segment_size = b_short(header[2], header[3]);
        if segment_size < 2 {
            return err_exit_code(ExitCode::UnsupportedJpeg, "segment is too short");
        }

        segment_data.resize(usize::from(segment_size) - 2, 0);

        reader.read_exact(&mut segment_data).context()?;

        let mut hpos = 0;
        let len = segment_data.len();

        let segment = &segment_data[..];

        let btype = header[1];
        match btype
        {
            jpeg_code::DHT => // DHT segment
            {
                // build huffman trees & codes
                while hpos < len
                {
                    let lval = usize::from(lbits(segment[hpos], 4));
                    let rval = usize::from(rbits(segment[hpos], 4));
                    if (lval >= 2) || (rval >= 4)
                    {
                        break;
                    }

                    hpos+=1;

                    // build huffman codes & trees
                    self.h_codes[lval][rval] = HuffCodes::construct_from_segment(&segment[hpos..]).context()?;
                    self.h_trees[lval][rval] = HuffTree::construct_hufftree(&self.h_codes[lval][rval], enabled_features.accept_invalid_dht).context()?;
                    self.ht_set[lval][rval] = 1;

                    let mut skip = 16;

                    ensure_space(segment,hpos, 16)?;

                    for i in 0..16
                    {
                        skip += usize::from(segment[hpos + i]);
                    }

                    hpos += skip;
                }

                if hpos != len
                {
                    // if we get here, something went wrong
                    return err_exit_code(ExitCode::UnsupportedJpeg,"size mismatch in dht marker");
                }
            }

            jpeg_code::DQT => // DQT segment
            {
                // copy quantization tables to internal memory
                while hpos < len
                {
                    let lval = usize::from(lbits(segment[hpos], 4));
                    let rval = usize::from(rbits(segment[hpos], 4));
                    if lval >= 2 || rval >= 4
                    {
                        return err_exit_code(ExitCode::UnsupportedJpeg,"DQT has invalid index");
                    }

                    hpos+=1;
                    if lval == 0
                    {
                        ensure_space(segment,hpos, 64).context()?;

                        // 8 bit precision
                        for i in 0..64
                        {
                            self.q_tables[rval][i] = segment[hpos + i] as u16;
                            if self.q_tables[rval][i] == 0
                            {
                                if enabled_features.reject_dqts_with_zeros
                                {
                                    return err_exit_code(ExitCode::UnsupportedJpegWithZeroIdct0,"DQT has zero value");
                                }
                                else {
                                    break;
                                }
                            }
                        }

                        hpos += 64;
                    }
                    else
                    {
                        ensure_space(segment,hpos, 128).context()?;

                        // 16 bit precision
                        for i in 0..64
                        {
                            self.q_tables[rval][i] = b_short(segment[hpos + (2 * i)], segment[hpos + (2 * i) + 1]);
                            if self.q_tables[rval][i] == 0
                            {
                                if enabled_features.reject_dqts_with_zeros
                                {
                                    return err_exit_code(ExitCode::UnsupportedJpegWithZeroIdct0,"DQT has zero value");
                                }
                                else {
                                    break;
                                }
                            }
                        }

                        hpos += 128;
                    }
                }

                if hpos != len
                {
                    // if we get here, something went wrong
                    return err_exit_code(ExitCode::UnsupportedJpeg, "size mismatch in dqt marker");
                }

            }

            jpeg_code::DRI =>
            {  // DRI segment
                // define restart interval
                ensure_space(segment,hpos, 2).context()?;
                self.rsti = u32::from(b_short(segment[hpos], segment[hpos + 1]));
            }

            jpeg_code::SOS => // SOS segment
            {
                // prepare next scan
                ensure_space(segment,hpos, 1).context()?;

                self.cs_cmpc = usize::from(segment[hpos]);

                if self.cs_cmpc == 0
                {
                    return err_exit_code( ExitCode::UnsupportedJpeg, "zero components in scan");
                }

                if self.cs_cmpc > self.cmpc
                {
                    return err_exit_code( ExitCode::UnsupportedJpeg, format!("{0} components in scan, only {1} are allowed", self.cs_cmpc, self.cmpc));
                }

                hpos+=1;
                for i in 0..self.cs_cmpc
                {
                    ensure_space(segment,hpos, 2).context()?;

                    let mut cmp = 0;
                    while cmp < self.cmpc && segment[hpos] != self.cmp_info[cmp].jid
                    {
                        cmp+=1;
                    }

                    if cmp == self.cmpc
                    {
                        return err_exit_code(ExitCode::UnsupportedJpeg, "component id mismatch in start-of-scan");
                    }

                    self.cs_cmp[i] = cmp;
                    self.cmp_info[cmp].huff_dc = lbits(segment[hpos + 1], 4);
                    self.cmp_info[cmp].huff_ac = rbits(segment[hpos + 1], 4);

                    if (self.cmp_info[cmp].huff_dc == 0xff) || (self.cmp_info[cmp].huff_dc >= 4) ||
                        (self.cmp_info[cmp].huff_ac == 0xff) || (self.cmp_info[cmp].huff_ac >= 4)
                    {
                        return err_exit_code(ExitCode::UnsupportedJpeg,"huffman table number mismatch");
                    }

                    hpos += 2;
                }

                ensure_space(segment,hpos, 3).context()?;

                self.cs_from = segment[hpos + 0];
                self.cs_to = segment[hpos + 1];
                self.cs_sah = lbits(segment[hpos + 2], 4);
                self.cs_sal = rbits(segment[hpos + 2], 4);

                // check for errors
                if (self.cs_from > self.cs_to) || (self.cs_from > 63) || (self.cs_to > 63)
                {
                    return err_exit_code(ExitCode::UnsupportedJpeg,"spectral selection parameter out of range");
                }

                if (self.cs_sah >= 12) || (self.cs_sal >= 12)
                {
                    return err_exit_code(ExitCode::UnsupportedJpeg, "successive approximation parameter out of range");
                }

                return Ok(ParseSegmentResult::SOS);
            }

            jpeg_code::SOF0| // SOF0 segment, coding process: baseline DCT
            jpeg_code::SOF1| // SOF1 segment, coding process: extended sequential DCT
            jpeg_code::SOF2 =>  // SOF2 segment, coding process: progressive DCT
            {
                if self.jpeg_type != JpegType::Unknown
                {
                    return err_exit_code(ExitCode::UnsupportedJpeg, "image cannot have multiple SOF blocks");
                }

                // set JPEG coding type
                if btype == jpeg_code::SOF2
                {
                    self.jpeg_type = JpegType::Progressive;
                }
                else
                {
                    self.jpeg_type = JpegType::Sequential;
                }

                ensure_space(segment,hpos, 6).context()?;

                // check data precision, only 8 bit is allowed
                let lval = segment[hpos];
                if lval != 8
                {
                    return err_exit_code(ExitCode::UnsupportedJpeg, format!("{0} bit data precision is not supported", lval));
                }

                // image size, height & component count
                self.img_height = u32::from(b_short(segment[hpos + 1], segment[hpos + 2]));
                self.img_width = u32::from(b_short(segment[hpos + 3], segment[hpos + 4]));

                if self.img_height == 0 || self.img_width == 0
                {
                    return err_exit_code(ExitCode::UnsupportedJpeg, "image dimensions can't be zero");
                }

                if self.img_height > enabled_features.max_jpeg_height || self.img_width > enabled_features.max_jpeg_width
                {
                    return err_exit_code(ExitCode::UnsupportedJpeg, format!("image dimensions larger than {0}x{1}", enabled_features.max_jpeg_width, enabled_features.max_jpeg_height));
                }

                self.cmpc = usize::from(segment[hpos + 5]);

                if self.cmpc > 4
                {
                    return err_exit_code(ExitCode::UnsupportedJpeg, format!("image has {0} components, max 4 are supported", self.cmpc));
                }

                hpos += 6;

                // components contained in image
                for cmp in  0..self.cmpc
                {
                    ensure_space(segment,hpos, 3).context()?;

                    self.cmp_info[cmp].jid = segment[hpos];
                    self.cmp_info[cmp].sfv = u32::from(lbits(segment[hpos + 1], 4));
                    self.cmp_info[cmp].sfh = u32::from(rbits(segment[hpos + 1], 4));

                    if self.cmp_info[cmp].sfv > 2 || self.cmp_info[cmp].sfh > 2
                    {
                        return err_exit_code(ExitCode::SamplingBeyondTwoUnsupported, "Sampling type beyond to not supported");
                    }

                    let quantization_table_value = segment[hpos + 2];
                    if usize::from(quantization_table_value) >= self.q_tables.len()
                    {
                        return err_exit_code(ExitCode::UnsupportedJpeg,"quantizationTableValue too big");
                    }

                    self.cmp_info[cmp].q_table_index = quantization_table_value;
                    hpos += 3;
                }

            }

            0xC3 => // SOF3 segment
                {
                    // coding process: lossless sequential
                    return err_exit_code(ExitCode::UnsupportedJpeg,"sof3 marker found, image is coded lossless");
                }

            0xC5 => // SOF5 segment
                {
                    // coding process: differential sequential DCT
                    return err_exit_code(ExitCode::UnsupportedJpeg,"sof5 marker found, image is coded diff. sequential");
                }

            0xC6 => // SOF6 segment
                {
                    // coding process: differential progressive DCT
                    return err_exit_code(ExitCode::UnsupportedJpeg,"sof6 marker found, image is coded diff. progressive");
                }

            0xC7 => // SOF7 segment
                {
                    // coding process: differential lossless
                    return err_exit_code(ExitCode::UnsupportedJpeg,"sof7 marker found, image is coded diff. lossless");
                }

            0xC9 => // SOF9 segment
                {
                    // coding process: arithmetic extended sequential DCT
                    return err_exit_code(ExitCode::UnsupportedJpeg, "sof9 marker found, image is coded arithm. sequential");
                }

            0xCA => // SOF10 segment
                {
                    // coding process: arithmetic progressive DCT
                    return err_exit_code(ExitCode::UnsupportedJpeg, "sof10 marker found, image is coded arithm. progressive");
                }

            0xCB => // SOF11 segment
                {
                    // coding process: arithmetic lossless
                    return err_exit_code(ExitCode::UnsupportedJpeg, "sof11 marker found, image is coded arithm. lossless");
                }

            0xCD => // SOF13 segment
                {
                    // coding process: arithmetic differential sequential DCT
                    return err_exit_code(ExitCode::UnsupportedJpeg, "sof13 marker found, image is coded arithm. diff. sequential");
                }

            0xCE => // SOF14 segment
                {
                    // coding process: arithmetic differential progressive DCT
                    return err_exit_code(ExitCode::UnsupportedJpeg, "sof14 marker found, image is coded arithm. diff. progressive");
                }

            0xCF => // SOF15 segment
                {
                    // coding process: arithmetic differential lossless
                    return err_exit_code(ExitCode::UnsupportedJpeg, "sof15 marker found, image is coded arithm. diff. lossless");
                }

            0xE0| // APP0 segment
            0xE1| // APP1 segment
            0xE2| // APP2 segment
            0xE3| // APP3 segment
            0xE4| // APP4 segment
            0xE5| // APP5 segment
            0xE6| // APP6 segment
            0xE7| // APP7 segment
            0xE8| // APP8 segment
            0xE9| // APP9 segment
            0xEA| // APP10 segment
            0xEB| // APP11 segment
            0xEC| // APP12segment
            0xED| // APP13 segment
            0xEE| // APP14 segment
            0xEF| // APP15 segment
            0xFE // COM segment
                // do nothing - return
                => {}

            jpeg_code::RST0| // RST0 segment
            0xD1| // RST1 segment
            0xD2| // RST2 segment
            0xD3| // RST3 segment
            0xD4| // RST4 segment
            0xD5| // RST5 segment
            0xD6| // RST6 segment
            0xD7 => // RST7 segment
                {
                    // return errormessage - RST is out of place here
                    return err_exit_code(ExitCode::UnsupportedJpeg, "rst marker found out of place");
                }

            jpeg_code::SOI => // SOI segment
                {
                    // return errormessage - start-of-image is out of place here
                    return err_exit_code(ExitCode::UnsupportedJpeg, "soi marker found out of place");
                }

            jpeg_code::EOI => // EOI segment
                {
                    // return errormessage - end-of-image is out of place here
                    return err_exit_code(ExitCode::UnsupportedJpeg,"eoi marker found out of place");
                }

            _ => // unknown marker segment
                {
                    // return errormessage - unknown marker
                    return err_exit_code(ExitCode::UnsupportedJpeg, format!("unknown marker found: FF {0:X}", btype));
                }
        }
        return Ok(ParseSegmentResult::Continue);
    }
}

/// Checks that `amount` bytes starting at offset `hpos` lie inside `segment`.
///
/// Returns `Ok(())` when the range fits, otherwise an `UnsupportedJpeg` error.
fn ensure_space(segment: &[u8], hpos: usize, amount: usize) -> Result<()> {
    let required_len = hpos + amount;

    if required_len <= segment.len() {
        Ok(())
    } else {
        err_exit_code(ExitCode::UnsupportedJpeg, "SOF too small")
    }
}

/// Builds a Huffman code table for testing purposes from a symbol frequency distribution.
///
/// Only symbols with a non-zero frequency receive a code. The tree is built by repeatedly
/// merging the two least frequent nodes; the resulting code value and bit length of each
/// symbol are stored in the returned `HuffCodes` before `post_initialize` is called on it.
///
/// # Panics
///
/// Panics if every entry of `freq` is zero, since no tree can be built.
#[cfg(any(test, feature = "micro_benchmark"))]
pub(super) fn generate_huff_table_from_distribution(freq: &[usize; 256]) -> HuffCodes {
    use std::collections::{BinaryHeap, HashMap};

    /// tree node: leaves carry a symbol, merged nodes carry their two children
    struct Node {
        symbol: Option<u8>,
        freq: usize,
        left: Option<Box<Node>>,
        right: Option<Box<Node>>,
    }

    // The ordering is reversed on frequency so that `BinaryHeap` (a max-heap)
    // hands out the least frequent node first.
    impl Ord for Node {
        fn cmp(&self, other: &Self) -> std::cmp::Ordering {
            other.freq.cmp(&self.freq)
        }
    }

    impl PartialOrd for Node {
        fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
            Some(self.cmp(other))
        }
    }

    impl PartialEq for Node {
        fn eq(&self, other: &Self) -> bool {
            self.freq == other.freq
        }
    }

    impl Eq for Node {}

    /// merges the used symbols into a single tree and returns its root
    fn build_tree(freq: &[usize]) -> Box<Node> {
        let mut heap = BinaryHeap::new();

        // nodes are pushed one at a time, in symbol order
        for (index, &count) in freq.iter().enumerate() {
            if count == 0 {
                continue;
            }

            heap.push(Box::new(Node {
                symbol: Some(index as u8),
                freq: count,
                left: None,
                right: None,
            }));
        }

        loop {
            let first = heap.pop().unwrap();

            match heap.pop() {
                // nothing left to merge with: `first` is the root
                None => return first,
                Some(second) => {
                    let merged = Node {
                        symbol: None,
                        freq: first.freq + second.freq,
                        left: Some(first),
                        right: Some(second),
                    };
                    heap.push(Box::new(merged));
                }
            }
        }
    }

    /// walks the tree, appending a 0 bit for left branches and a 1 bit for right branches
    fn assign_codes(node: &Node, table: &mut HashMap<u8, (u16, u8)>, bits: u16, depth: u8) {
        match node.symbol {
            Some(symbol) => {
                table.insert(symbol, (bits, depth));
            }
            None => {
                if let Some(child) = node.left.as_deref() {
                    assign_codes(child, table, bits << 1, depth + 1);
                }
                if let Some(child) = node.right.as_deref() {
                    assign_codes(child, table, (bits << 1) | 1, depth + 1);
                }
            }
        }
    }

    let tree = build_tree(freq);

    let mut table = HashMap::new();
    assign_codes(&tree, &mut table, 0, 0);

    let mut huff_codes = HuffCodes::default();

    for (symbol, (code, length)) in table {
        let slot = usize::from(symbol);
        huff_codes.c_len[slot] = length.into();
        huff_codes.c_val[slot] = code;
    }

    huff_codes.post_initialize();

    huff_codes
}


================================================
FILE: lib/src/jpeg/jpeg_position_state.rs
================================================
/*---------------------------------------------------------------------------------------------
 *  Copyright (c) Microsoft Corporation. All rights reserved.
 *  Licensed under the Apache License, Version 2.0. See LICENSE.txt in the project root for license information.
 *  This software incorporates material from third parties. See NOTICE.txt for details.
 *--------------------------------------------------------------------------------------------*/

use crate::consts::{JpegDecodeStatus, JpegType};
use crate::lepton_error::{AddContext, ExitCode, err_exit_code};
use crate::{LeptonError, Result};

use super::jpeg_header::{HuffCodes, JpegHeader};

/// used to keep track of position while encoding or decoding a jpeg
pub struct JpegPositionState {
    /// current component (index into the header's `cmp_info` table)
    cmp: usize,

    /// current minimum coded unit (a fraction of dpos)
    mcu: u32,

    /// index of the current component within the scan's component list (`cs_cmp`)
    csc: usize,

    /// offset within mcu
    sub: u32,

    /// current block position in image for this component
    dpos: u32,

    /// countdown until the restart interval expires; reloaded from the
    /// header's `rsti` by `reset_rstw`
    rstw: u32,

    /// pending end-of-block run in progressive images: the number of blocks
    /// that `skip_eobrun` will step over
    pub eobrun: u16,

    /// if the previous value was also an eobrun then this is used to make sure
    /// that we don't have two non-maximum value runs in a row that we wouldn't be
    /// able to recode exactly the same way
    pub prev_eobrun: u16,
}

impl JpegPositionState {
    /// Creates a position state located at MCU index `mcu`, pointing at the first
    /// component of the current scan.
    ///
    /// The restart countdown is set to the distance from `mcu` to the end of its restart
    /// interval, or to 0 when the header has no restart interval (`rsti == 0`).
    pub fn new(jf: &JpegHeader, mcu: u32) -> Self {
        let first_cmp = jf.cs_cmp[0];
        let info = &jf.cmp_info[first_cmp];
        let sampling_product = info.sfv * info.sfh;

        let rstw = match jf.rsti {
            0 => 0,
            interval => interval - (mcu % interval),
        };

        JpegPositionState {
            cmp: first_cmp,
            mcu,
            csc: 0,
            sub: 0,
            dpos: mcu * sampling_product,
            rstw,
            eobrun: 0,
            prev_eobrun: 0,
        }
    }

    /// returns the index of the current minimum coded unit (MCU)
    pub fn get_mcu(&self) -> u32 {
        self.mcu
    }
    /// returns the current block position within the current component
    pub fn get_dpos(&self) -> u32 {
        self.dpos
    }
    /// returns the index of the current component
    pub fn get_cmp(&self) -> usize {
        self.cmp
    }

    /// Returns the current MCU index divided by the restart interval (`jf.rsti`),
    /// or 0 while the restart countdown is at zero.
    pub fn get_cumulative_reset_markers(&self, jf: &JpegHeader) -> u32 {
        if self.rstw == 0 {
            return 0;
        }

        self.mcu / jf.rsti
    }

    /// Reloads the restart countdown from the header's restart interval and
    /// forgets the previous end-of-block run.
    pub fn reset_rstw(&mut self, jf: &JpegHeader) {
        // eobruns don't span reset intervals
        self.prev_eobrun = 0;

        self.rstw = jf.rsti;
    }

    /// Advances to the next block position of a non-interleaved (single component) scan.
    ///
    /// Returns `ScanCompleted` once the position reaches the component's block count,
    /// `RestartIntervalExpired` when the restart countdown hits zero, and
    /// `DecodeInProgress` otherwise.
    fn next_mcu_pos_noninterleaved(&mut self, jf: &JpegHeader) -> JpegDecodeStatus {
        // step to the next block
        self.dpos += 1;

        let info = &jf.cmp_info[self.cmp];

        // fix for non interleaved mcu - horizontal
        let horizontal_differs = info.bch != info.nch;
        if horizontal_differs && self.dpos % info.bch == info.nch {
            self.dpos += info.bch - info.nch;
        }

        // fix for non interleaved mcu - vertical
        let vertical_differs = info.bcv != info.ncv;
        if vertical_differs && self.dpos / info.bch == info.ncv {
            self.dpos = info.bc;
        }

        // with dpos settled, derive the current MCU as a fraction of it
        if jf.jpeg_type == JpegType::Sequential {
            self.mcu = self.dpos / (info.sfv * info.sfh);
        }

        // reaching the block count ends the scan, before the restart counter is touched
        if self.dpos >= info.bc {
            return JpegDecodeStatus::ScanCompleted;
        }

        if jf.rsti > 0 {
            self.rstw -= 1;
            if self.rstw == 0 {
                return JpegDecodeStatus::RestartIntervalExpired;
            }
        }

        JpegDecodeStatus::DecodeInProgress
    }

    /// calculates next position for MCU
    ///
    /// Scans with a single component are delegated to `next_mcu_pos_noninterleaved`.
    /// Otherwise the sub-block, scan-component and MCU counters are advanced in that
    /// order, and `dpos` is recomputed for the component that is now current.
    ///
    /// Returns `ScanCompleted` when the MCU counter reaches `jf.mcuc`,
    /// `RestartIntervalExpired` when the restart countdown reaches zero after moving to a
    /// new MCU, and `DecodeInProgress` otherwise.
    pub fn next_mcu_pos(&mut self, jf: &JpegHeader) -> JpegDecodeStatus {
        // if there is just one component, go the simple route
        if jf.cs_cmpc == 1 {
            return self.next_mcu_pos_noninterleaved(jf);
        }

        let mut sta = JpegDecodeStatus::DecodeInProgress; // status
        let local_mcuh = jf.mcuh.get();
        // local copies that mirror the fields of self as they get updated below
        let mut local_mcu = self.mcu;
        let mut local_cmp = self.cmp;

        // increment all counts where needed
        self.sub += 1;
        let mut local_sub = self.sub;
        if local_sub >= jf.cmp_info[local_cmp].mbs {
            // all blocks of this component within the MCU are done: move to the next
            // component of the scan
            self.sub = 0;
            local_sub = 0;

            self.csc += 1;

            if self.csc >= jf.cs_cmpc {
                // all components of the scan are done: wrap around to the first
                // component and move to the next MCU
                self.csc = 0;
                self.cmp = jf.cs_cmp[0];
                local_cmp = self.cmp;

                self.mcu += 1;

                local_mcu = self.mcu;

                if local_mcu >= jf.mcuc {
                    sta = JpegDecodeStatus::ScanCompleted;
                } else if jf.rsti > 0 {
                    // the restart countdown is only decremented when a new MCU starts
                    self.rstw -= 1;
                    if self.rstw == 0 {
                        sta = JpegDecodeStatus::RestartIntervalExpired;
                    }
                }
            } else {
                self.cmp = jf.cs_cmp[self.csc];
                local_cmp = self.cmp;
            }
        }

        let sfh = jf.cmp_info[local_cmp].sfh;
        let sfv = jf.cmp_info[local_cmp].sfv;

        // get correct position in image ( x & y )
        if sfh > 1 {
            // to fix mcu order
            // quotients and remainders of the MCU index by mcuh and of the sub-block index
            // by sfv (the remainders are computed by subtraction instead of `%`)
            let mcu_over_mcuh = local_mcu / local_mcuh;
            let sub_over_sfv = local_sub / sfv;
            let mcu_mod_mcuh = local_mcu - (mcu_over_mcuh * local_mcuh);
            let sub_mod_sfv = local_sub - (sub_over_sfv * sfv);
            let mut local_dpos = (mcu_over_mcuh * sfh) + sub_over_sfv;

            // NOTE(review): bch looks like the number of blocks per row for this
            // component, which would make this "row * row_length + column" - confirm
            local_dpos *= jf.cmp_info[local_cmp].bch;
            local_dpos += (mcu_mod_mcuh * sfv) + sub_mod_sfv;

            self.dpos = local_dpos;
        } else if sfv > 1 {
            // simple calculation to speed up things if simple fixing is enough
            self.dpos = (local_mcu * jf.cmp_info[local_cmp].mbs) + local_sub;
        } else {
            // no calculations needed without subsampling
            self.dpos = self.mcu;
        }

        return sta;
    }

    /// Consumes the pending end-of-block run by moving `dpos` past the blocks it covers.
    ///
    /// Does nothing and reports `DecodeInProgress` when no run is pending. Otherwise the
    /// run is subtracted from the restart countdown (when restart intervals are in use),
    /// `dpos` is advanced, and `eobrun` is reset to zero.
    ///
    /// # Errors
    ///
    /// Returns `UnsupportedJpeg` if the run is longer than what is left of the restart
    /// interval, if the position arithmetic overflows, or if the new position lies beyond
    /// the component's block count.
    ///
    /// # Panics
    ///
    /// Panics if the scan has more than one component.
    pub fn skip_eobrun(&mut self, jf: &JpegHeader) -> Result<JpegDecodeStatus> {
        assert!(jf.cs_cmpc == 1, "this code only works for non-interleved");

        /// adds two block counts, turning an overflow into an error
        fn add_blocks(lhs: u32, rhs: u32) -> Result<u32> {
            match lhs.checked_add(rhs) {
                Some(total) => Ok(total),
                None => Err(LeptonError::new(
                    ExitCode::UnsupportedJpeg,
                    "integer overflow",
                )),
            }
        }

        let run = u32::from(self.eobrun);
        if run == 0 {
            return Ok(JpegDecodeStatus::DecodeInProgress);
        }

        // the run has to fit into what is left of the restart interval
        if jf.rsti > 0 {
            if run > self.rstw {
                return err_exit_code(
                    ExitCode::UnsupportedJpeg,
                    "skip_eobrun: eob run extends passed end of reset interval",
                )
                .context();
            }

            self.rstw -= run;
        }

        let info = &jf.cmp_info[self.cmp];

        // fix for non interleaved mcu - horizontal
        if info.bch != info.nch {
            let offset_in_row = self.dpos % info.bch;
            let rows_crossed = (offset_in_row + run) / info.nch;
            let extra = rows_crossed * (info.bch - info.nch);

            self.dpos = add_blocks(self.dpos, extra).context()?;
        }

        // fix for non interleaved mcu - vertical
        if info.bcv != info.ncv && self.dpos / info.bch >= info.ncv {
            let extra = (info.bcv - info.ncv) * info.bch;

            self.dpos = add_blocks(self.dpos, extra).context()?;
        }

        // step over the blocks covered by the run, which is now consumed
        self.dpos = add_blocks(self.dpos, run).context()?;
        self.eobrun = 0;

        // compare against the block count to see if we are done decoding
        match self.dpos.cmp(&info.bc) {
            std::cmp::Ordering::Equal => Ok(JpegDecodeStatus::ScanCompleted),
            std::cmp::Ordering::Greater => err_exit_code(
                ExitCode::UnsupportedJpeg,
                "skip_eobrun: position extended passed block count",
            )
            .context(),
            std::cmp::Ordering::Less if jf.rsti > 0 && self.rstw == 0 => {
                Ok(JpegDecodeStatus::RestartIntervalExpired)
            }
            std::cmp::Ordering::Less => Ok(JpegDecodeStatus::DecodeInProgress),
        }
    }

    /// Verifies that end-of-block runs are optimal, i.e. that each run is as long as it
    /// legally can be. The encoder always emits maximal runs, so a file with a shorter
    /// run followed by another empty block could not be re-encoded identically.
    ///
    /// On success the current `eobrun` is remembered as `prev_eobrun` for the next check.
    ///
    /// # Errors
    ///
    /// Returns `UnsupportedJpeg` if the current block is empty and the previous run was
    /// non-zero but below `hc.max_eob_run - 1`.
    pub fn check_optimal_eobrun(
        &mut self,
        is_current_block_empty: bool,
        hc: &HuffCodes,
    ) -> Result<()> {
        let previous = self.prev_eobrun;

        // an empty block right after a run that was not maxed out means the two could
        // have been merged into one longer run
        let previous_run_was_short =
            is_current_block_empty && previous > 0 && previous < hc.max_eob_run - 1;

        if previous_run_was_short {
            return err_exit_code(
                ExitCode::UnsupportedJpeg,
                format!(
                    "non optimal eobruns not supported (could have encoded up to {0} zero runs in a row, but only did {1} followed by {2}",
                    hc.max_eob_run,
                    previous + 1,
                    self.eobrun + 1
                ),
            );
        }

        self.prev_eobrun = self.eobrun;

        Ok(())
    }
}


================================================
FILE: lib/src/jpeg/jpeg_read.rs
================================================
/*---------------------------------------------------------------------------------------------
 *  Copyright (c) Microsoft Corporation. All rights reserved.
 *  Licensed under the Apache License, Version 2.0. See LICENSE.txt in the project root for license information.
 *  This software incorporates material from third parties. See NOTICE.txt for details.
 *--------------------------------------------------------------------------------------------*/

/*
Copyright (c) 2006...2016, Matthias Stirner and HTW Aalen University
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:

1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.

2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

use std::cmp::{self, max};
use std::io::{BufRead, Read, Seek, SeekFrom};

use crate::helpers::*;
use crate::lepton_error::{AddContext, ExitCode, Result, err_exit_code};
use crate::{EnabledFeatures, consts::*};

use super::bit_reader::BitReader;
use super::block_based_image::{AlignedBlock, BlockBasedImage};
use super::jpeg_code;
use super::jpeg_header::{
    HuffTree, JpegHeader, ReconstructionInfo, RestartSegmentCodingInfo, parse_jpeg_header,
};
use super::jpeg_position_state::JpegPositionState;

/// Reads a JPEG file from the provided reader and returns the image data. This function is
/// designed to return all the information needed to reconstruct a bit-level identical
/// JPEG file.
///
/// In some cases this will not be possible, for example if a JPEG contains certain coding errors
/// that are non-standard, in which case the function will return an error. This doesn't mean the JPEG
/// is corrupt, just that it is not supported for identical reconstruction.
///
/// The function returns a tuple of three values:
///
/// 1. The image data as a vector of `BlockBasedImage`, which contain the DCT coefficients
///    for each block in the image (we do not perform inverse DCT, this would be lossy).
/// 2. A vector of `RestartSegmentCodingInfo`, each paired with an offset, which contains the
///    information needed to reconstruct a portion of the JPEG file starting at the given offset.
///    This is useful for baseline images where we can split the image into sections and decode
///    them in parallel.
/// 3. The stream position at which the scan data ended.
///
/// The callback function is called with the JPEG header information after it has been parsed, and
/// is useful for debugging or logging purposes. Progressive images will contain multiple scans and
/// call the callback multiple times, each time with only the header bytes added since the
/// previous call.
///
/// Non-progressive images support the idea of truncating the image (since this happens frequently)
/// where the bitstream is cut off at an arbitrary point. We assume that all subsequent data is zero,
/// but remember enough to reconstruct the bitstream until there.
///
/// There is also the concept of "garbage data" which is what comes after the scan data but is not
/// recognized as a header. This garbage data should be appended to the end of the file, and is
/// stored in `rinfo.garbage_data`.
///
/// # Errors
///
/// Returns an error if the stream does not start with the SOI marker, contains no scans or no
/// scan data, uses features that are disabled in `enabled_features`, has more color channels
/// than supported, is a truncated image with multiple scans, or if any of the underlying
/// reads or parsing steps fail.
pub fn read_jpeg_file<R: BufRead + Seek, FN: FnMut(&JpegHeader, &[u8])>(
    reader: &mut R,
    jpeg_header: &mut JpegHeader,
    rinfo: &mut ReconstructionInfo,
    enabled_features: &EnabledFeatures,
    mut on_header_callback: FN,
) -> Result<(
    Vec<BlockBasedImage>,
    Vec<(u64, RestartSegmentCodingInfo)>,
    u64,
)> {
    // `Read::read` is allowed to return fewer bytes than requested even when more data
    // follows, so keep reading until the two marker bytes are complete or the stream ends.
    // A stream that ends early leaves zeros in the buffer and is rejected by the check below.
    let mut startheader = [0u8; 2];
    let mut filled = 0;
    while filled < startheader.len() {
        let count = reader.read(&mut startheader[filled..])?;
        if count == 0 {
            break;
        }
        filled += count;
    }

    if startheader[0] != 0xFF || startheader[1] != jpeg_code::SOI {
        return err_exit_code(
            ExitCode::UnsupportedJpeg,
            "jpeg must start with with 0xff 0xd8",
        );
    }

    if !prepare_to_decode_next_scan(jpeg_header, rinfo, reader, enabled_features).context()? {
        return err_exit_code(ExitCode::UnsupportedJpeg, "Jpeg does not contain scans");
    }

    on_header_callback(jpeg_header, &rinfo.raw_jpeg_header);

    if !enabled_features.progressive && !jpeg_header.is_single_scan() {
        return err_exit_code(
            ExitCode::ProgressiveUnsupported,
            "file is progressive or contains multiple scans, but this is disabled",
        )
        .context();
    }

    if jpeg_header.cmpc > COLOR_CHANNEL_NUM_BLOCK_TYPES {
        return err_exit_code(
            ExitCode::Unsupported4Colors,
            "doesn't support 4 color channels",
        )
        .context();
    }

    rinfo.truncate_components.init(jpeg_header);
    let mut image_data = Vec::<BlockBasedImage>::new();
    for i in 0..jpeg_header.cmpc {
        // constructor takes height in proportion to the component[0]
        image_data.push(BlockBasedImage::new(
            &jpeg_header,
            i,
            0,
            jpeg_header.cmp_info[0].bcv,
        )?);
    }

    let start_scan_position = reader.stream_position()?;

    let mut partitions = Vec::new();
    read_first_scan(
        &jpeg_header,
        reader,
        &mut partitions,
        &mut image_data[..],
        rinfo,
    )
    .context()?;
    let mut end_scan_position = reader.stream_position()?;

    if start_scan_position + 2 > end_scan_position {
        return err_exit_code(ExitCode::UnsupportedJpeg, "no scan data found in JPEG file")
            .context();
    }

    if partitions.is_empty() {
        return err_exit_code(
            ExitCode::UnsupportedJpeg,
            "no scan information found in JPEG file",
        )
        .context();
    }

    if jpeg_header.is_single_scan() {
        if rinfo.early_eof_encountered {
            if enabled_features.stop_reading_at_eoi {
                return err_exit_code(ExitCode::ShortRead, "early EOF encountered");
            }

            rinfo
                .truncate_components
                .set_truncation_bounds(&jpeg_header, rinfo.max_dpos);

            // If we got an early EOF, then seek backwards and capture the last two bytes and store them as garbage.
            // This is necessary since the decoder will assume that zero garbage always means a properly terminated JPEG
            // even if early EOF was set to true.
            end_scan_position = reader.seek(SeekFrom::Current(-2))?;

            // make sure we don't return any partitions that are beyond the
            // adjusted end position
            if partitions
                .iter()
                .any(|(offset, _)| *offset >= end_scan_position)
            {
                return err_exit_code(
                    ExitCode::UnsupportedJpeg,
                    "Partition conflicts with garbage data",
                );
            }

            rinfo.garbage_data.resize(2, 0);
            reader.read_exact(&mut rinfo.garbage_data)?;
        }

        if enabled_features.stop_reading_at_eoi {
            // ensure there is an actual EOI marker since we haven't consumed it yet
            let mut end_of_file = [0u8; 2];
            reader.read_exact(&mut end_of_file).context()?;

            if end_of_file != EOI {
                return err_exit_code(
                    ExitCode::UnsupportedJpeg,
                    "JPEG file does not end with EOI marker",
                )
                .context();
            }

            rinfo.garbage_data = end_of_file.to_vec();
        } else {
            // read the rest of the file to garbage data
            reader.read_to_end(&mut rinfo.garbage_data).context()?;
        }
    } else {
        assert!(jpeg_header.jpeg_type != JpegType::Unknown);

        if rinfo.early_eof_encountered {
            return err_exit_code(
                ExitCode::UnsupportedJpeg,
                "truncation is only supported for baseline images",
            )
            .context();
        }

        // for progressive images, loop around reading headers and decoding until we have
        // a complete image_data
        let mut prev_raw_jpeg_header_len = rinfo.raw_jpeg_header.len();

        while prepare_to_decode_next_scan(jpeg_header, rinfo, reader, enabled_features).context()? {
            // only hand the newly parsed part of the raw header to the callback
            on_header_callback(
                jpeg_header,
                &rinfo.raw_jpeg_header[prev_raw_jpeg_header_len..],
            );
            prev_raw_jpeg_header_len = rinfo.raw_jpeg_header.len();

            read_progressive_scan(&jpeg_header, reader, &mut image_data[..], rinfo).context()?;

            if rinfo.early_eof_encountered {
                return err_exit_code(
                    ExitCode::UnsupportedJpeg,
                    "truncation is only supported for baseline images",
                )
                .context();
            }
        }

        end_scan_position = reader.stream_position()?;

        // Since prepare_to_decode_next_scan consumed the EOI,
        // we need to add EOI to the beginning of the garbage data (if there is any).
        //
        // If there was actually no garbage data, this is still ok since
        // the marker will be appended, then removed when file gets truncated by the
        // overall file limit.
        rinfo.garbage_data = Vec::from(EOI);

        if !enabled_features.stop_reading_at_eoi {
            // append the rest of the file to the buffer
            reader.read_to_end(&mut rinfo.garbage_data).context()?;
        }
    }

    Ok((image_data, partitions, end_scan_position))
}

/// Parses the headers up to the next scan (appending them to the raw header kept in `rinfo`)
/// and folds the parameters of that scan into the running maxima stored in `rinfo`.
///
/// Returns `Ok(false)` when we hit the end of file marker instead of another scan,
/// `Ok(true)` when a scan is ready to be decoded.
fn prepare_to_decode_next_scan<R: Read>(
    jpeg_header: &mut JpegHeader,
    rinfo: &mut ReconstructionInfo,
    reader: &mut R,
    enabled_features: &EnabledFeatures,
) -> Result<bool> {
    let scan_found = parse_jpeg_header(reader, enabled_features, jpeg_header, rinfo).context()?;
    if !scan_found {
        return Ok(false);
    }

    let scan_bpos = u32::from(jpeg_header.cs_to);
    rinfo.max_bpos = max(rinfo.max_bpos, scan_bpos);

    // FIXME: not sure why only first bit of csSah is examined but 4 bits of it are stored
    let scan_sah = max(jpeg_header.cs_sal, jpeg_header.cs_sah);
    rinfo.max_sah = max(rinfo.max_sah, scan_sah);

    for scan_index in 0..jpeg_header.cs_cmpc {
        let component = jpeg_header.cs_cmp[scan_index] as u32;
        rinfo.max_cmp = max(rinfo.max_cmp, component);
    }

    Ok(true)
}

/// Reads the scan from the JPEG file, writes the image data to the `image_data` array and
/// records the points at which the scan can be split in `partitions`.
///
/// This only works for sequential JPEGs or the first scan in a progressive image.
/// For subsequent scans, use the `read_progressive_scan`.
///
/// The padding bits seen at the end of each restart interval are recorded in
/// `reconstruct_info.pad_bit`.
///
/// # Errors
///
/// Returns an error if a non-sequential scan is not a first-stage DC scan, if the Huffman
/// tables fail verification, or if reading coefficients, fill bits or reset codes fails.
fn read_first_scan<R: BufRead + Seek>(
    jf: &JpegHeader,
    reader: &mut R,
    partitions: &mut Vec<(u64, RestartSegmentCodingInfo)>,
    image_data: &mut [BlockBasedImage],
    reconstruct_info: &mut ReconstructionInfo,
) -> Result<()> {
    let mut bit_reader = BitReader::new(reader);

    // init variables for decoding
    let mut state = JpegPositionState::new(jf, 0);

    // true when the next block decoded should start a new partition
    let mut do_handoff = true;

    // JPEG imagedata decoding routines
    let mut sta = JpegDecodeStatus::DecodeInProgress;
    while sta != JpegDecodeStatus::ScanCompleted {
        // decoding for interleaved data
        state.reset_rstw(jf); // restart wait counter

        if jf.jpeg_type == JpegType::Sequential {
            sta = decode_baseline_rst(
                &mut state,
                &mut bit_reader,
                image_data,
                &mut do_handoff,
                jf,
                reconstruct_info,
                partitions,
            )
            .context()?;
        } else if jf.cs_to == 0 && jf.cs_sah == 0 {
            // only need DC
            jf.verify_huffman_table(true, false).context()?;

            // DC values are coded as differences, so keep the last value per component
            let mut last_dc = [0i16; 4];

            while sta == JpegDecodeStatus::DecodeInProgress {
                let current_block = image_data[state.get_cmp()].get_block_mut(state.get_dpos());

                // collect the handoffs although for progressive images
                // we still split the scan into sections, but we don't partition the actual JPEG
                // writes since they have to be done on a single thread in a loop for a progressive file.
                //
                // TODO: get rid of this and just chop up the scan into sections in Lepton code
                if do_handoff {
                    partitions.push((
                        0,
                        RestartSegmentCodingInfo::new(0, 0, [0; 4], state.get_mcu(), jf),
                    ));

                    do_handoff = false;
                }

                // ---> successive approximation first stage <---

                // diff coding & bitshifting for dc
                let coef = read_dc(&mut bit_reader, jf.get_huff_dc_tree(state.get_cmp()))?;

                let v = coef.wrapping_add(last_dc[state.get_cmp()]);
                last_dc[state.get_cmp()] = v;

                current_block.set_transposed_from_zigzag(0, v << jf.cs_sal);

                let old_mcu = state.get_mcu();
                sta = state.next_mcu_pos(jf);

                // request a handoff whenever we moved to an MCU index that is a multiple of mcuh
                if state.get_mcu() % jf.mcuh == 0 && old_mcu != state.get_mcu() {
                    do_handoff = true;
                }
            }
        } else {
            return err_exit_code(
                ExitCode::UnsupportedJpeg,
                "progress must start with DC stage",
            )
            .context();
        }

        // if we saw a pad bit at the end of the block, then remember whether they were 1s or 0s. This
        // will be used later on to reconstruct the padding
        bit_reader
            .read_and_verify_fill_bits(&mut reconstruct_info.pad_bit)
            .context()?;

        // verify that we got the right RST code here since the above should do 1 mcu.
        // If we didn't then we won't re-encode the file binary identical so there's no point in continuing
        if sta == JpegDecodeStatus::RestartIntervalExpired {
            bit_reader.verify_reset_code().context()?;

            sta = JpegDecodeStatus::DecodeInProgress;
        }
    }

    Ok(())
}

/// Reads one scan of a progressive image, where the image is encoded in multiple passes.
/// Between scans there are headers that need to be parsed, containing information
/// like updated Huffman tables and quantization tables.
///
/// The pass that gets decoded is selected by the scan parameters in `jf`:
/// * `cs_to == 0`: DC refinement pass. One bit is read per block and added, shifted left by
///   `cs_sal`, to coefficient 0. A scan with `cs_sah == 0` is rejected here.
/// * otherwise: AC pass over the zigzag positions `cs_from..=cs_to`. With `cs_sah == 0` this is
///   the first stage (which requires `cs_cmpc == 1`), otherwise it is a later successive
///   approximation stage that refines the coefficients already stored in `image_data`.
///
/// After each run of blocks the fill bits are read and verified and, if the restart interval
/// expired, the reset code is verified before decoding continues.
///
/// # Errors
/// Returns `ExitCode::UnsupportedJpeg` for the invalid scan parameters described above and
/// propagates any error reported by the bit reader, the Huffman table verification or the
/// block decoders.
fn read_progressive_scan<R: BufRead + Seek>(
    jf: &JpegHeader,
    reader: &mut R,
    image_data: &mut [BlockBasedImage],
    reconstruct_info: &mut ReconstructionInfo,
) -> Result<()> {
    // track to see how far we got in progressive encoding in case of truncated images, however this
    // was never actually implemented in the original C++ code
    reconstruct_info.max_sah = max(reconstruct_info.max_sah, max(jf.cs_sal, jf.cs_sah));

    let mut bit_reader = BitReader::new(reader);

    // init variables for decoding
    let mut state = JpegPositionState::new(jf, 0);

    // JPEG imagedata decoding routines
    let mut sta = JpegDecodeStatus::DecodeInProgress;
    while sta != JpegDecodeStatus::ScanCompleted {
        // decoding for interleaved data
        state.reset_rstw(&jf); // restart wait counter

        if jf.cs_to == 0 {
            // this function only handles the DC refinement pass, so a DC first stage
            // (cs_sah == 0) is an error at this point
            if jf.cs_sah == 0 {
                return err_exit_code(
                    ExitCode::UnsupportedJpeg,
                    "progress can't have two DC first stages",
                )
                .context();
            }

            // only need DC
            jf.verify_huffman_table(true, false).context()?;

            while sta == JpegDecodeStatus::DecodeInProgress {
                let current_block = image_data[state.get_cmp()].get_block_mut(state.get_dpos());

                // ---> progressive DC encoding <---

                // ---> successive approximation later stage <---
                // a single refinement bit per block, positioned at bit cs_sal
                let value = bit_reader.read(1)? as i16;

                current_block.set_transposed_from_zigzag(
                    0,
                    current_block
                        .get_transposed_from_zigzag(0)
                        .wrapping_add(value << jf.cs_sal),
                );

                sta = state.next_mcu_pos(jf);
            }
        } else {
            // ---> progressive AC encoding <---

            // the spectral range has to be a non-empty AC-only range inside the block
            if jf.cs_from == 0 || jf.cs_to >= 64 || jf.cs_from >= jf.cs_to {
                return err_exit_code(
                    ExitCode::UnsupportedJpeg,
                    format!(
                        "progressive encoding range was invalid {0} to {1}",
                        jf.cs_from, jf.cs_to
                    ),
                );
            }

            // only need AC
            jf.verify_huffman_table(false, true).context()?;

            if jf.cs_sah == 0 {
                if jf.cs_cmpc != 1 {
                    return err_exit_code(
                        ExitCode::UnsupportedJpeg,
                        "Progressive AC encoding cannot be interleaved",
                    );
                }

                // ---> successive approximation first stage <---
                let mut block = [0; 64];

                while sta == JpegDecodeStatus::DecodeInProgress {
                    let current_block = image_data[state.get_cmp()].get_block_mut(state.get_dpos());

                    if state.eobrun == 0 {
                        // only need to do something if we are not in a zero-block run
                        let eob = decode_ac_prg_fs(
                            &mut bit_reader,
                            jf.get_huff_ac_tree(state.get_cmp()),
                            &mut block,
                            &mut state,
                            jf.cs_from,
                            jf.cs_to,
                        )
                        .context()?;

                        state
                            .check_optimal_eobrun(
                                eob == jf.cs_from,
                                jf.get_huff_ac_codes(state.get_cmp()),
                            )
                            .context()?;

                        // store only the positions that were decoded, shifted up by cs_sal
                        for bpos in jf.cs_from..eob {
                            current_block.set_transposed_from_zigzag(
                                usize::from(bpos),
                                block[usize::from(bpos)] << jf.cs_sal,
                            );
                        }
                    }

                    sta = state.skip_eobrun(&jf).context()?;

                    // proceed only if no error encountered
                    if sta == JpegDecodeStatus::DecodeInProgress {
                        sta = state.next_mcu_pos(jf);
                    }
                }
            } else {
                // ---> successive approximation later stage <---

                let mut block = [0; 64];

                while sta == JpegDecodeStatus::DecodeInProgress {
                    let current_block = image_data[state.get_cmp()].get_block_mut(state.get_dpos());

                    // load the existing coefficients so that the decoders can tell which
                    // positions are already non-zero
                    for bpos in jf.cs_from..jf.cs_to + 1 {
                        block[usize::from(bpos)] =
                            current_block.get_transposed_from_zigzag(usize::from(bpos));
                    }

                    if state.eobrun == 0 {
                        // decode block (long routine)
                        let eob = decode_ac_prg_sa(
                            &mut bit_reader,
                            jf.get_huff_ac_tree(state.get_cmp()),
                            &mut block,
                            &mut state,
                            jf.cs_from,
                            jf.cs_to,
                        )
                        .context()?;

                        state
                            .check_optimal_eobrun(
                                eob == jf.cs_from,
                                jf.get_huff_ac_codes(state.get_cmp()),
                            )
                            .context()?;
                    } else {
                        // decode zero run block (short routine)
                        decode_eobrun_sa(
                            &mut bit_reader,
                            &mut block,
                            &mut state,
                            jf.cs_from,
                            jf.cs_to,
                        )
                        .context()?;
                    }

                    // copy back to colldata: the decoded values are added, shifted up by
                    // cs_sal, to the coefficients that are already stored
                    for bpos in jf.cs_from..jf.cs_to + 1 {
                        current_block.set_transposed_from_zigzag(
                            usize::from(bpos),
                            current_block
                                .get_transposed_from_zigzag(usize::from(bpos))
                                .wrapping_add(block[usize::from(bpos)] << jf.cs_sal),
                        );
                    }

                    sta = state.next_mcu_pos(jf);
                }
            }
        }

        // if we saw a pad bit at the end of the block, then remember whether they were 1s or 0s. This
        // will be used later on to reconstruct the padding
        bit_reader
            .read_and_verify_fill_bits(&mut reconstruct_info.pad_bit)
            .context()?;

        // verify that we got the right RST code here since the above should do 1 mcu.
        // If we didn't then we won't re-encode the file binary identical so there's no point in continuing
        if sta == JpegDecodeStatus::RestartIntervalExpired {
            bit_reader.verify_reset_code().context()?;

            sta = JpegDecodeStatus::DecodeInProgress;
        }
    }

    Ok(())
}

/// Decodes baseline blocks until the position state reports something other than
/// `DecodeInProgress` (for example the end of a restart interval) or the input runs out.
///
/// Every decoded block has the running DC predictor of its component added to coefficient 0
/// and is then stored, converted from zigzag to transposed order, in `image_data`.
///
/// When `do_handoff` is set at the top of an iteration, the current stream position together
/// with the bit reader overhang, the DC predictors and the current MCU are appended to
/// `partitions` and the flag is cleared. The flag is set again whenever the MCU counter moves
/// onto a multiple of `jpeg_header.mcuh`.
///
/// If the bit reader hits EOF the returned status is forced to `ScanCompleted` and
/// `reconstruct_info.early_eof_encountered` is set.
fn decode_baseline_rst<R: BufRead + Seek>(
    state: &mut JpegPositionState,
    bit_reader: &mut BitReader<R>,
    image_data: &mut [BlockBasedImage],
    do_handoff: &mut bool,
    jpeg_header: &JpegHeader,
    reconstruct_info: &mut ReconstructionInfo,
    partitions: &mut Vec<(u64, RestartSegmentCodingInfo)>,
) -> Result<JpegDecodeStatus> {
    // a baseline scan carries DC and AC data, so both kinds of table are required
    jpeg_header.verify_huffman_table(true, true).context()?;

    // DC predictors used for the differential coding, starting from zero
    let mut dc_predictors = [0i16; 4];
    let mut status = JpegDecodeStatus::DecodeInProgress;

    while status == JpegDecodeStatus::DecodeInProgress {
        if *do_handoff {
            let (bits_already_read, byte_being_read) = bit_reader.overhang();
            let position = bit_reader.stream_position();
            let segment = RestartSegmentCodingInfo::new(
                byte_being_read,
                bits_already_read,
                dc_predictors,
                state.get_mcu(),
                jpeg_header,
            );

            partitions.push((position, segment));
            *do_handoff = false;
        }

        // the position does not change until next_mcu_pos is called further down
        let component = state.get_cmp();
        let dpos = state.get_dpos();

        if !bit_reader.is_eof() {
            // remember the furthest block that was read for this component
            let furthest = &mut reconstruct_info.max_dpos[component];
            *furthest = cmp::max(dpos, *furthest);
        }

        // decode one block, any error is passed straight to the caller
        let mut coefs = [0i16; 64];
        let eob = decode_block_seq(
            bit_reader,
            jpeg_header.get_huff_dc_tree(component),
            jpeg_header.get_huff_ac_tree(component),
            &mut coefs,
        )?;

        let zero_before_eob = eob > 1 && coefs[eob - 1] == 0;
        if zero_before_eob {
            return err_exit_code(
                ExitCode::UnsupportedJpeg,
                "cannot encode image with eob after last 0",
            );
        }

        // undo the differential coding of the DC coefficient
        let dc = coefs[0].wrapping_add(dc_predictors[component]);
        coefs[0] = dc;
        dc_predictors[component] = dc;

        // store the block in transposed raster order
        image_data[component].set_block_data(dpos, AlignedBlock::zigzag_to_transposed(coefs));

        // a handoff is only possible on an MCU row boundary since we can't split any finer
        let mcu_before = state.get_mcu();
        status = state.next_mcu_pos(jpeg_header);
        let mcu_after = state.get_mcu();

        if mcu_after % jpeg_header.mcuh == 0 && mcu_before != mcu_after {
            *do_handoff = true;
        }

        if bit_reader.is_eof() {
            status = JpegDecodeStatus::ScanCompleted;
            reconstruct_info.early_eof_encountered = true;
        }
    }

    Ok(status)
}

/// Sequential block decoding routine.
///
/// Reads the DC coefficient with `dctree` and then AC coefficients with `actree` into `block`
/// (zigzag order). `block` is expected to be zero-initialized since skipped zero runs are not
/// written.
///
/// Returns the position of the end-of-block, which is 64 when no EOB code was read.
///
/// A zero run that would go past the end of the block is only tolerated when the bit reader
/// is at EOF; otherwise an `UnsupportedJpeg` error is returned.
#[inline(never)]
pub(crate) fn decode_block_seq<R: BufRead>(
    bit_reader: &mut BitReader<R>,
    dctree: &HuffTree,
    actree: &HuffTree,
    block: &mut [i16; 64],
) -> Result<usize> {
    // the DC coefficient always comes first
    block[0] = read_dc(bit_reader, dctree)?;

    let mut end_of_block = 64;
    let mut run_overflowed = false;

    // then the AC coefficients
    let mut pos: usize = 1;
    while pos < 64 {
        match read_coef(bit_reader, actree)? {
            None => {
                // EOB
                end_of_block = pos;
                break;
            }
            Some((zero_run, _)) if zero_run + pos >= 64 => {
                run_overflowed = true;
                break;
            }
            Some((zero_run, coef)) => {
                // the skipped positions are already zero, so only the value is written
                pos += zero_run;
                block[pos] = coef;
                pos += 1;
            }
        }
    }

    // once we hit EOF the bitreader just returns long strings of 0s, which is the one case where
    // an overlong run is accepted. Anywhere else it is a JPEG that we cannot recode successfully
    if run_overflowed {
        if !bit_reader.is_eof() {
            return err_exit_code(
                ExitCode::UnsupportedJpeg,
                "If 0run is longer than the block must be truncated",
            );
        }

        block[pos..end_of_block].fill(0);

        if end_of_block > 0 {
            // set the value to something matching the EOB
            block[end_of_block - 1] = 1;
        }
    }

    Ok(end_of_block)
}

/// Decodes the next Huffman symbol by walking `ctree` one bit at a time.
///
/// Node values below 256 are indices of further nodes; the walk ends on the first value of
/// 256 or more. `0xffff` is reported as an illegal code, any other value yields the symbol
/// `value - 256`.
#[inline(always)]
fn next_huff_code<R: BufRead>(bit_reader: &mut BitReader<R>, ctree: &HuffTree) -> Result<u8> {
    let mut cursor: u16 = 0;

    loop {
        if cursor >= 256 {
            break;
        }

        let bit = usize::from(bit_reader.read(1)?);
        cursor = ctree.node[usize::from(cursor)][bit];
    }

    match cursor {
        0xffff => err_exit_code(ExitCode::UnsupportedJpeg, "illegal Huffman code detected"),
        leaf => Ok((leaf - 256) as u8),
    }
}

/// Reads a DC coefficient using `tree`.
///
/// An empty symbol (`read_coef` returning `None`) decodes to 0. A symbol that carries a
/// non-zero run is rejected with an `UnsupportedJpeg` error.
fn read_dc<R: BufRead>(bit_reader: &mut BitReader<R>, tree: &HuffTree) -> Result<i16> {
    match read_coef(bit_reader, tree)? {
        None => Ok(0),
        Some((0, coef)) => Ok(coef),
        Some(_) => err_exit_code(
            ExitCode::UnsupportedJpeg,
            "not expecting non-zero run in DC coefficient",
        ),
    }
}

/// Reads one Huffman symbol plus its literal bits and decodes them into a (run, value) pair.
///
/// Returns `None` when the symbol is 0. Otherwise `lbits(symbol, 4)` is returned as the run
/// and `rbits(symbol, 4)` gives the number of literal bits that are read and passed to `devli`.
#[inline(always)]
fn read_coef<R: BufRead>(
    bit_reader: &mut BitReader<R>,
    tree: &HuffTree,
) -> Result<Option<(usize, i16)>> {
    let hc = loop {
        // look at the bits that are already sitting in the bitreader
        let (peek_value, peek_len) = bit_reader.peek();

        // the lookup table gives the first code in the peeked bits and how long it is
        let (code, code_len) = tree.peek_code[peek_value as usize];
        let code_bits = u32::from(code_len);

        if code_bits <= peek_len {
            // shortcut: the whole code was available, so just consume its bits
            bit_reader.advance(code_bits);
            break code;
        }

        if peek_len >= 8 {
            // the code is longer than the 8 bits the lookup table covers (pretty rare),
            // so fall back to walking the tree
            break next_huff_code(bit_reader, tree)?;
        }

        // fewer than 8 bits were available to peek at, so load more and try again
        bit_reader.fill_register(8)?;
    };

    if hc == 0 {
        return Ok(None);
    }

    // split the symbol into the run and the number of literal bits
    let z = usize::from(lbits(hc, 4));
    let literal_bits = rbits(hc, 4);

    let value = bit_reader.read(u32::from(literal_bits))?;
    Ok(Some((z, devli(literal_bits, value))))
}

/// Progressive AC decoding, first pass.
///
/// Decodes coefficients for the zigzag positions `from..=to` into `block` until the range is
/// exhausted or an EOB / EOB-run code is read. For an EOB run, `state.eobrun` is set to the
/// decoded run length minus one (this block counts as the first of the run).
///
/// Returns the position following the last one that was written. Expects `state.eobrun == 0`.
fn decode_ac_prg_fs<R: BufRead>(
    bit_reader: &mut BitReader<R>,
    actree: &HuffTree,
    block: &mut [i16; 64],
    state: &mut JpegPositionState,
    from: u8,
    to: u8,
) -> Result<u8> {
    debug_assert!(state.eobrun == 0);

    let mut pos = from;
    while pos <= to {
        let symbol = next_huff_code(bit_reader, actree)?;
        let run = lbits(symbol, 4);
        let size = rbits(symbol, 4);

        // a zero size is an EOB or EOB run, unless the run nibble is 15
        if run != 15 && size == 0 {
            let extra = bit_reader.read(u32::from(run))?;

            // take one off straight away to account for the current block
            state.eobrun = decode_eobrun_bits(run, extra) - 1;
            break;
        }

        // run/level combination
        let extra = bit_reader.read(u32::from(size))?;
        if run + pos > to {
            return err_exit_code(ExitCode::UnsupportedJpeg, "run is too long");
        }

        // zero run first, then the value itself
        for _ in 0..run {
            block[usize::from(pos)] = 0;
            pos += 1;
        }
        block[usize::from(pos)] = devli(size, extra);
        pos += 1;
    }

    Ok(pos)
}

/// progressive AC SA decoding routine
fn decode_ac_prg_sa<R: BufRead>(
    bit_reader: &mut BitReader<R>,
    actree: &HuffTree,
    block: &mut [i16; 64],
    state: &mut JpegPositionState,
    from: u8,
    to: u8,
) -> Result<u8> {
    debug_assert!(state.eobrun == 0);

    let mut bpos = from;
    let mut eob = to;

    // decode AC succesive approximation bits
    while bpos <= to {
        // decode next
        let hc = next_huff_code(bit_reader, actree)?;

        let l = lbits(hc, 4);
        let r = rbits(hc, 4);

        // check if code is not an EOB or EOB run
        if (l == 15) || (r > 0) {
            // decode run/level combination
            let mut z = l;
            let s = r;
            let v;

            if s == 0 {
                v = 0;
            } else if s == 1 {
                let n = bit_reader.read(1)?;
                v = if n == 0 { -1 } else { 1 }; // fast decode vli
            } else {
                return err_exit_code(ExitCode::UnsupportedJpeg, "decoding error").context();
            }

            // write zeroes / write correction bits
            loop {
                if block[usize::from(bpos)] == 0 {
                    // skip zeroes / write value
                    if z > 0 {
                        z -= 1;
                    } else {
                        block[usize::from(bpos)] = v;
                        bpos += 1;
                        break;
                    }
                } else {
                    // read correction bit
                    let n = bit_reader.read(1)? as i16;
                    block[usize::from(bpos)] = if block[usize::from(bpos)] > 0 { n } else { -n };
                }

                if bpos >= to {
                    return err_exit_code(ExitCode::UnsupportedJpeg, "decoding error").context();
                }

                bpos += 1;
            }
        } else {
            // decode eobrun
            eob = bpos;
            let s = l;
            let n = bit_reader.read(u32::from(s))?;
            state.eobrun = decode_eobrun_bits(s, n);

            // since we hit EOB, the rest can be done with the zero block decoder
            decode_eobrun_sa(bit_reader, block, state, bpos, to)?;

            break;
        }
    }

    return Ok(eob);
}

/// Successive approximation decoding for a block that is covered by an EOB run.
///
/// No new coefficients appear in such a block: for the positions `from..=to`, each
/// coefficient that is already non-zero is replaced by its correction bit (negated for
/// negative coefficients). One block is then taken off `state.eobrun`, which must be
/// greater than zero on entry.
fn decode_eobrun_sa<R: BufRead>(
    bit_reader: &mut BitReader<R>,
    block: &mut [i16; 64],
    state: &mut JpegPositionState,
    from: u8,
    to: u8,
) -> Result<()> {
    debug_assert!(state.eobrun > 0);

    let first = usize::from(from);
    let end = usize::from(to + 1);

    for idx in first..end {
        let existing = block[idx];
        if existing == 0 {
            continue;
        }

        let bit = bit_reader.read(1)? as i16;
        block[idx] = if existing > 0 { bit } else { -bit };
    }

    // this block has now been consumed from the run
    state.eobrun -= 1;

    Ok(())
}

/// Decodes an EOB run length from its bit count `s` and the extra bits `n`.
/// The encoding drops the most significant bit because it is always 1, so it is
/// restored here at bit position `s`.
fn decode_eobrun_bits(s: u8, n: u16) -> u16 {
    let implicit_msb = 1 << s;
    implicit_msb + n
}

#[cfg(test)]
mod tests {
    use super::*;

    use crate::{
        EnabledFeatures,
        jpeg::jpeg_header::{JpegHeader, ReconstructionInfo},
    };
    use std::io::{BufRead, Seek};

    /// test helper that parses a JPEG from `reader` and hands back the reconstruction info
    /// and JPEG header that were filled in
    fn read_jpeg<R: BufRead + Seek>(
        reader: &mut R,
        enabled_features: &EnabledFeatures,
    ) -> (ReconstructionInfo, JpegHeader) {
        let mut header = JpegHeader::default();
        let mut reconstruction = ReconstructionInfo::default();

        let mut seen_headers = Vec::new();

        let (_image_data, _partitions, _end_scan_position) = read_jpeg_file(
            reader,
            &mut header,
            &mut reconstruction,
            enabled_features,
            |parsed, raw| seen_headers.push((parsed.clone(), raw.to_vec())),
        )
        .unwrap();

        (reconstruction, header)
    }

    /// parses the given test image and checks what ends up in the garbage data, both for the
    /// file as it is and after extra bytes were appended to it
    fn read_garbage_behavior(filename: &str) {
        let mut jpeg_bytes = read_file(filename, ".jpg");
        let mut features = crate::EnabledFeatures::compat_lepton_scalar_read();

        // untouched file: only the EOI marker is reported
        {
            let mut input = std::io::Cursor::new(&jpeg_bytes);
            let (rinfo, _jh) = read_jpeg(&mut input, &features);

            assert_eq!(
                &rinfo.garbage_data[..],
                [0xff, 0xd9],
                "Expected garbage data to match what was written"
            );
        }

        // two extra bytes after the end of the file show up after the EOI marker
        jpeg_bytes.extend_from_slice(b"hi");
        {
            let mut input = std::io::Cursor::new(&jpeg_bytes);
            let (rinfo, _jh) = read_jpeg(&mut input, &features);

            assert_eq!(
                &rinfo.garbage_data[..],
                [0xff, 0xd9, b'h', b'i'],
                "Expected garbage data to match what was written"
            );
        }

        // with stop_reading_at_eoi the extra bytes are neither consumed nor reported
        features.stop_reading_at_eoi = true;
        let mut input = std::io::Cursor::new(&jpeg_bytes);
        let (rinfo, _jh) = read_jpeg(&mut input, &features);

        assert_eq!(input.position(), jpeg_bytes.len() as u64 - 2);

        assert_eq!(
            &rinfo.garbage_data[..],
            [0xff, 0xd9],
            "Expected garbage data to match what was written when stop_reading_at_eoi is true"
        );
    }

    #[test]
    fn read_garbage_behavior_progressive() {
        read_garbage_behavior("iphoneprogressive");
    }

    #[test]
    fn read_garbage_behavior_baseline() {
        read_garbage_behavior("iphone");
    }

    #[test]
    fn test_benchmark_read_block() {
        let mut run = benchmarks::benchmark_read_block();
        (0..10).for_each(|_| run());
    }

    #[test]
    fn test_benchmark_read_jpeg() {
        let mut run = benchmarks::benchmark_read_jpeg();
        (0..10).for_each(|_| run());
    }
}

#[cfg(any(test, feature = "micro_benchmark"))]
pub mod benchmarks {
    use std::io::Cursor;

    use crate::{
        EnabledFeatures,
        helpers::read_file,
        jpeg::{
            bit_reader::BitReader,
            bit_writer::BitWriter,
            block_based_image::AlignedBlock,
            jpeg_header::{
                HuffTree, JpegHeader, ReconstructionInfo, generate_huff_table_from_distribution,
            },
            jpeg_read::{decode_block_seq, read_jpeg_file},
            jpeg_write::encode_block_seq,
        },
    };

    /// loads the "android" test image once and returns a closure that runs `read_jpeg_file`
    /// over it on every call. Used for micro-benchmarking the JPEG read performance.
    #[inline(never)]
    pub fn benchmark_read_jpeg() -> Box<dyn FnMut()> {
        let jpeg_bytes = read_file("android", ".jpg");

        let run = move || {
            let mut input = std::io::Cursor::new(&jpeg_bytes);
            let features = EnabledFeatures::compat_lepton_vector_write();

            let mut header = JpegHeader::default();
            let mut reconstruction = ReconstructionInfo::default();

            let (image_data, partitions, end_scan) = read_jpeg_file(
                &mut input,
                &mut header,
                &mut reconstruction,
                &features,
                |_, _| {},
            )
            .unwrap();

            // keep the optimizer from discarding the work
            std::hint::black_box((image_data, partitions, end_scan));
        };

        Box::new(run)
    }

    /// encodes a single block up front and returns a closure that decodes it again on every
    /// call. Used for micro-benchmarking the performance of decoding a single block.
    #[inline(never)]
    pub fn benchmark_read_block() -> Box<dyn FnMut()> {
        // a weird distribution to test the huffman encoding for corner cases:
        // descending weights for DC and the same weight everywhere for AC
        let mut dcdistribution = [0; 256];
        for (i, weight) in dcdistribution.iter_mut().enumerate() {
            *weight = 256 - i;
        }
        let dctbl = generate_huff_table_from_distribution(&dcdistribution);

        let acdistribution = [1 + 256; 256];
        let actbl = generate_huff_table_from_distribution(&acdistribution);

        // three groups of non-zero coefficients, everything else keeps its default value
        let mut block = AlignedBlock::default();
        for (i, coef) in block.get_block_mut().iter_mut().enumerate() {
            *coef = match i {
                0..=9 => i as i16 * 13,
                30..=49 => -(i as i16) * 7,
                50..=51 => i as i16 * 3,
                _ => continue,
            };
        }

        let mut bitwriter = BitWriter::new(Vec::with_capacity(1024));
        encode_block_seq(&mut bitwriter, &dctbl, &actbl, &block);
        let buffer = bitwriter.detach_buffer();

        let dctree = HuffTree::construct_hufftree(&dctbl, true).unwrap();
        let actree = HuffTree::construct_hufftree(&actbl, false).unwrap();

        let run = move || {
            let mut bitreader = BitReader::new(Cursor::new(&buffer));
            let mut outblock = AlignedBlock::default();

            decode_block_seq(
                &mut bitreader,
                &dctree,
                &actree,
                &mut outblock.get_block_mut(),
            )
            .unwrap();

            // keep the optimizer from discarding the work
            std::hint::black_box(outblock);
        };

        Box::new(run)
    }
}


================================================
FILE: lib/src/jpeg/jpeg_write.rs
================================================
/*---------------------------------------------------------------------------------------------
 *  Copyright (c) Microsoft Corporation. All rights reserved.
 *  Licensed under the Apache License, Version 2.0. See LICENSE.txt in the project root for license information.
 *  This software incorporates material from third parties. See NOTICE.txt for details.
 *--------------------------------------------------------------------------------------------*/

/*
Copyright (c) 2006...2016, Matthias Stirner and HTW Aalen University
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:

1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.

2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

use bytemuck::{cast, cast_ref};
use wide::{CmpEq, i16x16};

use crate::consts::{JpegDecodeStatus, JpegType};
use crate::helpers::u16_bit_length;
use crate::lepton_error::{AddContext, ExitCode, err_exit_code};

use crate::Result;

use super::bit_writer::BitWriter;
use super::block_based_image::{AlignedBlock, BlockBasedImage};
use super::jpeg_code;
use super::jpeg_header::{HuffCodes, JpegHeader, ReconstructionInfo, RestartSegmentCodingInfo};
use super::jpeg_position_state::JpegPositionState;
use super::row_spec::RowSpec;

/// Re-encodes JPEG scan data into an in-memory buffer as rows are handed to it
/// through `process_row`.
pub struct JpegIncrementalWriter<'a> {
    /// DC values carried between calls to `recode_one_mcu_row`. Starts from the values in the
    /// restart segment info if one was supplied, otherwise from zero.
    last_dc: [i16; 4],
    /// bit writer that accumulates the encoded output
    huffw: BitWriter,
    /// reconstruction info passed on to `recode_one_mcu_row`
    reconstruction_info: &'a ReconstructionInfo,
    /// header of the JPEG that is being written
    jpeg_header: &'a JpegHeader,
    /// initial capacity of the output buffer, also the amount passed to `ensure_space`
    /// before an MCU row is encoded
    capacity: usize,
    /// index of the scan being written, passed on to `recode_one_mcu_row`
    current_scan_index: usize,
}

impl<'a> JpegIncrementalWriter<'a> {
    /// Creates a writer whose output buffer starts out with room for `capacity` bytes.
    ///
    /// When `rinfo` is given, the bit writer is primed with its overhang byte and bit count
    /// and the DC values start from `rinfo.last_dc`. Without it the DC values start at zero.
    pub fn new(
        capacity: usize,
        reconstruction_info: &'a ReconstructionInfo,
        rinfo: Option<&RestartSegmentCodingInfo>,
        jpeg_header: &'a JpegHeader,
        current_scan_index: usize,
    ) -> JpegIncrementalWriter<'a> {
        let mut huffw = BitWriter::new(Vec::with_capacity(capacity));
        let mut last_dc = [0i16; 4];

        if let Some(segment) = rinfo {
            huffw.reset_from_overhang_byte_and_num_bits(
                segment.overhang_byte,
                u32::from(segment.num_overhang_bits),
            );
            last_dc = segment.last_dc;
        }

        Self {
            last_dc,
            huffw,
            reconstruction_info,
            jpeg_header,
            capacity,
            current_scan_index,
        }
    }

    /// Number of bytes reported as buffered by the bit writer.
    pub fn amount_buffered(&self) -> usize {
        self.huffw.amount_buffered()
    }

    /// Encodes the MCU row that `cur_row` completes, if it completes one.
    ///
    /// Rows that do not complete an MCU row are ignored and `Ok(false)` is returned. Otherwise
    /// the result of `recode_one_mcu_row` is returned.
    pub fn process_row(
        &mut self,
        cur_row: &RowSpec,
        image_data: &[BlockBasedImage],
    ) -> Result<bool> {
        if !cur_row.last_row_to_complete_mcu {
            return Ok(false);
        }

        self.huffw.ensure_space(self.capacity);

        // first MCU of the row that is about to be written
        let first_mcu = cur_row.mcu_row_index * self.jpeg_header.mcuh.get();

        recode_one_mcu_row(
            &mut self.huffw,
            first_mcu,
            &mut self.last_dc,
            image_data,
            self.jpeg_header,
            self.reconstruction_info,
            self.current_scan_index,
        )
        .context()
    }

    /// Detaches the output buffer from the bit writer and returns it.
    pub fn detach_buffer(&mut self) -> Vec<u8> {
        self.huffw.detach_buffer()
    }
}

/// Writes a complete scan in one go, as opposed to feeding individual rows to a
/// `JpegIncrementalWriter`. Unlike the row range version, this supports progressive encoding.
///
/// Returns the encoded bytes of the scan.
pub fn jpeg_write_entire_scan(
    image_data: &[BlockBasedImage],
    jpeg_header: &JpegHeader,
    rinfo: &ReconstructionInfo,
    current_scan_index: usize,
) -> Result<Vec<u8>> {
    let mut writer =
        JpegIncrementalWriter::new(128 * 1024, rinfo, None, jpeg_header, current_scan_index);

    let max_coded_heights = rinfo.truncate_components.get_max_coded_heights();

    // walk the row specs in order until one of them reports that we are done
    for decode_index in 0.. {
        let row = RowSpec::get_row_spec_from_index(
            decode_index,
            image_data,
            jpeg_header.mcuv.get(),
            &max_coded_heights,
        );

        if row.done {
            break;
        }

        if row.skip {
            continue;
        }

        // process_row returns true once nothing further should be written
        if writer.process_row(&row, image_data)? {
            break;
        }
    }

    Ok(writer.detach_buffer())
}

/// Re-encodes the blocks starting at MCU index `mcu` until the end of that MCU row
/// (or the end of the scan), writing the entropy-coded bits into `huffw`.
///
/// Handles sequential scans as well as progressive DC/AC scans (first and later
/// successive-approximation stages), selected by `jf.jpeg_type`, `jf.cs_to` and
/// `jf.cs_sah`. `lastdc` carries the per-component DC predictors across calls and
/// is zeroed whenever a restart interval expires.
///
/// Returns `Ok(true)` when the scan is completed and `Ok(false)` when only the
/// MCU row was finished. Errors from the progressive AC encoders are propagated.
#[inline(never)]
fn recode_one_mcu_row(
    huffw: &mut BitWriter,
    mcu: u32,
    lastdc: &mut [i16],
    framebuffer: &[BlockBasedImage],
    jf: &JpegHeader,
    rinfo: &ReconstructionInfo,
    current_scan_index: usize,
) -> Result<bool> {
    let mut state = JpegPositionState::new(jf, mcu);

    // number of restart markers counted up to this position; used below both to pick
    // the RSTn index and to compare against the recorded per-scan marker count
    let mut cumulative_reset_markers = state.get_cumulative_reset_markers(jf);

    let mut end_of_row = false;
    // correction bits collected by the progressive AC refinement pass until they can be flushed
    let mut correction_bits = Vec::new();

    // JPEG imagedata encoding routines
    while !end_of_row {
        // (re)set status
        let mut sta = JpegDecodeStatus::DecodeInProgress;

        // ---> sequential interleaved encoding <---
        while sta == JpegDecodeStatus::DecodeInProgress {
            let current_block = framebuffer[state.get_cmp()].get_block(state.get_dpos());

            // remembered so we can detect below when the position advanced to a new MCU
            let old_mcu = state.get_mcu();

            if jf.jpeg_type == JpegType::Sequential {
                // unzigzag
                let mut block = current_block.zigzag_from_transposed();

                // diff coding for dc
                let dc = block.get_block()[0];
                block.get_block_mut()[0] -= lastdc[state.get_cmp()];
                lastdc[state.get_cmp()] = dc;

                // encode block
                encode_block_seq(
                    huffw,
                    jf.get_huff_dc_codes(state.get_cmp()),
                    jf.get_huff_ac_codes(state.get_cmp()),
                    &block,
                );

                sta = state.next_mcu_pos(&jf);
            } else if jf.cs_to == 0 {
                // ---> progressive DC encoding <---
                if jf.cs_sah == 0 {
                    // ---> successive approximation first stage <---

                    // diff coding & bitshifting for dc
                    let tmp = current_block.get_transposed_from_zigzag(0) >> jf.cs_sal;
                    let v = tmp - lastdc[state.get_cmp()];
                    lastdc[state.get_cmp()] = tmp;

                    // encode dc
                    write_coef(
                        huffw,
                        v < 0,
                        v.unsigned_abs(),
                        0,
                        jf.get_huff_dc_codes(state.get_cmp()),
                    );
                } else {
                    // ---> successive approximation later stage <---

                    // fetch bit from current bitplane
                    huffw.write(
                        ((current_block.get_transposed_from_zigzag(0) >> jf.cs_sal) & 1) as u32,
                        1,
                    );
                }

                sta = state.next_mcu_pos(jf);
            } else {
                // ---> progressive AC encoding <---

                // copy from coefficients we need and shift right by cs_sal
                // (division rounding towards zero, see div_pow2)
                let mut block = [0i16; 64];
                for bpos in jf.cs_from..jf.cs_to + 1 {
                    block[usize::from(bpos)] = div_pow2(
                        current_block.get_transposed_from_zigzag(usize::from(bpos)),
                        jf.cs_sal,
                    );
                }

                if jf.cs_sah == 0 {
                    // ---> successive approximation first stage <---

                    // encode block
                    encode_ac_prg_fs(
                        huffw,
                        jf.get_huff_ac_codes(state.get_cmp()),
                        &block,
                        &mut state,
                        jf.cs_from,
                        jf.cs_to,
                    )
                    .context()?;

                    sta = state.next_mcu_pos(jf);

                    // encode remaining eobrun (iff end of mcu or scan)
                    if sta != JpegDecodeStatus::DecodeInProgress {
                        encode_eobrun(huffw, jf.get_huff_ac_codes(state.get_cmp()), &mut state);
                    }
                } else {
                    // ---> successive approximation later stage <---

                    // encode block
                    encode_ac_prg_sa(
                        huffw,
                        jf.get_huff_ac_codes(state.get_cmp()),
                        &block,
                        &mut state,
                        jf.cs_from,
                        jf.cs_to,
                        &mut correction_bits,
                    )
                    .context()?;

                    sta = state.next_mcu_pos(jf);

                    // encode remaining eobrun and correction bits (iff end of mcu or scan)
                    if sta != JpegDecodeStatus::DecodeInProgress {
                        encode_eobrun(huffw, jf.get_huff_ac_codes(state.get_cmp()), &mut state);

                        // encode remaining correction bits
                        encode_crbits(huffw, &mut correction_bits);
                    }
                }
            }

            // we moved on to a new MCU and that MCU starts a new MCU row
            if old_mcu != state.get_mcu() && state.get_mcu() % jf.mcuh == 0 {
                end_of_row = true;
                if sta == JpegDecodeStatus::DecodeInProgress {
                    // completed only MCU aligned row, not reset interval so don't emit anything special
                    return Ok(false);
                }
            }
        }

        // pad huffman writer (with the recorded pad bit, defaulting to 0 if none was recorded)
        huffw.pad(rinfo.pad_bit.unwrap_or(0));

        assert!(
            huffw.has_no_remainder(),
            "shouldnt have a remainder after padding"
        );

        // evaluate status
        if sta == JpegDecodeStatus::ScanCompleted {
            return Ok(true); // leave decoding loop, everything is done here
        } else {
            assert!(sta == JpegDecodeStatus::RestartIntervalExpired);

            // restart interval expired: emit a restart marker if applicable
            if jf.rsti > 0 {
                // only write the marker if no marker count was recorded, or if we have
                // not yet written as many markers as were recorded for this scan
                if rinfo.rst_cnt.len() == 0
                    || (!rinfo.rst_cnt_set)
                    || cumulative_reset_markers < rinfo.rst_cnt[current_scan_index]
                {
                    // restart markers cycle through RST0..RST7
                    let rst = jpeg_code::RST0 + (cumulative_reset_markers & 7) as u8;

                    huffw.write_byte_unescaped(0xFF);
                    huffw.write_byte_unescaped(rst);
                    cumulative_reset_markers += 1;
                }

                // (re)set rst wait counter
                state.reset_rstw(jf);

                // (re)set last DCs for diff coding
                for i in 0..lastdc.len() {
                    lastdc[i] = 0;
                }
            }
        }
    }

    Ok(false)
}

/// Encodes a single block of a sequential (baseline) scan: the DC coefficient using
/// `dctbl`, followed by the AC coefficients as run-length/size symbols using `actbl`.
///
/// `block[0]` is written as-is with the DC table, so the caller is expected to have
/// applied DC differencing already. An EOB symbol is emitted unless the last
/// coefficient written was at position 63.
#[inline(never)]
pub(crate) fn encode_block_seq(
    huffw: &mut BitWriter,
    dctbl: &HuffCodes,
    actbl: &HuffCodes,
    block: &AlignedBlock,
) {
    // using SIMD instructions, construct a 64 bit mask of all
    // the non-zero coefficients in the block. This can be used
    // to efficiently skip zero blocks using trailing zero scan.
    let block_simd: &[i16x16; 4] = cast_ref(block.get_block());

    // at this point a set bit marks a ZERO coefficient; the mask is inverted further down
    let mut mask = (block_simd[0].simd_eq(i16x16::ZERO).to_bitmask() as u64)
        | ((block_simd[1].simd_eq(i16x16::ZERO).to_bitmask() as u64) << 16)
        | ((block_simd[2].simd_eq(i16x16::ZERO).to_bitmask() as u64) << 32)
        | ((block_simd[3].simd_eq(i16x16::ZERO).to_bitmask() as u64) << 48);

    // abs value of all coefficients. Super fast to calculate here
    // for everything, even if it is zero and not needed.
    let abs_value: [u16; 64] = cast(block_simd.map(|x| x.abs()));
    // sign of every coefficient, obtained by shifting the sign bit across the lane
    let is_neg: [u16; 64] = cast(block_simd.map(|x| x >> 15));

    // encode DC
    // For a negative coefficient all bits of is_neg are 1s, so any single bit can be
    // tested. Testing bit 8 (& 256) is a bit faster since it allows the optimizer
    // to convert the << 8 (inside write_coef) to a single AND.
    write_coef(huffw, (is_neg[0] & 256) != 0, abs_value[0], 0, dctbl);

    // flip the bits since the comparison above marked the zero coefficients
    mask = !mask;

    // already processed DC coefficient, so skip it
    mask >>= 1;
    // bpos is the position of the next coefficient that has not been consumed yet
    let mut bpos = 1;

    // encode ACs
    while mask != 0 {
        let mut zeros = mask.trailing_zeros();

        if zeros > 15 {
            // JPEG encoding only supports 15 zeros in a row. Most implementations
            // write 0xf0 codes for 16 zeros in a row, but we don't need
            // a special case since write_coef with a zero coefficient
            // and a 0xf zero count will write the correct code.
            zeros = 15;
        }

        // consume the run of zeros plus the coefficient that terminates it
        bpos += zeros + 1;
        mask >>= zeros + 1;

        write_coef(
            huffw,
            (is_neg[(bpos - 1) as usize] & 256) != 0, // a bit faster since it allows the optimizer to convert << 8 (inside this function) to a single AND
            abs_value[(bpos - 1) as usize],
            zeros,
            actbl,
        );

        if bpos >= 64 {
            // if we get all 64 coefficients, we're done and don't need an EOB
            return;
        }
    }

    // write EOB since we didn't get all 64 coefficients
    huffw.write(actbl.c_val[0x00].into(), actbl.c_len[0x00].into());
}

/// Writes one coefficient: the Huffman code for its (zero run, bit length) symbol
/// immediately followed by the coefficient bits, emitted as a single write.
///
/// `abs_coef` is the magnitude of the coefficient, `is_neg` its sign and `z` the
/// number of zero coefficients preceding it.
#[inline(always)]
fn write_coef(huffw: &mut BitWriter, is_neg: bool, abs_coef: u16, z: u32, tbl: &HuffCodes) {
    let magnitude = u32::from(abs_coef);

    // number of bits needed to represent the magnitude (0 for a zero coefficient)
    let bit_len = u32::BITS - magnitude.leading_zeros();

    // run/size symbol: zero run in the high nibble, bit length in the low nibble
    let symbol = (z << 4) | bit_len;

    // JPEG stores the coefficient with an implied sign bit: once the bit length is
    // known, the sign can be inferred from the stored value.
    //
    // Eg, for a bit length of 4,
    //
    // 0..7 are negative (corresponding to -15..-8)
    // 8..15 are positive
    //
    // For a negative number this is the magnitude XOR ((1 << bit length) - 1). That
    // adjustment is stored in c_val_shift_s (together with the huffman code, judging
    // by the combined write below), with the entries for negative values placed 256
    // slots after the positive ones, so all that is left to do is XOR in the magnitude.
    let sign_offset = u32::from(is_neg) << 8;
    let code_and_value = tbl.c_val_shift_s[(symbol | sign_offset) as usize] ^ magnitude;

    // length of the huffman code plus the coefficient bits
    let total_bits = u32::from(tbl.c_len_plus_s[symbol as usize]);

    // huffman code and coefficient bits go out in one write
    huffw.write(code_and_value, total_bits);
}

/// Progressive AC encoding, first pass of successive approximation.
///
/// Encodes the coefficients `from..=to` of `block`. Blocks that end in zeros extend
/// the pending end-of-band run in `state.eobrun` instead of writing anything
/// immediately; the run is flushed once it reaches `actbl.max_eob_run` or when a
/// nonzero coefficient is encountered.
///
/// Fails with `ExitCode::UnsupportedJpeg` if an EOB run is required but the table
/// has no EOB run symbol.
fn encode_ac_prg_fs(
    huffw: &mut BitWriter,
    actbl: &HuffCodes,
    block: &[i16; 64],
    state: &mut JpegPositionState,
    from: u8,
    to: u8,
) -> Result<()> {
    // zeros seen since the last nonzero coefficient
    let mut zero_run = 0;

    for bpos in from..to + 1 {
        let coef = block[usize::from(bpos)];

        if coef == 0 {
            zero_run += 1;
            continue;
        }

        // a nonzero coefficient terminates any pending EOB run
        encode_eobrun(huffw, actbl, state);

        // runs of 16 or more zeros are written as ZRL (0xF0) symbols
        while zero_run >= 16 {
            huffw.write(actbl.c_val[0xF0].into(), actbl.c_len[0xF0].into());
            zero_run -= 16;
        }

        // the coefficient itself together with the remaining zero run
        write_coef(huffw, coef < 0, coef.unsigned_abs(), zero_run, actbl);
        zero_run = 0;
    }

    // band ended on a nonzero coefficient: no EOB needed
    if zero_run == 0 {
        return Ok(());
    }

    if actbl.max_eob_run == 0 {
        return err_exit_code(
            ExitCode::UnsupportedJpeg,
            "there must be at least one EOB symbol run in the huffman table to encode EOBs",
        )
        .context();
    }

    state.eobrun += 1;

    // flush the run as soon as it reaches the longest length the table can express
    if state.eobrun == actbl.max_eob_run {
        encode_eobrun(huffw, actbl, state);
    }

    Ok(())
}

/// Progressive AC encoding, subsequent (refinement) pass of successive approximation.
///
/// Encodes the coefficients `from..=to` of `block`:
/// * coefficients equal to +1/-1 are newly nonzero in this pass and are written with
///   their zero run,
/// * every other nonzero coefficient only contributes a correction bit (its lowest
///   bit), which is queued in `correction_bits` and flushed after the next symbol,
/// * everything after the last newly nonzero coefficient is covered by extending the
///   pending end-of-band run in `state.eobrun`.
///
/// Fails with `ExitCode::UnsupportedJpeg` if an EOB run is required but the table
/// has no EOB run symbol.
fn encode_ac_prg_sa(
    huffw: &mut BitWriter,
    actbl: &HuffCodes,
    block: &[i16; 64],
    state: &mut JpegPositionState,
    from: u8,
    to: u8,
    correction_bits: &mut Vec<u8>,
) -> Result<()> {
    // Position just past the last newly nonzero coefficient, or `from` if there is
    // none. Searching a reversed range (instead of counting an index down by hand)
    // cannot underflow when `from` is 0.
    let eob = (from..=to)
        .rev()
        .find(|&bpos| matches!(block[usize::from(bpos)], 1 | -1))
        .map_or(from, |bpos| bpos + 1);

    // encode eobrun if needed
    if (eob > from) && state.eobrun > 0 {
        encode_eobrun(huffw, actbl, state);

        encode_crbits(huffw, correction_bits);
    }

    // encode AC
    let mut z = 0;
    for bpos in from..eob {
        let tmp = block[usize::from(bpos)];
        // if zero is encountered
        if tmp == 0 {
            z += 1; // increment zero counter
            if z == 16 {
                // write zeroes if needed
                huffw.write(actbl.c_val[0xF0].into(), actbl.c_len[0xF0].into());

                encode_crbits(huffw, correction_bits);
                z = 0;
            }
        }
        // if a newly nonzero coefficient is encountered
        else if (tmp == 1) || (tmp == -1) {
            // vli encode
            write_coef(huffw, tmp < 0, tmp.unsigned_abs(), z, actbl);

            // write correction bits
            encode_crbits(huffw, correction_bits);
            // reset zeroes
            z = 0;
        } else {
            // coefficient was already nonzero: only queue its correction bit
            correction_bits.push((tmp & 0x1) as u8);
        }
    }

    // fast processing after eob: only correction bits can occur here
    for bpos in eob..=to {
        let tmp = block[usize::from(bpos)];
        if tmp != 0 {
            correction_bits.push((tmp & 0x1) as u8);
        }
    }

    // check eob, increment eobrun if needed
    if eob <= to {
        if actbl.max_eob_run == 0 {
            return err_exit_code(
                ExitCode::UnsupportedJpeg,
                "there must be at least one EOB symbol run in the huffman table to encode EOBs",
            )
            .context();
        }

        state.eobrun += 1;

        // flush the run as soon as it reaches the longest length the table can express
        if state.eobrun == actbl.max_eob_run {
            encode_eobrun(huffw, actbl, state);

            encode_crbits(huffw, correction_bits);
        }
    }

    Ok(())
}

/// Flushes the pending end-of-band run, if any.
///
/// The run is written as a huffman code whose high 4 bits hold the bit length of the
/// run minus one, followed by the run itself with its leading 1 bit removed.
/// `state.eobrun` is reset to zero afterwards. Does nothing when no run is pending.
fn encode_eobrun(huffw: &mut BitWriter, actbl: &HuffCodes, state: &mut JpegPositionState) {
    let run = state.eobrun;
    if run == 0 {
        return;
    }

    debug_assert!(run <= actbl.max_eob_run);

    // number of bits that follow the huffman code
    let s = u16_bit_length(run) - 1;
    let extra_bits = encode_eobrun_bits(s, run);

    let symbol = usize::from(s << 4);
    huffw.write(actbl.c_val[symbol].into(), actbl.c_len[symbol].into());
    huffw.write(u32::from(extra_bits), u32::from(s));

    state.eobrun = 0;
}

/// Writes every queued correction bit as a single-bit value, in queue order,
/// and leaves `correction_bits` empty.
fn encode_crbits(huffw: &mut BitWriter, correction_bits: &mut Vec<u8>) {
    for &bit in correction_bits.iter() {
        huffw.write(u32::from(bit), 1);
    }
    correction_bits.clear();
}

/// Divides `v` by `2^p`, rounding towards zero.
///
/// A plain arithmetic shift rounds towards negative infinity, so negative values are
/// biased by `2^p - 1` first. The calculation is carried out in 32 bits so that the
/// bias cannot overflow for large `p`; any `p` of 16 or more yields 0, since every
/// `i16` has a magnitude below `2^16`.
fn div_pow2(v: i16, p: u8) -> i16 {
    // shifting by 16 already produces 0 for every i16, so larger shifts are clamped
    let shift = u32::from(p).min(16);
    let wide = i32::from(v);
    let bias = if wide < 0 { (1i32 << shift) - 1 } else { 0 };

    // the quotient never exceeds the magnitude of `v`, so narrowing cannot truncate
    ((wide + bias) >> shift) as i16
}

/// Computes the bits that follow the huffman code of an EOB run of length `v`.
///
/// `s` is the position of the highest set bit of `v`; since that bit is always 1
/// it is implied by the code and removed from the value.
fn encode_eobrun_bits(s: u8, v: u16) -> u16 {
    let implied_top_bit = 1u16 << s;
    v - implied_top_bit
}

/// Unit tests for the JPEG writer: block-level encode/decode round trips with fixed
/// expected byte sequences, whole-file round trips, and smoke tests of the benchmarks.
#[cfg(test)]
mod tests {
    use std::io::Cursor;

    use super::*;

    use crate::{
        helpers::read_file,
        jpeg::{
            bit_reader::BitReader,
            bit_writer::BitWriter,
            block_based_image::AlignedBlock,
            jpeg_header::{HuffTree, generate_huff_table_from_distribution},
            jpeg_read::decode_block_seq,
        },
    };

    /// roundtrips a block through the encoder and decoder and checks that the output matches the input.
    /// The encoded bytes are also compared against `expected` to detect any change in the encoding.
    fn round_trip_block(block: &AlignedBlock, expected: &[u8]) {
        let mut bitwriter = BitWriter::new(Vec::with_capacity(1024));

        // create a weird distribution to test the huffman encoding for corner cases
        let mut dcdistribution = [0; 256];
        for i in 0..256 {
            dcdistribution[i] = 256 - i;
        }
        let dctbl = generate_huff_table_from_distribution(&dcdistribution);

        // the AC table uses the same frequency for every symbol
        let mut acdistribution = [0; 256];
        for i in 0..256 {
            acdistribution[i] = 1 + 256;
        }
        let actbl = generate_huff_table_from_distribution(&acdistribution);

        encode_block_seq(&mut bitwriter, &dctbl, &actbl, block);

        bitwriter.pad(0);

        let buf = bitwriter.detach_buffer();
        assert_eq!(buf, expected);

        // decode what we just wrote and make sure we get the original block back
        let mut bitreader = BitReader::new(Cursor::new(&buf));

        let mut block_decoded = [0i16; 64];
        decode_block_seq(
            &mut bitreader,
            &HuffTree::construct_hufftree(&dctbl, true).unwrap(),
            &HuffTree::construct_hufftree(&actbl, true).unwrap(),
            &mut block_decoded,
        )
        .unwrap();

        assert_eq!(&block_decoded, block.get_block());
    }

    /// block containing the values -32..=31, which includes a single zero coefficient
    #[test]
    fn test_encode_block_seq() {
        let mut block = AlignedBlock::default();
        for i in 0..64 {
            block.get_block_mut()[i] = (i as i16) - 32;
        }

        let expected = [
            152, 252, 176, 37, 131, 44, 41, 97, 203, 18, 88, 178, 198, 150, 60, 178, 37, 147, 44,
            169, 101, 203, 50, 89, 178, 206, 150, 126, 176, 107, 14, 177, 107, 30, 178, 107, 46,
            179, 107, 56, 136, 17, 34, 40, 69, 128, 128, 47, 120, 250, 3, 0, 226, 48, 70, 136, 225,
            31, 173, 26, 211, 173, 90, 215, 173, 154, 219, 173, 218, 223, 45, 9, 104, 203, 74, 90,
            114, 212, 150, 172, 181, 165, 175, 45, 137, 108, 203, 106, 91, 114, 220, 150, 236, 183,
            165, 190,
        ];

        round_trip_block(&block, &expected);
    }

    /// make sure we encode magnitudes correctly: every power of two from 2^0 to 2^14,
    /// both positive and negative
    #[test]
    fn test_encode_block_magnitude() {
        let mut block = AlignedBlock::default();
        for i in 0..15 {
            block.get_block_mut()[i] = (1u16 << i) as i16;
        }
        for i in 0..15 {
            block.get_block_mut()[i + 20] = -((1u16 << i) as i16);
        }

        let expected = [
            165, 1, 132, 102, 180, 75, 64, 138, 6, 248, 8, 16, 27, 208, 13, 120, 2, 122, 0, 75,
            192, 4, 60, 0, 8, 224, 0, 109, 128, 1, 250, 1, 68, 94, 179, 203, 60, 137, 246, 247,
            232, 15, 251, 207, 253, 119, 254, 121, 255, 0, 203, 191, 252, 59, 255, 0, 200, 223,
            255, 0, 109, 127, 254, 0,
        ];

        round_trip_block(&block, &expected);
    }

    /// test encoding with gaps to test zero counting
    #[test]
    fn test_encode_block_zero_runs() {
        let mut block = AlignedBlock::default();

        for i in 0..10 {
            block.get_block_mut()[i] = i as i16;
        }
        for i in 30..50 {
            block.get_block_mut()[i] = -(i as i16);
        }
        for i in 50..52 {
            block.get_block_mut()[i] = i as i16;
        }

        let expected = [
            169, 223, 1, 128, 113, 24, 35, 68, 112, 143, 214, 141, 105, 167, 249, 12, 176, 8, 159,
            34, 120, 137, 210, 39, 8, 155, 34, 104, 137, 146, 38, 8, 151, 34, 88, 137, 82, 37, 8,
            147, 34, 72, 137, 18, 36, 8, 143, 34, 56, 139, 34, 44, 192, 0,
        ];

        round_trip_block(&block, &expected);
    }

    /// test the longest possible zero run: only the very last coefficient is nonzero
    #[test]
    fn test_encode_block_long_zero_cnt() {
        let mut block = AlignedBlock::default();

        block.get_block_mut()[63] = 1;

        let expected = [169, 79, 79, 79, 33];

        round_trip_block(&block, &expected);
    }

    /// test a block that consists only of zeros
    #[test]
    fn test_encode_block_seq_zero() {
        let block = AlignedBlock::default();

        let expected = [168, 0];

        round_trip_block(&block, &expected);
    }

    /// Parses the JPEG provided by `reader` and writes it back out from the parsed
    /// coefficients, returning the reconstructed file. Single scan files are written
    /// partition by partition with the baseline row range writer, everything else is
    /// written scan by scan.
    fn roundtrip_jpeg<R: std::io::BufRead + std::io::Seek>(
        reader: &mut R,
        enabled_features: &crate::EnabledFeatures,
    ) -> Vec<u8> {
        use crate::consts::*;
        use crate::jpeg::jpeg_header::{JpegHeader, ReconstructionInfo};
        use crate::jpeg::jpeg_read::read_jpeg_file;

        let mut jpeg_header = JpegHeader::default();
        let mut rinfo = ReconstructionInfo::default();

        // header + raw header bytes for every callback invocation, used to rebuild multi-scan files
        let mut headers = Vec::new();

        let (image_data, partitions, end_scan_position) = read_jpeg_file(
            reader,
            &mut jpeg_header,
            &mut rinfo,
            &enabled_features,
            |header, raw_header| {
                headers.push((header.clone(), raw_header.to_vec()));
            },
        )
        .unwrap();

        let mut reconstructed = Vec::new();
        reconstructed.extend_from_slice(&SOI);

        if jpeg_header.is_single_scan() {
            // sequential JPEG consists of a single header + scan
            reconstructed.extend_from_slice(rinfo.raw_jpeg_header.as_slice());

            let mut prev_offset = 0;
            for (offset, coding_info) in partitions {
                // the distance between consecutive offsets is passed as the expected encoded length
                let mut r = jpeg_write_baseline_row_range(
                    (offset - prev_offset) as usize,
                    &coding_info,
                    &image_data,
                    &jpeg_header,
                    &rinfo,
                )
                .unwrap();

                reconstructed.append(&mut r);

                prev_offset = offset;
            }

            assert_eq!(reconstructed.len(), end_scan_position as usize);

            reconstructed.extend_from_slice(&EOI);
        } else {
            // progressive JPEG consists of header + scan, header + scan, etc
            let mut scnc = 0;

            for (jh, raw_header) in headers {
                // progressive JPEG consists of headers + scan
                reconstructed.extend_from_slice(&raw_header);

                let scan = jpeg_write_entire_scan(&image_data, &jh, &rinfo, scnc).unwrap();

                reconstructed.extend_from_slice(&scan);

                // advance to next scan
                scnc += 1;
            }

            reconstructed.extend_from_slice(&EOI);

            // progressive includes EOI in the scan
            assert_eq!(reconstructed.len(), end_scan_position as usize);
        }

        reconstructed
    }

    /// reads a JPEG file and writes it back out using the baseline encoder
    /// to verify that the output is exactly the same as the original file.
    #[test]
    fn roundtrip_baseline_jpeg() {
        let file = read_file("iphone", ".jpg");
        let enabled_features = crate::EnabledFeatures::compat_lepton_scalar_read();

        let reconstructed = roundtrip_jpeg(&mut std::io::Cursor::new(&file), &enabled_features);

        assert!(reconstructed == file);
    }

    /// reads a progressive JPEG file and writes it back out using the progressive encoder
    /// to verify that the output is exactly the same as the original file.
    #[test]
    fn roundtrip_progressive_jpeg() {
        let file = read_file("iphoneprogressive", ".jpg");
        let enabled_features = crate::EnabledFeatures::compat_lepton_scalar_read();

        let reconstructed = roundtrip_jpeg(&mut std::io::Cursor::new(&file), &enabled_features);

        assert!(reconstructed == file);
    }

    /// runs the JPEG writing benchmark a few times to make sure that it works
    #[test]
    fn test_benchmark_write_jpeg() {
        let mut f = benchmarks::benchmark_write_jpeg();
        for _ in 0..10 {
            f();
        }
    }

    /// runs the block writing benchmark a few times to make sure that it works
    #[test]
    fn test_benchmark_write_block() {
        let mut f = benchmarks::benchmark_write_block();
        for _ in 0..10 {
            f();
        }
    }
}

/// Encodes the rows described by `restart_info` (luma rows `luma_y_start` through
/// `luma_y_end`) and returns the encoded bytes. `encoded_length` is handed to the
/// incremental writer as its capacity.
///
/// Only works with baseline non-progressive images.
#[cfg(any(test, feature = "micro_benchmark"))]
fn jpeg_write_baseline_row_range(
    encoded_length: usize,
    restart_info: &RestartSegmentCodingInfo,
    image_data: &[BlockBasedImage],
    jpeg_header: &JpegHeader,
    rinfo: &ReconstructionInfo,
) -> Result<Vec<u8>> {
    let coded_heights = rinfo.truncate_components.get_max_coded_heights();

    let mut writer =
        JpegIncrementalWriter::new(encoded_length, rinfo, Some(restart_info), jpeg_header, 0);

    // lazily enumerate row specs in decode order, stopping at the first row flagged
    // as done and dropping skipped rows as well as rows before the requested range
    let rows = (0u32..)
        .map(|index| {
            RowSpec::get_row_spec_from_index(
                index,
                image_data,
                rinfo.truncate_components.mcu_count_vertical,
                &coded_heights,
            )
        })
        .take_while(|row| !row.done)
        .filter(|row| !row.skip && row.luma_y >= restart_info.luma_y_start);

    for row in rows {
        if row.luma_y > restart_info.luma_y_end {
            // past the end of the requested range, we're done here
            break;
        }

        writer.process_row(&row, image_data).context()?;
    }

    Ok(writer.detach_buffer())
}

/// Micro benchmarks for the JPEG writer. Each function performs its setup once and
/// returns a closure containing only the work that should be measured.
#[cfg(any(test, feature = "micro_benchmark"))]
pub mod benchmarks {
    use std::mem;

    use super::*;

    use crate::{
        EnabledFeatures,
        helpers::read_file,
        jpeg::{
            bit_writer::BitWriter,
            block_based_image::AlignedBlock,
            jpeg_header::{JpegHeader, ReconstructionInfo, generate_huff_table_from_distribution},
            jpeg_read::read_jpeg_file,
        },
    };

    /// Benchmarks performance of encoding a single JPEG block
    #[inline(never)]
    pub fn benchmark_write_block() -> Box<dyn FnMut()> {
        // create a weird distribution to test the huffman encoding for corner cases
        let mut dcdistribution = [0; 256];
        for i in 0..256 {
            dcdistribution[i] = 256 - i;
        }
        let dctbl = generate_huff_table_from_distribution(&dcdistribution);

        // the AC table uses the same frequency for every symbol
        let mut acdistribution = [0; 256];
        for i in 0..256 {
            acdistribution[i] = 1 + 256;
        }
        let actbl = generate_huff_table_from_distribution(&acdistribution);

        // block with a mix of positive values, a long run of zeros and negative values
        let mut block = AlignedBlock::default();
        for i in 0..10 {
            block.get_block_mut()[i] = i as i16;
        }
        for i in 30..50 {
            block.get_block_mut()[i] = -(i as i16);
        }
        for i in 50..52 {
            block.get_block_mut()[i] = i as i16;
        }

        // we don't want to accumulate memory as we write, so reuse the same buffer
        // and clear it after each iteration.
        // This also avoids the cost of a malloc/free on each iteration.
        let mut storage = Vec::with_capacity(1024);
        Box::new(move || {
            let mut bitwriter = BitWriter::new(mem::take(&mut storage));
            encode_block_seq(&mut bitwriter, &dctbl, &actbl, &block);
            storage = bitwriter.detach_buffer();
            storage.clear();
        })
    }

    /// reads the jpeg file from the test data, parses it and then
    /// returns a closure that writes the jpeg blocks back out.
    #[inline(never)]
    pub fn benchmark_write_jpeg() -> Box<dyn FnMut()> {
        let file = read_file("android", ".jpg");

        let mut reader = std::io::Cursor::new(&file);
        let enabled_features = EnabledFeatures::compat_lepton_vector_write();

        let mut jpeg_header = JpegHeader::default();
        let mut rinfo = ReconstructionInfo::default();

        // parsing happens once, outside of the returned closure
        let (image_data, partitions, _end_scan) = read_jpeg_file(
            &mut reader,
            &mut jpeg_header,
            &mut rinfo,
            &enabled_features,
            |_, _| {},
        )
        .unwrap();

        Box::new(move || {
            let mut prev_offset = 0;
            for (offset, coding_info) in &partitions {
                use std::hint::black_box;

                let r = jpeg_write_baseline_row_range(
                    (offset - prev_offset) as usize,
                    &coding_info,
                    &image_data,
                    &jpeg_header,
                    &rinfo,
                )
                .unwrap();

                // make sure the optimizer cannot discard the encoded output
                black_box(r);

                prev_offset = *offset;
            }
        })
    }
}


================================================
FILE: lib/src/jpeg/mod.rs
================================================
//! Module for reading and recreating JPEGs without the loss of any information.
//!
//! This means that it should be possible to reconstruct bit-by-bit an exactly identical
//! JPEG file from the input.
//!
//! Note that we never actually decode the JPEG into pixels, since the DCT is lossy, so
//! processing needs to be done at the DCT coefficient level and keep the coefficients in
//! the BlockBasedImage identical.

mod bit_reader;
mod bit_writer;
mod component_info;
pub mod jpeg_code;
mod jpeg_position_state;

pub mod block_based_image;
pub mod jpeg_header;
pub mod jpeg_read;
pub mod jpeg_write;
pub mod row_spec;
pub mod truncate_components;


================================================
FILE: lib/src/jpeg/row_spec.rs
================================================
/*---------------------------------------------------------------------------------------------
 *  Copyright (c) Microsoft Corporation. All rights reserved.
 *  Licensed under the Apache License, Version 2.0. See LICENSE.txt in the project root for license information.
 *  This software incorporates material from third parties. See NOTICE.txt for details.
 *--------------------------------------------------------------------------------------------*/

use crate::consts::COLOR_CHANNEL_NUM_BLOCK_TYPES;

use super::block_based_image::BlockBasedImage;

/// Describes a single row of blocks of one component, as computed by
/// `RowSpec::get_row_spec_from_index` from a linear decode index.
pub struct RowSpec {
    /// luma (component 0) row associated with this row: the first luma row of the MCU row,
    /// or the row itself if it belongs to component 0
    pub luma_y: u32,
    /// index of the component this row belongs to (initialised to the number of components
    /// before a component has been matched)
    pub component: usize,
    /// row index within the component
    pub curr_y: u32,
    /// index of the MCU row that contains this row
    pub mcu_row_index: u32,
    /// true if this is the last row of component 0 within its MCU row
    pub last_row_to_complete_mcu: bool,
    /// true if the row lies at or beyond the maximum coded height of its component
    pub skip: bool,
    /// true if there is nothing left to process; only ever set together with `skip`
    pub done: bool,
}

impl RowSpec {
    /// Maps a linear `decode_index` to the component row it designates.
    ///
    /// Within every MCU row the rows are enumerated starting with the last component
    /// and ending with component 0, each component contributing
    /// `original height / mcuv` rows. `max_coded_heights` holds, per component, the
    /// number of rows that actually need to be coded; rows beyond that are flagged
    /// as `skip` (and `done` once no checked component has rows left in the MCU row).
    pub fn get_row_spec_from_index(
        decode_index: u32,
        image_data: &[BlockBasedImage],
        mcuv: u32, // number of mcus
        max_coded_heights: &[u32],
    ) -> RowSpec {
        assert!(
            image_data.len() <= COLOR_CHANNEL_NUM_BLOCK_TYPES,
            "image_data should match components count"
        );

        let num_cmp = image_data.len();

        // number of block rows each component contributes to a single MCU row
        let rows_per_mcu: Vec<u32> = image_data
            .iter()
            .map(|component| component.get_original_height() / mcuv)
            .collect();
        let total_rows_per_mcu: u32 = rows_per_mcu.iter().sum();

        let mcu_row = decode_index / total_rows_per_mcu;

        let mut spec = RowSpec {
            luma_y: mcu_row * rows_per_mcu[0],
            component: num_cmp,
            curr_y: 0,
            mcu_row_index: mcu_row,
            last_row_to_complete_mcu: false,
            skip: false,
            done: false,
        };

        // position of the requested row within its MCU row
        let mut offset = decode_index - (mcu_row * total_rows_per_mcu);

        for cmp in (0..num_cmp).rev() {
            let rows = rows_per_mcu[cmp];

            if offset >= rows {
                // not in this component, move on to the next one
                offset -= rows;
                continue;
            }

            spec.component = cmp;
            spec.curr_y = (mcu_row * rows) + offset;
            spec.last_row_to_complete_mcu = cmp == 0 && offset + 1 == rows;

            if spec.curr_y >= max_coded_heights[cmp] {
                spec.skip = true;

                // assume we are done, unless one of the checked components still has rows
                // to code in this MCU row: partial rows must be written out, so done is
                // only set when all of them are really skips (note that the last
                // component is not part of this check)
                spec.done = true;
                for other in 0..num_cmp - 1 {
                    if mcu_row * rows_per_mcu[other] < max_coded_heights[other] {
                        spec.done = false;
                    }
                }
            }

            if cmp == 0 {
                spec.luma_y = spec.curr_y;
            }

            return spec;
        }

        // no component matched the offset
        spec.skip = true;
        spec.done = true;
        spec
    }
}


================================================
FILE: lib/src/jpeg/truncate_components.rs
================================================
/*---------------------------------------------------------------------------------------------
 *  Copyright (c) Microsoft Corporation. All rights reserved.
 *  Licensed under the Apache License, Version 2.0. See LICENSE.txt in the project root for license information.
 *  This software incorporates material from third parties. See NOTICE.txt for details.
 *--------------------------------------------------------------------------------------------*/

use std::cmp;

use super::component_info::ComponentInfo;
use super::jpeg_header::JpegHeader;

/// Per-component block counts, possibly reduced when the image data is truncated.
///
/// Both fields start out as copies of the component's header values (`bcv` / `bc`)
/// in `TruncateComponents::init` and are overwritten by
/// `TruncateComponents::set_truncation_bounds`.
#[derive(Debug, Clone)]
struct TrucateComponentsInfo {
    trunc_bcv: u32, // the number of vertical components in this (truncated) image

    // total block count for the component; after truncation bounds are set this is
    // the maximum block position passed in by the caller plus one
    trunc_bc: u32,
}

/// Keeps track of how many blocks of each component are coded, so that
/// an image whose scan data ends early can be described by reduced block counts.
#[derive(Debug, Clone)]
pub struct TruncateComponents {
    // one entry per component, appended by `init`
    trunc_info: Vec<TrucateComponentsInfo>,

    /// number of components, copied from the header's `cmpc`
    pub components_count: usize,

    /// copied from the header's `mcuh`
    pub mcu_count_horizontal: u32,

    /// copied from the header's `mcuv`
    pub mcu_count_vertical: u32,
}

impl Default for TruncateComponents {
    fn default() -> Self {
        return TruncateComponents {
            trunc_info: Vec::new(),
            components_count: 0,
            mcu_count_horizontal: 0,
            mcu_count_vertical: 0,
        };
    }
}

impl TruncateComponents {
    /// Copies the MCU dimensions and component count from `jpeg_header` and appends
    /// one (untruncated) entry per component to the internal table.
    pub fn init(&mut self, jpeg_header: &JpegHeader) {
        self.components_count = jpeg_header.cmpc;
        self.mcu_count_vertical = jpeg_header.mcuv.get();
        self.mcu_count_horizontal = jpeg_header.mcuh.get();

        let untruncated = (0..jpeg_header.cmpc).map(|i| {
            let ci = &jpeg_header.cmp_info[i];
            TrucateComponentsInfo {
                trunc_bcv: ci.bcv,
                trunc_bc: ci.bc,
            }
        });
        self.trunc_info.extend(untruncated);
    }

    /// Returns the (possibly truncated) vertical block count of every component,
    /// in component order.
    pub fn get_max_coded_heights(&self) -> Vec<u32> {
        self.trunc_info[..self.components_count]
            .iter()
            .map(|info| info.trunc_bcv)
            .collect()
    }

    /// Recomputes the truncated block counts of every component.
    ///
    /// `max_d_pos[i]` is the maximum block position for component `i`; the block
    /// count handed to the per-component calculation is that position plus one.
    pub fn set_truncation_bounds(&mut self, jpeg_header: &JpegHeader, max_d_pos: [u32; 4]) {
        let mcu_rows = self.mcu_count_vertical;

        for (i, info) in self.trunc_info[..self.components_count]
            .iter_mut()
            .enumerate()
        {
            Self::set_block_count_d_pos(info, &jpeg_header.cmp_info[i], max_d_pos[i] + 1, mcu_rows);
        }
    }

    /// Returns the (possibly truncated) vertical block count of component `cmp`.
    pub fn get_block_height(&self, cmp: usize) -> u32 {
        self.trunc_info[cmp].trunc_bcv
    }

    /// Stores `trunc_bc` for one component and derives the matching number of
    /// block rows, rounded up to a whole multiple of the component's rows-per-MCU
    /// but never beyond the component's full height (`ci.bcv`).
    fn set_block_count_d_pos(
        ti: &mut TrucateComponentsInfo,
        ci: &ComponentInfo,
        trunc_bc: u32,
        mcu_count_vertical: u32,
    ) {
        // the header's row count must be the block count divided by the row width, rounded up
        assert!(ci.bcv == ci.bc.div_ceil(ci.bch), "SetBlockCountDpos");

        // rows needed to hold `trunc_bc` blocks, capped at the full component height
        let mut vertical_scan_lines = cmp::min(trunc_bc.div_ceil(ci.bch), ci.bcv);

        let ratio = Self::get_min_vertical_extcmp_multiple(ci, mcu_count_vertical);

        // grow until we reach a multiple of `ratio` or run out of rows
        loop {
            let aligned = vertical_scan_lines % ratio == 0;
            if aligned || vertical_scan_lines + 1 > ci.bcv {
                break;
            }
            vertical_scan_lines += 1;
        }

        assert!(
            vertical_scan_lines <= ci.bcv,
            "verticalScanLines <= ci.Info.bcv"
        );

        ti.trunc_bc = trunc_bc;
        ti.trunc_bcv = vertical_scan_lines;
    }

    /// Number of block rows of this component per MCU row (integer division of the
    /// component's block height by the vertical MCU count).
    fn get_min_vertical_extcmp_multiple(cmp_info: &ComponentInfo, mcu_count_vertical: u32) -> u32 {
        cmp_info.bcv / mcu_count_vertical
    }

    /// Returns the (possibly truncated) total block count of every component,
    /// in component order.
    pub fn get_component_sizes_in_blocks(&self) -> Vec<u32> {
        self.trunc_info[..self.components_count]
            .iter()
            .map(|info| info.trunc_bc)
            .collect()
    }
}


================================================
FILE: lib/src/lepton_error.rs
================================================
/*---------------------------------------------------------------------------------------------
 *  Copyright (c) Microsoft Corporation. All rights reserved.
 *  Licensed under the Apache License, Version 2.0. See LICENSE.txt in the project root for license information.
 *  This software incorporates material from third parties. See NOTICE.txt for details.
 *--------------------------------------------------------------------------------------------*/

use std::fmt::Display;
use std::io::ErrorKind;
use std::num::TryFromIntError;

#[derive(Debug, Clone, Copy, PartialEq)]
#[allow(dead_code)]
#[non_exhaustive]
/// Well-defined errors for bad things that are expected to happen as part of compression/decompression.
///
/// Each variant has an explicit numeric value (see `as_integer_error_code`); the
/// commented-out entries are values that are currently not used by this enum.
pub enum ExitCode {
    /// Assertion failure, which probably indicates a bug in the library.
    AssertionFailure = 1,
    //CodingError = 2
    /// The JPEG file is too short to be a valid JPEG file.
    ShortRead = 3,

    /// We don't support 4-color JPEGs.
    Unsupported4Colors = 4,

    /// The coefficients in the JPEG file are out of range specified by the JPEG standard.
    CoefficientOutOfRange = 6,

    /// The lepton file has a coding error in the arithmetic coding part.
    StreamInconsistent = 7,

    /// The JPEG file is progressive, and progressive support is not enabled.
    ProgressiveUnsupported = 8,

    /// The JPEG file has a sampling factor that is not supported by the library.
    SamplingBeyondTwoUnsupported = 10,
    //SamplingBeyondFourUnsupported = 11,
    //ThreadingPartialMcu = 12,
    /// The lepton file is a version that is not supported by the library.
    VersionUnsupported = 13,
    //OnlyGarbageNoJpeg = 14,
    /// An error was returned by an IO operation, for example if a BufRead
    /// passed in returned an error.
    OsError = 33,
    //HeaderTooLarge = 34,
    //BlockOffsetOOM = 37,
    /// The JPEG cannot be encoded due to a non-standard feature that is not supported by the library.
    UnsupportedJpeg = 42,

    /// The JPEG file has a zero IDCT, which is not supported by the library.
    /// Although the C++ library doesn't explicitly disallow this, it will lead to
    /// undefined behavior depending on C++, since it can lead to a division-by-zero.
    UnsupportedJpegWithZeroIdct0 = 43,

    /// The JPEG file has invalid reset codes in the stream.
    InvalidResetCode = 44,

    /// The JPEG uses inconsistent padding, which is not supported by the library.
    InvalidPadding = 45,
    //WrapperOutputWriteFailed = 101,
    /// The Lepton file is not a valid Lepton file.
    BadLeptonFile = 102,

    /// An error occurred while sending a message to a thread in the thread pool.
    ChannelFailure = 103,

    /// An error occurred while casting an integer to a smaller type, which most likely
    /// means that the JPEG contains invalid data.
    IntegerCastOverflow = 1000,
    //CompressionFailedForAllChunks = 1001,
    //CompressedDataLargerThanPlainText = 1002,
    //HeaderChecksumMismatch = 1003,
    /// We verified against the original JPEG file but the regenerated length was different.
    VerificationLengthMismatch = 1004,

    /// We verified against the original JPEG file but the content was different (but same length).
    VerificationContentMismatch = 1005,

    /// Caller passed in invalid parameters.
    SyntaxError = 1006,

    /// The file to be read was not found (only used by utility exe).
    FileNotFound = 1007,

    /// An external verification failed (only used by utility exe when verifying
    /// against C++ Lepton implementation).
    ExternalVerificationFailed = 1008,

    /// Ran out of memory trying to allocate a buffer.
    OutOfMemory = 2000,
}

impl Display for ExitCode {
    /// Displays the exit code using the same text as its `Debug` representation.
    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
        f.write_fmt(format_args!("{:?}", self))
    }
}

impl ExitCode {
    /// Converts the error code into an integer for use as an error code when
    /// returning from a C API.
    ///
    /// The value is the explicit discriminant assigned to the variant in the
    /// enum declaration (for example `ShortRead` yields 3).
    pub fn as_integer_error_code(self) -> i32 {
        self as i32
    }
}

/// Since errors are rare and stop everything, we want them to be as lightweight as possible.
///
/// The payload lives behind the `Box` held by `LeptonError`, so the error value
/// that travels through `Result` is a single pointer.
#[derive(Debug, Clone)]
struct LeptonErrorInternal {
    // classification of the failure
    exit_code: ExitCode,
    // human-readable description; `add_context` appends "\n at <location>" lines to it
    message: String,
}

/// Standard error returned by Lepton library.
///
/// Carries an [`ExitCode`] classifying the failure and a human-readable message,
/// available through `exit_code()` and `message()`.
#[derive(Debug, Clone)]
pub struct LeptonError {
    // boxed so that the error itself stays pointer-sized
    i: Box<LeptonErrorInternal>,
}

/// Shorthand for a `std::result::Result` whose error type is [`LeptonError`].
pub type Result<T> = std::result::Result<T, LeptonError>;

impl Display for LeptonError {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        write!(f, "{0}: {1}", self.i.exit_code, self.i.message)
    }
}

impl LeptonError {
    /// Builds an error from an exit code and a message (the message is copied).
    pub fn new(exit_code: ExitCode, message: impl AsRef<str>) -> LeptonError {
        let internal = LeptonErrorInternal {
            exit_code,
            message: String::from(message.as_ref()),
        };

        LeptonError {
            i: Box::new(internal),
        }
    }

    /// The exit code that classifies this error.
    pub fn exit_code(&self) -> ExitCode {
        self.i.exit_code
    }

    /// The human-readable description of this error, including any context
    /// lines appended by `add_context`.
    pub fn message(&self) -> &str {
        self.i.message.as_str()
    }

    /// Appends `"\n at <caller location>"` to the message. Calling this at each
    /// level an error passes through builds up a rough call stack.
    ///
    /// Thanks to `#[track_caller]` the recorded location is that of the caller
    /// (or of the caller's caller, if the caller is itself `#[track_caller]`).
    #[cold]
    #[inline(never)]
    #[track_caller]
    pub fn add_context(&mut self) {
        let location = std::panic::Location::caller();
        let frame = format!("\n at {}", location);
        self.i.message.push_str(&frame);
    }
}

#[cold]
#[track_caller]
pub fn err_exit_code<T>(error_code: ExitCode, message: impl AsRef<str>) -> Result<T> {
    let mut e = LeptonError::new(error_code, message.as_ref());
    e.add_context();
    return Err(e);
}

/// Extension trait for results that converts the error into a `LeptonError`
/// and records the call site in it.
pub trait AddContext<T> {
    /// Passes `Ok` values through unchanged; for `Err`, converts the error into a
    /// `LeptonError` and appends the caller's source location to its message.
    #[track_caller]
    fn context(self) -> Result<T>;
}

impl<T, E: Into<LeptonError>> AddContext<T> for core::result::Result<T, E> {
    #[track_caller]
    fn context(self) -> Result<T> {
        match self {
            Ok(x) => Ok(x),
            Err(e) => {
                let mut e = e.into();
                e.add_context();
                Err(e)
            }
        }
    }
}

// Marks LeptonError as a standard error type; all trait methods keep their defaults.
impl std::error::Error for LeptonError {}

/// Picks the exit code for an IO error: an unexpected end of file is reported
/// as `ShortRead`, every other kind as `OsError`.
fn get_io_error_exit_code(e: &std::io::Error) -> ExitCode {
    match e.kind() {
        ErrorKind::UnexpectedEof => ExitCode::ShortRead,
        _ => ExitCode::OsError,
    }
}

/// translates a failed integer conversion into an `IntegerCastOverflow` error
impl From<TryFromIntError> for LeptonError {
    #[track_caller]
    fn from(e: TryFromIntError) -> Self {
        let description = e.to_string();
        let mut converted = Self::new(ExitCode::IntegerCastOverflow, description);
        converted.add_context();
        converted
    }
}

/// translates a failed channel send into a `ChannelFailure` error
impl<T> From<std::sync::mpsc::SendError<T>> for LeptonError {
    #[track_caller]
    fn from(e: std::sync::mpsc::SendError<T>) -> Self {
        let description = e.to_string();
        let mut converted = Self::new(ExitCode::ChannelFailure, description);
        converted.add_context();
        converted
    }
}
/// translates a failed channel receive into a `ChannelFailure` error
impl From<std::sync::mpsc::RecvError> for LeptonError {
    #[track_caller]
    fn from(e: std::sync::mpsc::RecvError) -> Self {
        let description = e.to_string();
        let mut converted = Self::new(ExitCode::ChannelFailure, description);
        converted.add_context();
        converted
    }
}

/// translates std::io::Error into LeptonError
///
/// If the IO error is just a wrapper around a `LeptonError`, that original error is
/// returned unchanged. Otherwise a new error is built from the IO error's text, with
/// the exit code chosen by `get_io_error_exit_code`, and the caller location is appended.
impl From<std::io::Error> for LeptonError {
    #[track_caller]
    fn from(e: std::io::Error) -> Self {
        let io_err = match e.downcast::<LeptonError>() {
            Ok(original) => return original,
            Err(io_err) => io_err,
        };

        let exit_code = get_io_error_exit_code(&io_err);
        let mut translated = LeptonError::new(exit_code, io_err.to_string());
        translated.add_context();
        translated
    }
}

/// translates LeptonError into std::io::Error: the error is boxed as the payload of
/// an IO error of kind `Other`, from which it can be recovered again by downcasting
impl From<LeptonError> for std::io::Error {
    fn from(e: LeptonError) -> Self {
        std::io::Error::other(e)
    }
}

/// Checks the conversions between `LeptonError` and `std::io::Error` in both directions.
#[test]
fn test_error_translation() {
    // test wrapping inside an io error
    fn my_std_error() -> core::result::Result<(), std::io::Error> {
        Err(LeptonError::new(ExitCode::SyntaxError, "test error").into())
    }

    // a LeptonError that travelled inside an io::Error comes back unchanged
    // (same exit code, and no extra context appended to the message)
    let e: LeptonError = my_std_error().unwrap_err().into();
    assert_eq!(e.exit_code(), ExitCode::SyntaxError);
    assert_eq!(e.message(), "test error");

    // an IO error should be translated into an OsError
    let e: LeptonError = std::io::Error::new(std::io::ErrorKind::NotFound, "file not found").into();
    assert_eq!(e.exit_code(), ExitCode::OsError);

    // ... except for an unexpected end of file, which is reported as a ShortRead
    let e: LeptonError =
        std::io::Error::new(std::io::ErrorKind::UnexpectedEof, "truncated input").into();
    assert_eq!(e.exit_code(), ExitCode::ShortRead);
}


================================================
FILE: lib/src/lib.rs
================================================
/*---------------------------------------------------------------------------------------------
 *  Copyright (c) Microsoft Corporation. All rights reserved.
 *  Licensed under the Apache License, Version 2.0. See LICENSE.txt in the project root for license information.
 *  This software incorporates material from third parties. See NOTICE.txt for details.
 *--------------------------------------------------------------------------------------------*/

//! A lossless JPEG compressor with precise bit-for-bit recovery, supporting both baseline and progressive JPEGs.
//! Achieves compression savings of around 22%, making it suitable for cold cloud storage use cases.
//!
//! This crate is a Rust port of Dropbox’s original [lepton](https://github.com/dropbox/lepton) JPEG compression tool.
//! It retains the performance characteristics of the C++ version while benefiting from Rust’s memory safety guarantees.
//! All JPEG content—including metadata and even malformed segments—is preserved accurately.
//!
//! The original C++ codebase has been deprecated by Dropbox. This Rust implementation incorporates
//! an exhaustive security review of the original, making it a safer and more maintainable alternative.

// Don't allow any unsafe code by default. Since this code has to potentially deal with
// badly/maliciously formatted images, we want this extra level of safety.
#![forbid(unsafe_code)]
#![forbid(trivial_casts)]
#![forbid(trivial_numeric_casts)]
#![forbid(non_ascii_idents)]
#![forbid(unused_extern_crates)]
#![forbid(unused_import_braces)]
#![forbid(redundant_lifetimes)]
#![forbid(single_use_lifetimes)]
#![forbid(unused_extern_crates)]
#![forbid(unused_lifetimes)]
#![forbid(unused_macro_rules)]
#![forbid(macro_use_extern_crate)]
#![forbid(missing_unsafe_on_extern)]
#![deny(missing_docs)]

mod consts;
mod helpers;
mod jpeg;
mod metrics;
mod structs;

mod enabled_features;
mod lepton_error;

pub use enabled_features::EnabledFeatures;
pub use helpers::catch_unwind_result;
pub use lepton_error::{ExitCode, LeptonError};
pub use metrics::{CpuTimeMeasure, Metrics};
pub use structs::lepton_file_writer::get_git_version;

use crate::lepton_error::{AddContext, Result};
pub use crate::structs::simple_threadpool::{
    DEFAULT_THREAD_POOL, LeptonThreadPool, LeptonThreadPriority, SimpleThreadPool,
    SingleThreadPool, ThreadPoolHolder,
};

#[cfg(feature = "micro_benchmark")]
/// Module that exposes internal functions for micro benchmarking
pub mod micro_benchmark;

/// Trait for types that can provide the current position in a stream. This
/// is intentionally a subset of the Seek trait, as it only requires remembering
/// the current position without allowing seeking to arbitrary positions.
///
/// This is useful for callers for which it would be complex to provide seek capabilities, but can
/// count the number of bytes read or written so far.
///
/// We provide a blanket implementation for any type that implements `std::io::Seek`.
pub trait StreamPosition {
    /// Returns the current position in the stream.
    ///
    /// Takes `&mut self` so that implementations may query or update internal
    /// state, as the `Seek`-based blanket implementation does.
    fn position(&mut self) -> u64;
}

/// Blanket implementation: anything seekable reports its position via
/// `Seek::stream_position`.
impl<T: std::io::Seek> StreamPosition for T {
    /// # Panics
    ///
    /// Panics if `stream_position()` returns an error, since the trait method
    /// has no way of reporting a failure.
    fn position(&mut self) -> u64 {
        self.stream_position().unwrap()
    }
}

pub use structs::lepton_file_reader::decode_lepton;

pub use structs::lepton_file_writer::{encode_lepton, encode_lepton_verify};

// crate version as set by Cargo (`CARGO_PKG_VERSION`), captured at compile time
static PACKAGE_VERSION: &str = env!("CARGO_PKG_VERSION");

pub use structs::lepton_file_reader::LeptonFileReader;

/// Returns the library version as `<package version>-<git version>`.
/// Useful for debugging and logging, to know exactly which build of the library is in use.
pub fn get_version_string() -> String {
    let git_version = get_git_version();
    format!("{PACKAGE_VERSION}-{git_version}")
}

/// Used by the utility to dump out the contents of a JPEG file or Lepton file to
/// stdout for debugging purposes.
///
/// The input is treated as a JPEG if it starts with the bytes `0xFF 0xD8`;
/// anything else (including inputs shorter than two bytes) is handed to the
/// Lepton decoder. The parsed header(s) are always printed; if `all` is set, every
/// block of every component is printed as well (64 comma-separated coefficients per line).
///
/// # Errors
///
/// Returns the error reported by the JPEG reader or the Lepton decoder if the
/// input cannot be parsed.
#[allow(dead_code)]
pub fn dump_jpeg(input_data: &[u8], all: bool, enabled_features: &EnabledFeatures) -> Result<()> {
    use std::io::Cursor;
    use structs::lepton_file_reader::decode_lepton_file_image;
    use structs::lepton_file_writer::read_jpeg;

    let mut lh;
    let block_image;

    // `starts_with` is used instead of indexing so that an empty or one-byte input
    // results in a decoder error rather than an out-of-bounds panic
    if input_data.starts_with(&[0xff, 0xd8]) {
        let mut reader = Cursor::new(input_data);

        // the callback is invoked by the reader with each parsed JPEG header
        (lh, block_image) = read_jpeg(&mut reader, enabled_features, |jh, _ri| {
            println!("parsed header:");
            let s = format!("{jh:?}");
            println!("{0}", s.replace("},", "},\r\n").replace("],", "],\r\n"));
        })?;
    } else {
        let mut reader = Cursor::new(input_data);

        (lh, block_image) =
            decode_lepton_file_image(&mut reader, enabled_features, &DEFAULT_THREAD_POOL)
                .context()?;

        // print the current header, then keep advancing until there are no more segments
        loop {
            println!("parsed header:");
            let s = format!("{0:?}", lh.jpeg_header);
            println!("{0}", s.replace("},", "},\r\n").replace("],", "],\r\n"));

            if !lh.advance_next_header_segment(enabled_features).context()? {
                break;
            }
        }
    }

    let s = format!("{lh:?}");
    println!("{0}", s.replace("},", "},\r\n").replace("],", "],\r\n"));

    if all {
        for (component, image) in block_image.iter().enumerate() {
            println!("Component {0}", component);

            for dpos in 0..image.get_block_width() * image.get_original_height() {
                print!("dpos={0} ", dpos);
                let block = image.get_block(dpos);

                print!("{0}", block.get_transposed_from_zigzag(0));
                for coef in 1..64 {
                    print!(",{0}", block.get_transposed_from_zigzag(coef));
                }
                println!();
            }
        }
    }

    Ok(())
}


================================================
FILE: lib/src/metrics.rs
================================================
use std::collections::HashMap;
use std::time::Duration;

#[cfg(windows)]
use cpu_time::ThreadTime;

/// platform independent threadtime measurement
///
/// On Windows the start point is a `cpu_time::ThreadTime`; on every other platform
/// it is a `std::time::Instant`, so the measurement there is elapsed wall-clock time.
pub struct CpuTimeMeasure {
    #[cfg(windows)]
    start: ThreadTime,
    #[cfg(not(windows))]
    start: std::time::Instant,
}

impl CpuTimeMeasure {
    /// Starts a new measurement at the current point in time.
    pub fn new() -> Self {
        #[cfg(windows)]
        let start = ThreadTime::now();
        #[cfg(not(windows))]
        let start = std::time::Instant::now();

        Self { start }
    }

    /// Returns how much time has passed since this instance was created.
    pub fn elapsed(&self) -> Duration {
        // both start types expose the same `elapsed()` call, so no cfg split is needed
        self.start.elapsed()
    }
}

/// Sub-category used together with the `Coef`, `DC` and `Edge` variants of
/// `ModelComponent` to break statistics down further.
// NOTE(review): variant names presumably refer to the exponent, sign, residual and
// noise bits of a coded value; confirm against the model code
#[derive(Debug, PartialEq, Copy, Clone, Hash, Eq)]
pub enum ModelSubComponent {
    Exp,
    Sign,
    Residual,
    Noise,
}

/// Key under which compression statistics are accumulated in `Metrics`.
// NOTE(review): presumably identifies which part of the coding model produced the
// bits being counted; confirm against the callers of `record_compression_stats`
#[derive(Debug, PartialEq, Copy, Clone, Hash, Eq)]
#[repr(u8)]
pub enum ModelComponent {
    Dummy,
    Coef(ModelSubComponent),
    DC(ModelSubComponent),
    Edge(ModelSubComponent),
    NonZero7x7Count,
    NonZeroEdgeCount,
}

/// Running totals kept per `ModelComponent`.
#[derive(Default, Debug)]
pub struct ModelComponentStatistics {
    /// sum of the `total_bits` values recorded so far
    pub total_bits: i64,
    /// sum of the `total_compressed` values recorded so far (reported as
    /// "compressed_bits" by `Metrics::print_metrics`)
    pub total_compressed: i64,
}

/// Metrics for the Lepton JPEG compression and decompression process.
#[derive(Default, Debug)]
pub struct Metrics {
    // accumulated bit counts, keyed by model component
    map: HashMap<ModelComponent, ModelComponentStatistics>,
    // sum of the durations passed to `record_cpu_worker_time`, plus any merged in
    cpu_time_worker_time: Duration,
}

impl Metrics {
    /// Records the compression statistics for a specific model component.
    #[allow(dead_code)]
    pub fn record_compression_stats(
        &mut self,
        cmp: ModelComponent,
        total_bits: i64,
        total_compressed: i64,
    ) {
        let e = self
            .map
            .entry(cmp)
            .or_insert(ModelComponentStatistics::default());
        e.total_bits += total_bits;
        e.total_compressed += total_compressed;
    }

    /// Records the CPU worker time for the compression process.
    pub fn record_cpu_worker_time(&mut self, duration: Duration) {
        self.cpu_time_worker_time += duration;
    }

    /// Returns the total number of bits processed for a specific model component.
    #[allow(dead_code)]
    pub fn print_metrics(&self) {
        let mut sort_vec = Vec::new();
        for x in &self.map {
            sort_vec.push((x.0, x.1));
        }

        sort_vec.sort_by(|a, b| a.1.total_compressed.cmp(&b.1.total_compressed).reverse());

        let total_compressed: i64 = sort_vec.iter().map(|x| x.1.total_compressed).sum();

        for x in &sort_vec {
            let name = format!("{0:?}", x.0);

            println!(
                "{0:16} total_bits={1:9} compressed_bits={2:9} ratio={3:4} comp_delta={4:10}k storage={5:0.1}%, comp={6:0.2}%)",
                name,
                x.1.total_bits,
                x.1.total_compressed,
                x.1.total_compressed * 100 / x.1.total_bits,
                (x.1.total_bits - x.1.total_compressed) / (8 * 1024),
                (x.1.total_compressed as f64) * 100f64 / (total_compressed as f64),
                ((x.1.total_bits - x.1.total_compressed) as f64) / (total_compressed as f64)
                    * 100f64
            );
        }

        println!(
            "total_compressed = {0} bits, {1} bytes",
            total_compressed,
            total_compressed / 8
        );
        println!("worker_cpu={0}ms", self.cpu_time_worker_time.as_millis());
    }

    /// empties the metrics and returns the collected data
    pub fn drain(&mut self) -> Metrics {
        let cpu_time_worker_time = self.cpu_time_worker_time;
        self.cpu_time_worker_time = Duration::default();

        Metrics {
            map: self.map.drain().collect(),
            cpu_time_worker_time,
        }
    }

    /// Returns the total CPU worker time recorded in the metrics.
    pub fn get_cpu_time_worker_time(&self) -> Duration {
        self.cpu_time_worker_time
    }

    /// Merges another Metrics instance into this one, summing the statistics.
    pub fn merge_from(&mut self, mut source_metrics: Metrics) {
        for x in source_metrics.map.drain() {
            let e = self
                .map
                .entry(x.0)
                .or_insert(ModelComponentStatistics::default());
            e.total_bits += x.1.total_bits;
            e.total_compressed += x.1.total_compressed;
        }

        self.cpu_time_worker_time += source_metrics.cpu_time_worker_time;
    }
}


================================================
FILE: lib/src/micro_benchmark.rs
================================================
pub use crate::structs::{benchmark_idct, benchmark_roundtrip_coefficient};

pub use crate::jpeg::jpeg_write::benchmarks::benchmark_write_jpeg;

pub use crate::jpeg::jpeg_read::benchmarks::benchmark_read_jpeg;

pub use crate::jpeg::jpeg_write::benchmarks::benchmark_write_block;

pub use crate::jpeg::jpeg_read::benchmarks::benchmark_read_block;


================================================
FILE: lib/src/structs/block_context.rs
================================================
/*---------------------------------------------------------------------------------------------
 *  Copyright (c) Microsoft Corporation. All rights reserved.
 *  Licensed under the Apache License, Version 2.0. See LICENSE.txt in the project root for license information.
 *  This software incorporates material from third parties. See NOTICE.txt for details.
 *--------------------------------------------------------------------------------------------*/

use crate::jpeg::block_based_image::{AlignedBlock, BlockBasedImage, EMPTY_BLOCK};
use crate::structs::neighbor_summary::{NEIGHBOR_DATA_EMPTY, NeighborSummary};
use crate::structs::probability_tables::ProbabilityTables;
/// Cursor over the blocks of one image row, together with the matching positions
/// in the neighbor summary cache.
pub struct BlockContext {
    // width of the image in blocks, used to step back one row for "above" lookups
    block_width: u32,
    // index of the current block within the image (`block_width * y` at row start)
    cur_block_index: u32,
    // cache index for the current block: starts at `block_width` for odd rows, 0 for even rows
    cur_neighbor_summary_index: u32,
    // cache index for the block above: starts at 0 for odd rows, `block_width` for even rows
    above_neighbor_summary_index: u32,
}
/// References to the already-processed neighbors of the current block, as returned
/// by `BlockContext::get_neighbor_data`. Neighbors that are not present are
/// represented by the shared empty block / empty summary.
pub struct NeighborData<'a> {
    /// block directly above the current one
    pub above: &'a AlignedBlock,
    /// block directly to the left of the current one
    pub left: &'a AlignedBlock,
    /// block diagonally above and to the left of the current one
    pub above_left: &'a AlignedBlock,
    /// cached summary of the block above
    pub neighbor_context_above: &'a NeighborSummary,
    /// cached summary of the block to the left
    pub neighbor_context_left: &'a NeighborSummary,
}

impl BlockContext {
    /// Creates a context positioned on the first block of row `y` of the image.
    ///
    /// The two neighbor summary indices swap between offset 0 and offset
    /// `block_width` depending on whether `y` is even or odd.
    pub fn off_y(y: u32, image_data: &BlockBasedImage) -> BlockContext {
        let block_width = image_data.get_block_width();
        let cur_block_index = block_width * y;

        // blocks above the first line are never dereferenced
        let (cur_neighbor_summary_index, above_neighbor_summary_index) = if (y & 1) != 0 {
            (block_width, 0)
        } else {
            (0, block_width)
        };

        BlockContext {
            block_width,
            cur_block_index,
            cur_neighbor_summary_index,
            above_neighbor_summary_index,
        }
    }

    // for debugging
    /// Index of the block the context currently points at.
    #[allow(dead_code)]
    pub fn get_here_index(&self) -> u32 {
        self.cur_block_index
    }

    // Every row gets a fresh context from `off_y`, so stepping never has to deal
    // with wrapping around at the end of a row.
    /// Moves one block to the right and returns the new block index.
    pub fn next(&mut self) -> u32 {
        self.above_neighbor_summary_index += 1;
        self.cur_neighbor_summary_index += 1;
        self.cur_block_index += 1;

        self.cur_block_index
    }

    /// Returns the block the context currently points at.
    pub fn here<'a>(&self, image_data: &'a BlockBasedImage) -> &'a AlignedBlock {
        image_data.get_block(self.cur_block_index)
    }

    /// Collects the neighbors of the current block.
    ///
    /// With `ALL_PRESENT` set, all neighbors are fetched unconditionally. Otherwise
    /// `pt` decides whether the above and left neighbors exist, the above-left block
    /// is always treated as missing, and missing neighbors are replaced by
    /// `EMPTY_BLOCK` / `NEIGHBOR_DATA_EMPTY`.
    pub fn get_neighbor_data<'a, const ALL_PRESENT: bool>(
        &self,
        image_data: &'a BlockBasedImage,
        neighbor_summary: &'a [NeighborSummary],
        pt: &ProbabilityTables,
    ) -> NeighborData<'a> {
        let above_left = if ALL_PRESENT {
            image_data.get_block(self.cur_block_index - self.block_width - 1)
        } else {
            &EMPTY_BLOCK
        };

        let above = if ALL_PRESENT || pt.is_above_present() {
            image_data.get_block(self.cur_block_index - self.block_width)
        } else {
            &EMPTY_BLOCK
        };

        let left = if ALL_PRESENT || pt.is_left_present() {
            image_data.get_block(self.cur_block_index - 1)
        } else {
            &EMPTY_BLOCK
        };

        let neighbor_context_above = if ALL_PRESENT || pt.is_above_present() {
            &neighbor_summary[self.above_neighbor_summary_index as usize]
        } else {
            &NEIGHBOR_DATA_EMPTY
        };

        let neighbor_context_left = if ALL_PRESENT || pt.is_left_present() {
            &neighbor_summary[(self.cur_neighbor_summary_index - 1) as usize]
        } else {
            &NEIGHBOR_DATA_EMPTY
        };

        NeighborData::<'a> {
            above,
            left,
            above_left,
            neighbor_context_above,
            neighbor_context_left,
        }
    }

    /// Stores `neighbor_summary` in the cache slot that belongs to the current block.
    pub fn set_neighbor_summary_here(
        &self,
        neighbor_summary_cache: &mut [NeighborSummary],
        neighbor_summary: NeighborSummary,
    ) {
        let slot = self.cur_neighbor_summary_index as usize;
        neighbor_summary_cache[slot] = neighbor_summary;
    }
}


================================================
FILE: lib/src/structs/branch.rs
================================================
/*---------------------------------------------------------------------------------------------
 *  Copyright (c) Microsoft Corporation. All rights reserved.
 *  Licensed under the Apache License, Version 2.0. See LICENSE.txt in the project root for license information.
 *  This software incorporates material from third parties. See NOTICE.txt for details.
 *--------------------------------------------------------------------------------------------*/

/*
 The logic here is different from the C++ version, resulting in
 a 2x speed increase. Nothing magic, the main change is to not
 store the probability, since it is deterministically determined
 based on the true/false counts. Instead of doing the calculation,
 we just look up the 16-bit value in a lookup table to get the
 corresponding probability.
*/

/// Adaptive statistics for one binary decision: counts how often the bit was
/// false and how often it was true, from which the probability is looked up.
pub struct Branch {
    /// The top byte is the number of false bits seen so far
    /// and the bottom byte is the number of true bits seen.
    /// On overflow both values are normalized by dividing by 2 (rounding up).
    ///
    /// Both counts are never less than 1, so we start off with 0x0101.
    counts: u16,
}

impl Default for Branch {
    fn default() -> Branch {
        Branch::new()
    }
}

/// Precalculates the probability table so it can be stored as a const array.
///
/// The table is indexed by the 16-bit counts value (false count in the top byte,
/// true count in the bottom byte) and holds `(false * 256) / (false + true)`
/// truncated to 8 bits. Entry 0, where both counts are zero, is left at 0.
const fn problookup() -> [u8; 65536] {
    let mut table = [0u8; 65536];

    let mut false_count = 0usize;
    while false_count < 256 {
        let mut true_count = 0usize;
        while true_count < 256 {
            let total = false_count + true_count;

            // skip the single entry that would divide by zero
            if total != 0 {
                table[(false_count << 8) | true_count] = ((false_count << 8) / total) as u8;
            }

            true_count += 1;
        }
        false_count += 1;
    }

    table
}

/// precalculated probabilities for the next bit being false,
/// indexed by the 16-bit `counts` value of a `Branch`
static PROB_LOOKUP: [u8; 65536] = problookup();

impl Branch {
    /// Creates a branch with both counts set to 1 (`0x0101`).
    pub fn new() -> Self {
        Branch { counts: 0x0101 }
    }

    /// used for testing to set counts to a specific value
    #[cfg(test)]
    pub fn set_count(&mut self, count: u16) {
        self.counts = count;
    }

    /// used for testing to read back the current counts
    #[cfg(test)]
    pub fn get_count(&self) -> u16 {
        self.counts
    }

    /// used for debugging to keep the state for hashing
    ///
    /// The result has the raw counts in the low 16 bits and the looked-up
    /// probability in the bits above them.
    #[allow(dead_code)]
    pub fn get_u64(&self) -> u64 {
        let c = self.counts;
        return ((PROB_LOOKUP[self.counts as usize] as u64) << 16) + c as u64;
    }

    /// Returns the probability of the next bit being a false as a value between 1 and 255
    ///
    /// Calculated by looking up the probability in a precalculated table
    /// where 'f' is the number of false bits and 't' is the number of true bits seen.
    ///
    /// (f * 256) / (f + t)
    #[inline(always)]
    pub fn get_probability(&self) -> u8 {
        PROB_LOOKUP[self.counts as usize]
    }

    /// Updates the counters when we encounter a 1 or 0. If we hit 255 values, then
    /// we normalize both counts (divide by 2), except in the case where the remaining value is 1,
    /// in which case we don't touch. This biases the probability to get better results
    /// when there are long runs of 1 or 0.
    ///
    /// This function merges updating either the true or false counter
    /// by swapping the top and bottom byte of the 16-bit value.
    ///
    /// The update algorithm looks like this (with top and bottom swapped depending on 'bit'):
    ///
    /// ```text
    /// if top_byte < 0xff {
    ///  top_byte += 1;
    /// } else if bottom_byte != 1 {
    ///  top_byte = 0x81;
    ///  bottom_byte = (bottom_byte + 1) >> 1;
    /// }
    /// ```
    #[inline(always)]
    pub fn record_and_update_bit(&mut self, bit: bool) {
        // rotation is used to update either the true or false counter
        // this allows the same code to be used without branching,
        // which makes the CPU about 20% happier.
        //
        // Since the bits are randomly 1/0, the CPU branch predictor does
        // a terrible job and ends up wasting a lot of time. Normally
        // branches are a better idea if the branch is very predictable vs
        // this case where it is better to always pay the price of the
        // extra rotation to avoid the branch.
        //
        // rotating by 8 swaps the two bytes, so the counter being updated is
        // always in the top byte (a rotation by 0 leaves the value as is)
        let orig = self.counts.rotate_left(bit as u32 * 8);

        // add one to the top byte; `o` is set when that byte was already 0xff
        let (mut sum, o) = orig.overflowing_add(0x100);
        if o {
            // normalize, except in special case where we have 0xff or more same bits in a row
            // in which case we want to bias the probability to get better compression
            //
            // CPU branch prediction soon realizes that this section is not often executed
            // and will optimize for the common case where the counts are not 0xff.
            let mask = if orig == 0xff01 { 0xff00 } else { 0x8100 };

            // upper byte is 0 since we incremented 0xffxx so we don't have to mask it
            sum = ((1 + sum) >> 1) | mask;
        }

        // rotate back so that the bytes are in their original positions again
        self.counts = sum.rotate_left(bit as u32 * 8);
    }
}

/// Checks the counter update for a `false` bit, including the overflow cases.
#[test]
fn test_branch_update_false() {
    // (counts before, counts expected after recording a false bit)
    let cases: [(u16, u16); 5] = [
        (0x0101, 0x0201),
        (0x80ff, 0x81ff),
        (0xff01, 0xff01),
        (0xff02, 0x8101),
        (0xffff, 0x8180),
    ];

    for (before, expected) in cases {
        let mut b = Branch { counts: before };
        b.record_and_update_bit(false);
        assert_eq!(b.counts, expected);
    }
}

/// Checks the counter update for a `true` observation, including the
/// saturation and normalization corner cases.
#[test]
fn test_branch_update_true() {
    // (counts before, counts expected after recording a `true` bit)
    let cases: [(u16, u16); 5] = [
        (0x0101, 0x0102),
        (0xff80, 0xff81),
        (0x01ff, 0x01ff),
        (0x02ff, 0x0181),
        (0xffff, 0x8081),
    ];

    for (before, after) in cases {
        let mut b = Branch { counts: before };
        b.record_and_update_bit(true);
        assert_eq!(b.counts, after);
    }
}

/// run through all the possible combinations of counts and ensure that the probability is the same
///
/// Every 16-bit starting value with two non-zero bytes is fed through ten consecutive `false`
/// updates and, separately, ten consecutive `true` updates. After each update the probability
/// reported by `Branch` is compared with the one computed by a reference implementation.
#[test]
fn test_all_probabilities() {
    /// This is copied from the C++ implementation to ensure that the behavior is the same
    struct OriginalImplForTest {
        // counts[0] is the number of `false` observations, counts[1] the number of `true` ones
        counts: [u8; 2],
        // probability computed by the most recent call to record_obs_and_update
        probability: u8,
    }

    impl OriginalImplForTest {
        fn true_count(&self) -> u32 {
            return self.counts[1] as u32;
        }
        fn false_count(&self) -> u32 {
            return self.counts[0] as u32;
        }

        // records one observation and recomputes `probability`
        fn record_obs_and_update(&mut self, obs: bool) {
            let fcount = self.counts[0] as u32;
            let tcount = self.counts[1] as u32;

            // the counter for the observed value is already saturated
            let overflow = self.counts[obs as usize] == 0xff;

            if overflow {
                // check less than 512
                let neverseen = self.counts[!obs as usize] == 1;
                if neverseen {
                    // the opposite counter is still 1: keep the counts and pin the probability
                    self.counts[obs as usize] = 0xff;
                    self.probability = if obs { 0 } else { 255 };
                } else {
                    // halve both counters (rounding up), then set the observed counter to 129
                    self.counts[0] = ((1 + fcount) >> 1) as u8;
                    self.counts[1] = ((1 + tcount) >> 1) as u8;
                    self.counts[obs as usize] = 129;
                    self.probability = self.optimize(self.counts[0] as u32 + self.counts[1] as u32);
                }
            } else {
                self.counts[obs as usize] += 1;
                self.probability = self.optimize(fcount + tcount + 1);
            }
        }

        // false_count scaled by 256 and divided by `sum`, truncated to 8 bits
        fn optimize(&self, sum: u32) -> u8 {
            let prob = (self.false_count() << 8) / sum;

            prob as u8
        }
    }

    for i in 0u16..=65535 {
        // the high byte of `i` seeds the false counter, the low byte the true counter
        let mut old_f = OriginalImplForTest {
            counts: [(i >> 8) as u8, i as u8],
            probability: 0,
        };

        if old_f.true_count() == 0 || old_f.false_count() == 0 {
            // starting counts can't be zero (we use 0 as an internal special value for the new implementation for the edge case of many trues in a row)
            continue;
        }

        let mut new_f = Branch { counts: i };

        // a run of `false` observations must yield identical probabilities
        for _k in 0..10 {
            old_f.record_obs_and_update(false);
            new_f.record_and_update_bit(false);
            assert_eq!(old_f.probability, new_f.get_probability());
        }

        let mut old_t = OriginalImplForTest {
            counts: [(i >> 8) as u8, i as u8],
            probability: 0,
        };
        let mut new_t = Branch { counts: i };

        // a run of `true` observations matches except for the probability-0 corner case
        for _k in 0..10 {
            old_t.record_obs_and_update(true);
            new_t.record_and_update_bit(true);

            if old_t.probability == 0 {
                // there is a change of behavior here compared to the C++ version,
                // but because of the way split is calculated it doesn't result in an
                // overall change in the way that encoding is done, but it does simplify
                // one of the corner cases.
                assert_eq!(new_t.get_probability(), 1);
            } else {
                assert_eq!(old_t.probability, new_t.get_probability());
            }
        }
    }
}


================================================
FILE: lib/src/structs/idct.rs
================================================
/*---------------------------------------------------------------------------------------------
 *  Copyright (c) Microsoft Corporation. All rights reserved.
 *  Licensed under the Apache License, Version 2.0. See LICENSE.txt in the project root for license information.
 *  This software incorporates material from third parties. See NOTICE.txt for details.
 *--------------------------------------------------------------------------------------------*/

use bytemuck::cast;
use wide::{i16x8, i32x8};

use crate::jpeg::block_based_image::AlignedBlock;

// Fixed-point cosine factors used by the IDCT butterflies.
// Each one is 2048 * sqrt(2) * cos(k * pi / 16), rounded to an integer.
const _W1: i32 = 2841; // k = 1
const _W2: i32 = 2676; // k = 2
const _W3: i32 = 2408; // k = 3
const _W5: i32 = 1609; // k = 5
const _W6: i32 = 1108; // k = 6
const _W7: i32 = 565; // k = 7

// Un-prefixed names for the factors that are also used directly.
const W3: i32 = _W3;
const W6: i32 = _W6;
const W7: i32 = _W7;

// Sums and differences of factor pairs that appear in the butterflies.
const W1PW7: i32 = _W1 + _W7;
const W1MW7: i32 = _W1 - _W7;
const W2PW6: i32 = _W2 + _W6;
const W2MW6: i32 = _W2 - _W6;
const W3PW5: i32 = _W3 + _W5;
const W3MW5: i32 = _W3 - _W5;

// 256 / sqrt(2), rounded to an integer
const R2: i32 = 181;

/// Runs the fixed-point 8x8 inverse DCT on a block of already dequantized coefficients.
///
/// `block` holds eight `i32x8` vectors. The unit test in this file builds them by multiplying
/// the rows of the transposed coefficient block with the rows of the transposed quantization
/// table, so the input is expected in that transposed layout.
///
/// The work is done as two 1-D passes. Every statement operates on whole vectors, so each pass
/// processes eight independent 1-D transforms at once (one per lane). The result of the first
/// pass is transposed before the second pass runs.
///
/// The arithmetic mirrors the scalar reference implementation used by the unit test, which
/// asserts that both produce identical output. Intermediate registers (`xv*` / `yv*`) are
/// reused between stages, so the order of the statements matters.
///
/// Returns the result as 16-bit values (truncated from 32 bits) wrapped in an `AlignedBlock`.
#[inline(always)]
pub fn run_idct(block: &[i32x8; 8]) -> AlignedBlock {
    let t = *block;

    // Prescale: vectors 0 and 4 are shifted up by 11 bits, and a rounding term of 128
    // is added to vector 0.
    let mut xv0 = (t[0] << 11) + 128;
    let mut xv1 = t[1];
    let mut xv2 = t[2];
    let mut xv3 = t[3];
    let mut xv4 = t[4] << 11;
    let mut xv5 = t[5];
    let mut xv6 = t[6];
    let mut xv7 = t[7];

    // Stage 1.
    let mut xv8 = _W7 * (xv1 + xv7);
    xv1 = xv8 + (W1MW7 * xv1);
    xv7 = xv8 - (W1PW7 * xv7);
    xv8 = _W3 * (xv5 + xv3);
    xv5 = xv8 - (W3MW5 * xv5);
    xv3 = xv8 - (W3PW5 * xv3);

    // Stage 2.
    xv8 = xv0 + xv4;
    xv0 -= xv4;
    xv4 = W6 * (xv2 + xv6);
    xv6 = xv4 - (W2PW6 * xv6);
    xv2 = xv4 + (W2MW6 * xv2);
    xv4 = xv1 + xv5;
    xv1 -= xv5;
    xv5 = xv7 + xv3;
    xv7 -= xv3;

    // Stage 3.
    xv3 = xv8 + xv2;
    xv8 -= xv2;
    xv2 = xv0 + xv6;
    xv0 -= xv6;
    xv6 = ((R2 * (xv1 + xv7)) + 128) >> 8;
    xv1 = ((R2 * (xv1 - xv7)) + 128) >> 8;

    // Stage 4.
    // The first pass drops 8 bits of precision on its way out.
    let row = [
        (xv3 + xv4) >> 8,
        (xv2 + xv6) >> 8,
        (xv0 + xv1) >> 8,
        (xv8 + xv5) >> 8,
        (xv8 - xv5) >> 8,
        (xv0 - xv1) >> 8,
        (xv2 - xv6) >> 8,
        (xv3 - xv4) >> 8,
    ];

    // transpose and now do vertical
    let [
        mut yv0,
        mut yv1,
        mut yv2,
        mut yv3,
        mut yv4,
        mut yv5,
        mut yv6,
        mut yv7,
    ] = i32x8::transpose(row);

    // Prescale for the second pass: vectors 0 and 4 are shifted up by 8 bits, and a
    // rounding term of 8192 is added to vector 0.
    yv0 = (yv0 << 8) + 8192;
    yv4 = yv4 << 8;

    // Stage 1.
    // Unlike the first pass, the products get a rounding term of 4 and are shifted down by 3.
    let mut yv8 = (W7 * (yv1 + yv7)) + 4;
    yv1 = (yv8 + (W1MW7 * yv1)) >> 3;
    yv7 = (yv8 - (W1PW7 * yv7)) >> 3;
    yv8 = (W3 * (yv5 + yv3)) + 4;
    yv5 = (yv8 - (W3MW5 * yv5)) >> 3;
    yv3 = (yv8 - (W3PW5 * yv3)) >> 3;

    // Stage 2.
    yv8 = yv0 + yv4;
    yv0 -= yv4;
    yv4 = ((W6) * (yv2 + yv6)) + 4;
    yv6 = (yv4 - (W2PW6 * yv6)) >> 3;
    yv2 = (yv4 + (W2MW6 * yv2)) >> 3;
    yv4 = yv1 + yv5;
    yv1 -= yv5;
    yv5 = yv7 + yv3;
    yv7 -= yv3;

    // Stage 3.
    yv3 = yv8 + yv2;
    yv8 -= yv2;
    yv2 = yv0 + yv6;
    yv0 -= yv6;
    yv6 = ((R2 * (yv1 + yv7)) + 128) >> 8;
    yv1 = ((R2 * (yv1 - yv7)) + 128) >> 8;

    // Stage 4.
    // Final scale-down by 11 bits, then each 32-bit lane is truncated to 16 bits.
    AlignedBlock::new(cast([
        i16x8::from_i32x8_truncate((yv3 + yv4) >> 11),
        i16x8::from_i32x8_truncate((yv2 + yv6) >> 11),
        i16x8::from_i32x8_truncate((yv0 + yv1) >> 11),
        i16x8::from_i32x8_truncate((yv8 + yv5) >> 11),
        i16x8::from_i32x8_truncate((yv8 - yv5) >> 11),
        i16x8::from_i32x8_truncate((yv0 - yv1) >> 11),
        i16x8::from_i32x8_truncate((yv2 - yv6) >> 11),
        i16x8::from_i32x8_truncate((yv3 - yv4) >> 11),
    ]))
}

#[cfg(test)]
use bytemuck::cast_ref;

/// Test helper: reinterprets the block as eight rows of unsigned 16-bit values and
/// returns row `offset` widened to 32-bit lanes.
#[cfg(test)]
#[inline(always)]
fn get_q(offset: usize, q_transposed: &AlignedBlock) -> i32x8 {
    let rows = cast_ref::<_, [wide::u16x8; 8]>(q_transposed.get_block());
    let row = rows[offset];

    i32x8::from_u16x8(row)
}

/// Test helper: reinterprets the block as eight rows of signed 16-bit values and
/// returns row `offset` sign-extended to 32-bit lanes.
#[cfg(test)]
#[inline(always)]
fn get_c(offset: usize, q_transposed: &AlignedBlock) -> i32x8 {
    let rows = cast_ref::<_, [i16x8; 8]>(q_transposed.get_block());
    let row = rows[offset];

    i32x8::from_i16x8(row)
}

/// Test helper that checks `run_idct` against a scalar reference implementation.
///
/// `test_data` holds the quantized coefficients and `test_q` the quantization table.
/// The vectorized path is fed the element-wise product of the transposed coefficients and
/// the transposed quantization table, while the scalar path dequantizes on the fly.
/// The helper panics if the two outputs differ.
#[cfg(test)]
fn test_idct(test_data: &AlignedBlock, test_q: &[u16; 64]) {
    use std::num::Wrapping;

    // dequantizes a single coefficient using wrapping 32-bit arithmetic
    fn mul(a: i16, b: u16) -> Wrapping<i32> {
        return Wrapping(a as i32) * Wrapping(b as i32);
    }

    // Scalar reference implementation. All arithmetic is done with `Wrapping<i32>`.
    // The x*/y* registers are reused between stages, so statement order matters.
    pub fn run_idct_old(
        block: &AlignedBlock,
        q: &[u16; 64],
        outp: &mut [i16; 64],
        ignore_dc: bool,
    ) {
        let mut intermed = [Wrapping(0i32); 64];

        // Horizontal 1-D IDCT.
        for y in 0..8 {
            let y8: usize = y * 8;

            // the `+ Wrapping(128)` rounding term applies to the result of the whole if/else
            let mut x0 = if ignore_dc && y == 0 {
                Wrapping(0)
            } else {
                mul(block.get_coefficient(y8 + 0), q[y8 + 0]) << 11
            } + Wrapping(128);
            let mut x1 = mul(block.get_coefficient(y8 + 4), q[y8 + 4]) << 11;
            let mut x2 = mul(block.get_coefficient(y8 + 6), q[y8 + 6]);
            let mut x3 = mul(block.get_coefficient(y8 + 2), q[y8 + 2]);
            let mut x4 = mul(block.get_coefficient(y8 + 1), q[y8 + 1]);
            let mut x5 = mul(block.get_coefficient(y8 + 7), q[y8 + 7]);
            let mut x6 = mul(block.get_coefficient(y8 + 5), q[y8 + 5]);
            let mut x7 = mul(block.get_coefficient(y8 + 3), q[y8 + 3]);

            // If all the AC components are zero, then the IDCT is trivial.
            if x1 == Wrapping(0)
                && x2 == Wrapping(0)
                && x3 == Wrapping(0)
                && x4 == Wrapping(0)
                && x5 == Wrapping(0)
                && x6 == Wrapping(0)
                && x7 == Wrapping(0)
            {
                // undo the rounding term and scale down: every output of the row is the same
                let dc = (x0 - Wrapping(128)) >> 8;
                intermed[y8 + 0] = dc;
                intermed[y8 + 1] = dc;
                intermed[y8 + 2] = dc;
                intermed[y8 + 3] = dc;
                intermed[y8 + 4] = dc;
                intermed[y8 + 5] = dc;
                intermed[y8 + 6] = dc;
                intermed[y8 + 7] = dc;
                continue;
            }

            // Prescale.

            // Stage 1.
            let mut x8 = Wrapping(W7) * (x4 + x5);
            x4 = x8 + (Wrapping(W1MW7) * x4);
            x5 = x8 - (Wrapping(W1PW7) * x5);
            x8 = Wrapping(W3) * (x6 + x7);
            x6 = x8 - (Wrapping(W3MW5) * x6);
            x7 = x8 - (Wrapping(W3PW5) * x7);

            // Stage 2.
            x8 = x0 + x1;
            x0 -= x1;
            x1 = Wrapping(W6) * (x3 + x2);
            x2 = x1 - (Wrapping(W2PW6) * x2);
            x3 = x1 + (Wrapping(W2MW6) * x3);
            x1 = x4 + x6;
            x4 -= x6;
            x6 = x5 + x7;
            x5 -= x7;

            // Stage 3.
            x7 = x8 + x3;
            x8 -= x3;
            x3 = x0 + x2;
            x0 -= x2;
            x2 = ((Wrapping(R2) * (x4 + x5)) + Wrapping(128)) >> 8;
            x4 = ((Wrapping(R2) * (x4 - x5)) + Wrapping(128)) >> 8;

            // Stage 4.
            intermed[y8 + 0] = (x7 + x1) >> 8;
            intermed[y8 + 1] = (x3 + x2) >> 8;
            intermed[y8 + 2] = (x0 + x4) >> 8;
            intermed[y8 + 3] = (x8 + x6) >> 8;
            intermed[y8 + 4] = (x8 - x6) >> 8;
            intermed[y8 + 5] = (x0 - x4) >> 8;
            intermed[y8 + 6] = (x3 - x2) >> 8;
            intermed[y8 + 7] = (x7 - x1) >> 8;
        }

        // Vertical 1-D IDCT.
        for x in 0..8 {
            // Similar to the horizontal 1-D IDCT case, if all the AC components are zero, then the IDCT is trivial.
            // However, after performing the horizontal 1-D IDCT, there are typically non-zero AC components, so
            // we do not bother to check for the all-zero case.

            // Prescale.
            let mut y0 = (intermed[(8 * 0) + x] << 8) + Wrapping(8192);
            let mut y1 = intermed[(8 * 4) + x] << 8;
            let mut y2 = intermed[(8 * 6) + x];
            let mut y3 = intermed[(8 * 2) + x];
            let mut y4 = intermed[(8 * 1) + x];
            let mut y5 = intermed[(8 * 7) + x];
            let mut y6 = intermed[(8 * 5) + x];
            let mut y7 = intermed[(8 * 3) + x];

            // Stage 1.
            let mut y8 = (Wrapping(W7) * (y4 + y5)) + Wrapping(4);
            y4 = (y8 + (Wrapping(W1MW7) * y4)) >> 3;
            y5 = (y8 - (Wrapping(W1PW7) * y5)) >> 3;
            y8 = (Wrapping(W3) * (y6 + y7)) + Wrapping(4);
            y6 = (y8 - (Wrapping(W3MW5) * y6)) >> 3;
            y7 = (y8 - (Wrapping(W3PW5) * y7)) >> 3;

            // Stage 2.
            y8 = y0 + y1;
            y0 -= y1;
            y1 = (Wrapping(W6) * (y3 + y2)) + Wrapping(4);
            y2 = (y1 - (Wrapping(W2PW6) * y2)) >> 3;
            y3 = (y1 + (Wrapping(W2MW6) * y3)) >> 3;
            y1 = y4 + y6;
            y4 -= y6;
            y6 = y5 + y7;
            y5 -= y7;

            // Stage 3.
            y7 = y8 + y3;
            y8 -= y3;
            y3 = y0 + y2;
            y0 -= y2;
            y2 = ((Wrapping(R2) * (y4 + y5)) + Wrapping(128)) >> 8;
            y4 = ((Wrapping(R2) * (y4 - y5)) + Wrapping(128)) >> 8;

            // Stage 4.
            // scale down by 11 bits and truncate to 16 bits
            outp[(8 * 0) + x] = ((y7 + y1) >> 11).0 as i16;
            outp[(8 * 1) + x] = ((y3 + y2) >> 11).0 as i16;
            outp[(8 * 2) + x] = ((y0 + y4) >> 11).0 as i16;
            outp[(8 * 3) + x] = ((y8 + y6) >> 11).0 as i16;
            outp[(8 * 4) + x] = ((y8 - y6) >> 11).0 as i16;
            outp[(8 * 5) + x] = ((y0 - y4) >> 11).0 as i16;
            outp[(8 * 6) + x] = ((y3 - y2) >> 11).0 as i16;
            outp[(8 * 7) + x] = ((y7 - y1) >> 11).0 as i16;
        }
    }

    // build the input for the vectorized implementation: transposed coefficients
    // multiplied lane by lane with the transposed quantization table
    let q = AlignedBlock::new(cast(*test_q));
    let data_tr = test_data.transpose();
    let q_tr = q.transpose();

    let mut raster: [i32x8; 8] = [0.into(); 8]; // transposed
    for col in 0..8 {
        raster[col] = get_c(col, &data_tr) * get_q(col, &q_tr);
    }

    let outp = run_idct(&raster);

    // the reference implementation works on the untransposed inputs
    let mut outp2 = [0; 64];
    run_idct_old(test_data, test_q, &mut outp2, false);

    assert_eq!(*outp.get_block(), outp2);
}

/// Smoke test: a block with only two non-zero coefficients and an almost flat
/// quantization table, meant to catch obvious mistakes.
#[test]
pub fn test_idct_with_simple_block() {
    // every quantization step is 1 except for the first entry
    let mut test_q = [1u16; 64];
    test_q[0] = 2;

    // only the first two coefficients are set
    let mut test_data = AlignedBlock::default();
    test_data.set_coefficient(0, 1000);
    test_data.set_coefficient(1, -1000);

    test_idct(&test_data, &test_q);
}

/// Feeds 16 pseudo-random blocks (fixed seed, so the run is reproducible) through the
/// comparison helper to verify that the current implementation matches the legacy
/// implementation from the original scalar C++ code.
#[test]
pub fn test_idct_with_random_blocks() {
    use rand::Rng;

    let mut rng = crate::helpers::get_rand_from_seed([0u8; 32]);
    let mut test_data = AlignedBlock::default();
    let mut test_q = [0u16; 64];

    for _ in 0..16 {
        // draw a coefficient and then its quantization step, position by position
        let coefficients = test_data.get_block_mut();
        for (coefficient, quantizer) in coefficients.iter_mut().zip(test_q.iter_mut()) {
            *coefficient = rng.gen_range(i16::MIN..=i16::MAX);
            *quantizer = rng.gen_range(0..=u8::MAX as u16);
        }

        test_idct(&test_data, &test_q);
    }
}

/// Builds a benchmark closure for the IDCT: every call runs `run_idct` once on a
/// fixed, non-trivial input block.
#[cfg(any(test, feature = "micro_benchmark"))]
#[inline(never)]
pub fn benchmark_idct() -> Box<dyn FnMut()> {
    // row `i` holds the values 1..=8 scaled by `i + 1`
    let lanes = i32x8::from([1, 2, 3, 4, 5, 6, 7, 8]);
    let mut block = [i32x8::ZERO; 8];
    for (i, row) in block.iter_mut().enumerate() {
        *row = lanes * (i as i32 + 1);
    }

    Box::new(move || {
        use std::hint::black_box;

        // black_box keeps the optimizer from discarding the result
        black_box(run_idct(&block));
    })
}

/// Makes sure the benchmark closure can be built and invoked repeatedly.
#[test]
fn test_benchmark_idct() {
    let mut run_once = benchmark_idct();
    (0..100).for_each(|_| run_once());
}


================================================
FILE: lib/src/structs/lepton_decoder.rs
================================================
/*---------------------------------------------------------------------------------------------
 *  Copyright (c) Microsoft Corporation. All rights reserved.
 *  Licensed under the Apache License, Version 2.0. See LICENSE.txt in the project root for license information.
 *  This software incorporates material from third parties. See NOTICE.txt for details.
 *--------------------------------------------------------------------------------------------*/

use std::cmp;
use std::io::Read;

use bytemuck::cast_mut;
use default_boxed::DefaultBoxed;
use wide::i32x8;

use crate::Result;
use crate::consts::UNZIGZAG_49_TR;
use crate::enabled_features::EnabledFeatures;
use crate::helpers::u16_bit_length;
use crate::jpeg::block_based_image::{AlignedBlock, BlockBasedImage};
use crate::jpeg::jpeg_header::JpegHeader;
use crate::jpeg::row_spec::RowSpec;
use crate::jpeg::truncate_components::*;
use crate::lepton_error::{AddContext, ExitCode, err_exit_code};
use crate::metrics::Metrics;
use crate::structs::block_context::{BlockContext, NeighborData};
use crate::structs::model::{Model, ModelPerColor};
use crate::structs::neighbor_summary::NeighborSummary;
use crate::structs::probability_tables::ProbabilityTables;
use crate::structs::quantization_tables::QuantizationTables;
use crate::structs::vpx_bool_reader::VPXBoolReader;

/// reads stream from reader and populates image_data with the decoded data
/// the row_callback is called each time a full MCU row is decoded. This allows
/// the caller to process rows as they are decoded instead of waiting for the
/// entire image to be decoded.
#[inline(never)] // don't inline so that the profiler can get proper data
pub fn lepton_decode_row_range<R: Read, ROW: FnMut(&RowSpec, &[BlockBasedImage]) -> Result<()>>(
    qt: &[QuantizationTables],
    jpeg_header: &JpegHeader,
    trunc: &TruncateComponents,
    reader: &mut R,
    min_y: u32,
    max_y: u32,
    is_last_thread: bool,
    full_file_compression: bool,
    features: &EnabledFeatures,
    mut row_callback: ROW,
) -> Result<(Metrics, Vec<BlockBasedImage>)> {
    let component_size_in_blocks = trunc.get_component_sizes_in_blocks();
    let max_coded_heights = trunc.get_max_coded_heights();

    let mut image_data = Vec::new();
    for i in 0..jpeg_header.cmpc {
        image_data.push(BlockBasedImage::new(
            &jpeg_header,
            i,
            min_y,
            if is_last_thread {
                // if this is the last thread, then the image should extend all the way to the bottom
                jpeg_header.cmp_info[0].bcv
            } else {
                max_y
            },
        )?);
    }

    let mut is_top_row = Vec::new();
    let mut neighbor_summary_cache = Vec::new();

    // Init helper structures
    for i in 0..image_data.len() {
        is_top_row.push(true);

        let num_non_zeros_length = (image_data[i].get_block_width() << 1) as usize;

        let mut num_non_zero_list = Vec::new();
        num_non_zero_list.resize(num_non_zeros_length, NeighborSummary::default());

        neighbor_summary_cache.push(num_non_zero_list);
    }

    let mut model = Model::default_boxed();
    let mut bool_reader = VPXBoolReader::new(reader)?;

    let mut decode_index = 0;

    loop {
        let cur_row = RowSpec::get_row_spec_from_index(
            decode_index,
            &image_data[..],
            trunc.mcu_count_vertical,
            &max_coded_heights,
        );
        decode_index += 1;

        if cur_row.done {
            break;
        }

        if cur_row.luma_y >= max_y && !(is_last_thread && full_file_compression) {
            break;
        }

        if cur_row.skip {
            continue;
        }

        if cur_row.luma_y < min_y {
            continue;
        }

        let left_model;
        let middle_model;

        let component = cur_row.component;
        if is_top_row[component] {
            is_top_row[component] = false;

            left_model = &super::probability_tables::NO_NEIGHBORS;
            middle_model = &super::probability_tables::LEFT_ONLY;
        } else {
            left_model = &super::probability_tables::TOP_ONLY;
            middle_model = &super::probability_tables::ALL;
        }

        decode_row_wrapper(
            &mut model,
            &mut bool_reader,
            left_model,
            middle_model,
            ProbabilityTables::get_color_index(component),
            &mut image_data[component],
            &qt[component],
            &mut neighbor_summary_cache[component],
            cur_row.curr_y,
            component_size_in_blocks[component],
            features,
        )
        .context()?;

        if cur_row.last_row_to_complete_mcu {
            row_callback(&cur_row, &image_data[..]).context()?;
        }
    }
    Ok((bool_reader.drain_stats(), image_data))
}

/// Decodes one row of blocks for a single component.
///
/// The first block of the row is decoded with `left_model`, all remaining blocks with
/// `middle_model`. Decoding of the row ends early once the block context has advanced
/// to or past `component_size_in_blocks`.
#[inline(never)] // don't inline so that the profiler can get proper data
fn decode_row_wrapper<R: Read>(
    model: &mut Model,
    bool_reader: &mut VPXBoolReader<R>,
    left_model: &ProbabilityTables,
    middle_model: &ProbabilityTables,
    color_index: usize,
    image_data: &mut BlockBasedImage,
    qt: &QuantizationTables,
    neighbor_summary_cache: &mut [NeighborSummary],
    curr_y: u32,
    component_size_in_blocks: u32,
    features: &EnabledFeatures,
) -> Result<()> {
    let mut block_context = BlockContext::off_y(curr_y, image_data);

    let block_width = image_data.get_block_width();

    for jpeg_x in 0..block_width {
        // only the very first block of the row uses the left-edge model
        let pt = match jpeg_x {
            0 => left_model,
            _ => middle_model,
        };

        // dispatch to the variant specialized on whether all neighbors are present
        if pt.is_all_present() {
            parse_token::<R, true>(
                model,
                bool_reader,
                image_data,
                &block_context,
                neighbor_summary_cache,
                qt,
                pt,
                color_index,
                features,
            )
            .context()?;
        } else {
            parse_token::<R, false>(
                model,
                bool_reader,
                image_data,
                &block_context,
                neighbor_summary_cache,
                qt,
                pt,
                color_index,
                features,
            )
            .context()?;
        }

        // advance to the next block; stop when we run off the end of the component
        // (not sure if this is an error)
        if block_context.next() >= component_size_in_blocks {
            break;
        }
    }

    Ok(())
}

/// Decodes the block at the current position of `context`: gathers the neighbor data,
/// reads the coefficient block, stores its neighbor summary in the cache and appends
/// the block to `image_data`.
#[inline(never)] // don't inline so that the profiler can get proper data
fn parse_token<R: Read, const ALL_PRESENT: bool>(
    model: &mut Model,
    bool_reader: &mut VPXBoolReader<R>,
    image_data: &mut BlockBasedImage,
    context: &BlockContext,
    neighbor_summary_cache: &mut [NeighborSummary],
    qt: &QuantizationTables,
    pt: &ProbabilityTables,
    color_index: usize,
    features: &EnabledFeatures,
) -> Result<()> {
    debug_assert!(pt.is_all_present() == ALL_PRESENT);

    let (block, summary) = {
        // the neighbor data is only needed while the block is being read
        let neighbors =
            context.get_neighbor_data::<ALL_PRESENT>(image_data, neighbor_summary_cache, pt);

        read_coefficient_block::<ALL_PRESENT, R>(
            pt,
            color_index,
            &neighbors,
            model,
            bool_reader,
            qt,
            features,
        )?
    };

    context.set_neighbor_summary_here(neighbor_summary_cache, summary);
    image_data.append_block(block);

    Ok(())
}

/// Reads one 8x8 coefficient block from the bool reader, using the neighboring blocks,
/// the probability tables and the model as predictors.
///
/// The block is read in three steps: the 49 inner (7x7) coefficients, the edge
/// coefficients, and finally the DC coefficient.
///
/// The function does not need the block context or the image data, so it can be called
/// (and unit tested) on its own.
///
/// Returns the decoded block together with the neighbor summary that serves as a predictor
/// for subsequent blocks. Fails with `StreamInconsistent` if the number of non-zero 7x7
/// coefficients read from the stream exceeds 49 or does not match the coefficients found.
pub fn read_coefficient_block<const ALL_PRESENT: bool, R: Read>(
    pt: &ProbabilityTables,
    color_index: usize,
    neighbor_data: &NeighborData,
    model: &mut Model,
    bool_reader: &mut VPXBoolReader<R>,
    qt: &QuantizationTables,
    features: &EnabledFeatures,
) -> Result<(AlignedBlock, NeighborSummary)> {
    let model_per_color = model.get_per_color(color_index);

    // step 1, read the 49 inner coefficients

    // the neighbors determine which context bin predicts the non-zero count
    let context_bin = pt.calc_num_non_zeros_7x7_context_bin::<ALL_PRESENT>(neighbor_data);

    // the count lets us stop the loop early and also acts as a predictor for the model
    let num_non_zeros_7x7 = model_per_color.read_non_zero_7x7_count(bool_reader, context_bin)?;

    if num_non_zeros_7x7 > 49 {
        // most likely a stream or model synchronization error
        return err_exit_code(ExitCode::StreamInconsistent, "numNonzeros7x7 > 49");
    }

    let mut output = AlignedBlock::default();
    let mut raster = [i32x8::ZERO; 8];
    let raster_col: &mut [i32; 64] = cast_mut(&mut raster);

    // furthest x and y coordinates holding a non-zero coefficient, used later as predictors
    // for the number of non-zero edge coefficients (32 bit math is faster on most platforms)
    let mut eob_x: u32 = 0;
    let mut eob_y: u32 = 0;

    let mut remaining = num_non_zeros_7x7 as usize;

    if remaining > 0 {
        let best_priors = pt.calc_coefficient_context_7x7_aavg_block::<ALL_PRESENT>(
            neighbor_data.left,
            neighbor_data.above,
            neighbor_data.above_left,
        );

        // the bin depends on how many non-zeros are still outstanding
        let mut bin = ProbabilityTables::num_non_zeros_to_bin_7x7(remaining);

        // walk the coefficients in zigzag order until all non-zeros have been found
        for (zig49, &coord_tr) in UNZIGZAG_49_TR.iter().enumerate() {
            let index = coord_tr as usize;
            let prior_bit_length = u16_bit_length(best_priors[index]);

            let coef =
                model_per_color.read_coef(bool_reader, zig49, bin, prior_bit_length as usize)?;

            if coef == 0 {
                continue;
            }

            let by = u32::from(coord_tr) & 7;
            let bx = u32::from(coord_tr) >> 3;

            debug_assert!(bx > 0 && by > 0, "this does the DC and the lower 7x7 AC");

            eob_x = cmp::max(eob_x, bx);
            eob_y = cmp::max(eob_y, by);

            // store both the raw coefficient and its dequantized value
            output.set_coefficient(index, coef);
            raster_col[index] =
                i32::from(coef) * i32::from(qt.get_quantization_table_transposed()[index]);

            remaining -= 1;
            if remaining == 0 {
                break;
            }

            // one fewer non-zero outstanding, so the bin may have changed
            bin = ProbabilityTables::num_non_zeros_to_bin_7x7(remaining);
        }
    }

    if remaining > 0 {
        return err_exit_code(
            ExitCode::StreamInconsistent,
            "not enough nonzeros in 7x7 block",
        );
    }

    // step 2, read the edge coefficients
    // This also produces the first part of the edge coefficient predictions for the
    // neighboring blocks and completes the transposed raster of dequantized coefficients
    // (with 0 in the DC position).
    let (horiz_pred, vert_pred) = decode_edge::<R, ALL_PRESENT>(
        neighbor_data,
        model_per_color,
        bool_reader,
        &mut output,
        qt,
        pt,
        num_non_zeros_7x7,
        &mut raster,
        eob_x as u8,
        eob_y as u8,
    )?;

    // step 3, read the DC coefficient (0,0 of the block)
    let q0 = qt.get_quantization_table()[0] as i32;
    let predicted_dc = pt.adv_predict_dc_pix::<ALL_PRESENT>(&raster, q0, &neighbor_data, features);

    let dc_delta = model.read_dc(
        bool_reader,
        color_index,
        predicted_dc.uncertainty,
        predicted_dc.uncertainty2,
    )?;

    let dc =
        ProbabilityTables::adv_predict_or_unpredict_dc(dc_delta, true, predicted_dc.predicted_dc);
    output.set_dc(dc as i16);

    // the neighbor summary is used as a predictor for the next block
    let neighbor_summary = NeighborSummary::new(
        predicted_dc.next_edge_pixels_h,
        predicted_dc.next_edge_pixels_v,
        output.get_dc() as i32 * q0,
        num_non_zeros_7x7,
        horiz_pred,
        vert_pred,
    );

    Ok((output, neighbor_summary))
}

/// Reads both edges of the block: first the horizontal edge, then the vertical one.
///
/// The predictors for the current block's edges are computed once from the neighbor data
/// and the raster before either edge is read.
///
/// Returns the predictors computed from the completed raster by
/// `ProbabilityTables::predict_next_edges`, for use by later blocks.
//#[inline(never)] // don't inline so that the profiler can get proper data
fn decode_edge<R: Read, const ALL_PRESENT: bool>(
    neighbor_data: &NeighborData,
    model_per_color: &mut ModelPerColor,
    bool_reader: &mut VPXBoolReader<R>,
    here_mut: &mut AlignedBlock,
    qt: &QuantizationTables,
    pt: &ProbabilityTables,
    num_non_zeros_7x7: u8,
    raster: &mut [i32x8; 8],
    eob_x: u8,
    eob_y: u8,
) -> Result<(i32x8, i32x8)> {
    let num_non_zeros_bin = (num_non_zeros_7x7 + 3) / 7;

    // get predictors for edge coefficients of the current block
    let (curr_horiz_pred, curr_vert_pred) =
        ProbabilityTables::predict_current_edges(neighbor_data, raster);
    let horiz_pred = curr_horiz_pred.to_array();
    let vert_pred = curr_vert_pred.to_array();

    // horizontal edge, predicted by the furthest non-zero x position
    decode_one_edge::<R, ALL_PRESENT, true>(
        model_per_color,
        bool_reader,
        &horiz_pred,
        here_mut,
        qt,
        pt,
        num_non_zeros_bin,
        eob_x,
        cast_mut(raster),
    )?;

    // vertical edge, predicted by the furthest non-zero y position
    decode_one_edge::<R, ALL_PRESENT, false>(
        model_per_color,
        bool_reader,
        &vert_pred,
        here_mut,
        qt,
        pt,
        num_non_zeros_bin,
        eob_y,
        cast_mut(raster),
    )?;

    // prepare predictors for edge coefficients of the blocks below and to the right of current one
    Ok(ProbabilityTables::predict_next_edges(raster))
}

/// Reads the seven coefficients of one edge of the block.
///
/// `HORIZONTAL` selects the edge: the horizontal edge steps through the transposed block
/// with a stride of 8 and starts at zig15 offset 0, the vertical edge uses a stride of 1
/// and starts at zig15 offset 7.
///
/// The number of non-zero coefficients on the edge is read first. Reading stops as soon as
/// that many non-zero coefficients have been found. Every non-zero coefficient is stored
/// in `here_mut` and, dequantized, in `raster`.
///
/// Fails with `StreamInconsistent` if fewer non-zero coefficients are found than announced.
fn decode_one_edge<R: Read, const ALL_PRESENT: bool, const HORIZONTAL: bool>(
    model_per_color: &mut ModelPerColor,
    bool_reader: &mut VPXBoolReader<R>,
    pred: &[i32; 8],
    here_mut: &mut AlignedBlock,
    qt: &QuantizationTables,
    pt: &ProbabilityTables,
    num_non_zeros_bin: u8,
    est_eob: u8,
    raster: &mut [i32; 64],
) -> Result<()> {
    let mut num_non_zeros_edge = model_per_color
        .read_non_zero_edge_count::<R, HORIZONTAL>(bool_reader, est_eob, num_non_zeros_bin)
        .context()?;

    // stride through the transposed block and first zig15 offset for this edge
    let (delta, first_zig15offset) = if HORIZONTAL { (8, 0) } else { (1, 7) };

    let mut coord_tr = delta;
    let mut zig15offset = first_zig15offset;

    for _lane in 0..7 {
        // all announced non-zeros have been read, the rest of the edge is zero
        if num_non_zeros_edge == 0 {
            break;
        }

        let best_prior =
            pt.calc_coefficient_context8_lak::<ALL_PRESENT, HORIZONTAL>(qt, coord_tr, pred)?;

        let coef = model_per_color.read_edge_coefficient(
            bool_reader,
            qt,
            zig15offset,
            num_non_zeros_edge,
            best_prior,
        )?;

        if coef != 0 {
            num_non_zeros_edge -= 1;

            // keep the raw coefficient and its dequantized value
            here_mut.set_coefficient(coord_tr, coef);
            raster[coord_tr] =
                i32::from(coef) * i32::from(qt.get_quantization_table_transposed()[coord_tr]);
        }

        coord_tr += delta;
        zig15offset += 1;
    }

    // the announced count has to be used up by the end of the edge
    if num_non_zeros_edge != 0 {
        return err_exit_code(ExitCode::StreamInconsistent, "StreamInconsistent");
    }

    Ok(())
}


================================================
FILE: lib/src/structs/lepton_encoder.rs
================================================
/*---------------------------------------------------------------------------------------------
 *  Copyright (c) Microsoft Corporation. All rights reserved.
 *  Licensed under the Apache License, Version 2.0. See LICENSE.txt in the project root for license information.
 *  This software incorporates material from third parties. See NOTICE.txt for details.
 *--------------------------------------------------------------------------------------------*/

use std::cmp;
use std::io::Write;

use bytemuck::cast;
use default_boxed::DefaultBoxed;
use wide::i32x8;

use crate::Result;
use crate::consts::UNZIGZAG_49_TR;
use crate::enabled_features::EnabledFeatures;
use crate::helpers::*;
use crate::jpeg::block_based_image::{AlignedBlock, BlockBasedImage};
use crate::jpeg::row_spec::RowSpec;
use crate::jpeg::truncate_components::*;
use crate::lepton_error::{AddContext, ExitCode, err_exit_code};
use crate::metrics::Metrics;
use crate::structs::block_context::{BlockContext, NeighborData};
use crate::structs::model::{Model, ModelPerColor};
use crate::structs::neighbor_summary::NeighborSummary;
use crate::structs::probability_tables::ProbabilityTables;
use crate::structs::quantization_tables::QuantizationTables;
use crate::structs::vpx_bool_writer::VPXBoolWriter;

#[inline(never)] // don't inline so that the profiler can get proper data
pub fn lepton_encode_row_range<W: Write>(
    quantization_tables: &[QuantizationTables],
    image_data: &[BlockBasedImage],
    writer: &mut W,
    _thread_id: i32,
    colldata: &TruncateComponents,
    min_y: u32,
    max_y: u32,
    is_last_thread: bool,
    full_file_compression: bool,
    features: &EnabledFeatures,
) -> Result<Metrics> {
    let mut model = Model::default_boxed();
    let mut bool_writer = VPXBoolWriter::new(writer)?;

    let mut is_top_row = Vec::new();
    let mut neighbor_summary_cache = Vec::new();

    // Init helper structures
    for i in 0..image_data.len() {
        is_top_row.push(true);

        let num_non_zeros_length = (image_data[i].get_block_width() << 1) as usize;

        let mut neighbor_summary_component = Vec::new();
        neighbor_summary_component.resize(num_non_zeros_length, NeighborSummary::default());

        neighbor_summary_cache.push(neighbor_summary_component);
    }

    let component_size_in_blocks = colldata.get_component_sizes_in_blocks();
    let max_coded_heights = colldata.get_max_coded_heights();

    let mut encode_index = 0;
    loop {
        let cur_row = RowSpec::get_row_spec_from_index(
            encode_index,
            image_data,
            colldata.mcu_count_vertical,
            &max_coded_heights,
        );
        encode_index += 1;

        if cur_row.done {
            break;
        }

        if cur_row.luma_y >= max_y && !(is_last_thread && full_file_compression) {
            break;
        }

        if cur_row.skip {
            continue;
        }

        if cur_row.luma_y < min_y {
            continue;
        }

        // Advance to next row to cache expended block data for current row. Should be called before getting block context.
        let component = cur_row.component;

        let left_model;
        let middle_model;

        if is_top_row[component] {
            is_top_row[component] = false;

            left_model = &super::probability_tables::NO_NEIGHBORS;
            middle_model = &super::probability_tables::LEFT_ONLY;
        } else {
            left_model = &super::probability_tables::TOP_ONLY;
            middle_model = &super::probability_tables::ALL;
        }

        process_row(
            &mut model,
            &mut bool_writer,
            left_model,
            middle_model,
            ProbabilityTables::get_color_index(component),
            &image_data[component],
            &quantization_tables[component],
            &mut neighbor_summary_cache[component][..],
            cur_row.curr_y,
            component_size_in_blocks[component],
            features,
        )
        .context()?;

        bool_writer.flush_non_final_data().context()?;
    }

    if is_last_thread && full_file_compression {
        let test = RowSpec::get_row_spec_from_index(
            encode_index,
            image_data,
            colldata.mcu_count_vertical,
            &max_coded_heights,
        );

        assert!(
            test.skip && test.done,
            "Row spec test: cmp {0} luma {1} item {2} skip {3} done {4}",
            test.component,
            test.luma_y,
            test.curr_y,
            test.skip,
            test.done
        );
    }

    bool_writer.finish().context()?;

    Ok(bool_writer.drain_stats())
}

/// Encodes a single row of blocks (row `curr_y` of `image_data`) into `bool_writer`.
///
/// The first block of the row is encoded with `left_model`, all others with `middle_model`.
/// Encoding stops early once the block context's next offset reaches
/// `component_size_in_block`.
#[inline(never)] // don't inline so that the profiler can get proper data
fn process_row<W: Write>(
    model: &mut Model,
    bool_writer: &mut VPXBoolWriter<W>,
    left_model: &ProbabilityTables,
    middle_model: &ProbabilityTables,
    color_index: usize,
    image_data: &BlockBasedImage,
    qt: &QuantizationTables,
    neighbor_summary_cache: &mut [NeighborSummary],
    curr_y: u32,
    component_size_in_block: u32,
    features: &EnabledFeatures,
) -> Result<()> {
    let mut block_context = BlockContext::off_y(curr_y, image_data);

    for jpeg_x in 0..image_data.get_block_width() {
        // the leftmost block uses its own probability tables
        let pt: &ProbabilityTables = match jpeg_x {
            0 => left_model,
            _ => middle_model,
        };

        // Dispatch on a const generic so that the per-neighbor presence checks are
        // resolved at compile time inside the callee.
        if pt.is_all_present() {
            serialize_tokens::<W, true>(
                &block_context,
                qt,
                pt,
                model,
                color_index,
                image_data,
                neighbor_summary_cache,
                bool_writer,
                features,
            )
            .context()?;
        } else {
            serialize_tokens::<W, false>(
                &block_context,
                qt,
                pt,
                model,
                color_index,
                image_data,
                neighbor_summary_cache,
                bool_writer,
                features,
            )
            .context()?;
        }

        // advance to the next block; bail out once we step past the component's blocks
        if block_context.next() >= component_size_in_block {
            break;
        }
    }

    Ok(())
}

/// Encodes the block that `context` currently points at and stores the resulting
/// neighbor summary back into `neighbor_summary_cache` for use by later blocks.
///
/// `ALL_PRESENT` must agree with `pt.is_all_present()` (checked in debug builds).
#[inline(never)] // don't inline so that the profiler can get proper data
fn serialize_tokens<W: Write, const ALL_PRESENT: bool>(
    context: &BlockContext,
    qt: &QuantizationTables,
    pt: &ProbabilityTables,
    model: &mut Model,
    color_index: usize,
    image_data: &BlockBasedImage,
    neighbor_summary_cache: &mut [NeighborSummary],
    bool_writer: &mut VPXBoolWriter<W>,
    features: &EnabledFeatures,
) -> Result<()> {
    debug_assert!(ALL_PRESENT == pt.is_all_present());

    let current_block = context.here(image_data);
    let neighbor_data =
        context.get_neighbor_data::<ALL_PRESENT>(image_data, neighbor_summary_cache, pt);

    #[cfg(feature = "detailed_tracing")]
    log::trace!(
        "block {0}:{1:x}",
        context.get_here_index(),
        current_block.get_hash()
    );

    let summary = write_coefficient_block::<ALL_PRESENT, W>(
        pt,
        color_index,
        &neighbor_data,
        current_block,
        model,
        bool_writer,
        qt,
        features,
    )?;

    // remember the summary of this block so it can serve as a predictor later on
    context.set_neighbor_summary_here(neighbor_summary_cache, summary);

    Ok(())
}

/// Serializes one 8x8 coefficient block (`here_tr`) to `bool_writer`, using the neighboring
/// blocks, the probability tables and the adaptive model as predictors.
///
/// The block is written in three stages: the 49 inner (7x7) coefficients, then the two
/// edges, and finally the DC coefficient. On success the [`NeighborSummary`] describing
/// this block is returned so it can be used as a predictor for subsequent blocks.
///
/// The function takes everything it needs as parameters (no block context or image data)
/// so that it can be called on its own and unit tested extensively.
///
/// # Errors
///
/// Returns `ExitCode::CoefficientOutOfRange` if the DC value does not survive a
/// predict/unpredict roundtrip, and propagates any error from the underlying writes.
pub fn write_coefficient_block<const ALL_PRESENT: bool, W: Write>(
    pt: &ProbabilityTables,
    color_index: usize,
    neighbors_data: &NeighborData,
    here_tr: &AlignedBlock,
    model: &mut Model,
    bool_writer: &mut VPXBoolWriter<W>,
    qt: &QuantizationTables,
    features: &EnabledFeatures,
) -> Result<NeighborSummary> {
    let model_per_color = model.get_per_color(color_index);

    // ---- Stage 1: the 49 inner coefficients ----

    // context bin for the non-zero count, derived from the neighbors
    let num_non_zeros_7x7_context_bin =
        pt.calc_num_non_zeros_7x7_context_bin::<ALL_PRESENT>(neighbors_data);

    // The number of non-zero inner coefficients is written up front. It lets the loop
    // below stop early and also acts as a predictor for the model.
    let num_non_zeros_7x7 = here_tr.get_count_of_non_zeros_7x7();

    model_per_color
        .write_non_zero_7x7_count(
            bool_writer,
            num_non_zeros_7x7_context_bin,
            num_non_zeros_7x7,
        )
        .context()?;

    // Furthest coordinates holding a non-zero inner coefficient. They predict the number
    // of non-zero edge coefficients. Kept in 32 bits since that is faster on most
    // modern platforms.
    let mut eob_x: u32 = 0;
    let mut eob_y: u32 = 0;

    let mut non_zeros_left = usize::from(num_non_zeros_7x7);

    if non_zeros_left > 0 {
        let best_priors = pt.calc_coefficient_context_7x7_aavg_block::<ALL_PRESENT>(
            neighbors_data.left,
            neighbors_data.above,
            neighbors_data.above_left,
        );

        // bin for the number of non-zeros that still have to be written
        let mut non_zeros_left_bin = ProbabilityTables::num_non_zeros_to_bin_7x7(non_zeros_left);

        // walk the coefficients in zigzag order until all non-zeros have been written
        for (zig49, &coord_tr) in UNZIGZAG_49_TR.iter().enumerate() {
            let coord = coord_tr as usize;

            let best_prior_bit_length = u16_bit_length(best_priors[coord]) as usize;
            let coef = here_tr.get_coefficient(coord);

            model_per_color
                .write_coef(
                    bool_writer,
                    coef,
                    zig49,
                    non_zeros_left_bin,
                    best_prior_bit_length,
                )
                .context()?;

            if coef == 0 {
                continue;
            }

            // track the furthest x and y that contain a non-zero coefficient
            let by = u32::from(coord_tr) & 7;
            let bx = u32::from(coord_tr) >> 3;

            debug_assert!(bx > 0 && by > 0, "this does the DC and the lower 7x7 AC");

            eob_x = cmp::max(eob_x, bx);
            eob_y = cmp::max(eob_y, by);

            non_zeros_left -= 1;
            if non_zeros_left == 0 {
                break;
            }

            // one fewer non-zero to go, so the bin may have changed
            non_zeros_left_bin = ProbabilityTables::num_non_zeros_to_bin_7x7(non_zeros_left);
        }
    }

    // ---- Stage 2: the edge coefficients ----
    // This also yields the transposed raster of dequantized coefficients (with 0 in DC)
    // and the first part of the edge predictions for the neighboring blocks.
    let (raster, horiz_pred, vert_pred) = encode_edge::<W, ALL_PRESENT>(
        neighbors_data,
        here_tr,
        model_per_color,
        bool_writer,
        qt,
        pt,
        num_non_zeros_7x7,
        eob_x as u8,
        eob_y as u8,
    )
    .context()?;

    // ---- Stage 3: the DC coefficient (at 0,0) ----
    let q0 = qt.get_quantization_table()[0] as i32;
    let predicted_val = pt.adv_predict_dc_pix::<ALL_PRESENT>(&raster, q0, neighbors_data, features);

    let dc = here_tr.get_dc();

    let avg_predicted_dc =
        ProbabilityTables::adv_predict_or_unpredict_dc(dc, false, predicted_val.predicted_dc);

    // make sure the value we are about to write can be turned back into the original DC
    let recovered_dc = ProbabilityTables::adv_predict_or_unpredict_dc(
        avg_predicted_dc as i16,
        true,
        predicted_val.predicted_dc,
    );

    if dc as i32 != recovered_dc {
        return err_exit_code(ExitCode::CoefficientOutOfRange, "BlockDC mismatch");
    }

    model
        .write_dc(
            bool_writer,
            color_index,
            avg_predicted_dc as i16,
            predicted_val.uncertainty,
            predicted_val.uncertainty2,
        )
        .context()?;

    // summary of this block, used as a predictor for the blocks that follow
    Ok(NeighborSummary::new(
        predicted_val.next_edge_pixels_h,
        predicted_val.next_edge_pixels_v,
        dc as i32 * q0,
        num_non_zeros_7x7,
        horiz_pred,
        vert_pred,
    ))
}

/// Writes both edges of the block (first with `HORIZONTAL = true`, then `false`) and
/// computes the data needed by later prediction steps.
///
/// Returns a tuple of:
/// * the dequantized coefficients of `here_tr` as eight `i32x8` lanes, with index 0 left at 0,
/// * the horizontal and vertical edge predictors produced by
///   `ProbabilityTables::predict_next_edges` for the following blocks.
#[inline(never)] // don't inline so that the profiler can get proper data
fn encode_edge<W: Write, const ALL_PRESENT: bool>(
    neighbors_data: &NeighborData,
    here_tr: &AlignedBlock,
    model_per_color: &mut ModelPerColor,
    bool_writer: &mut VPXBoolWriter<W>,
    qt: &QuantizationTables,
    pt: &ProbabilityTables,
    num_non_zeros_7x7: u8,
    eob_x: u8,
    eob_y: u8,
) -> Result<([i32x8; 8], i32x8, i32x8)> {
    let q_tr = qt.get_quantization_table_transposed();

    // dequantize everything except index 0, which stays zero
    let mut raster_co = [0i32; 64];
    for (i, dequantized) in raster_co.iter_mut().enumerate().skip(1) {
        *dequantized = i32::from(here_tr.get_coefficient(i)) * i32::from(q_tr[i]);
    }

    let raster: [i32x8; 8] = cast(raster_co);

    // predictors for the edge coefficients of the block being written
    let (curr_horiz_pred, curr_vert_pred) =
        ProbabilityTables::predict_current_edges(neighbors_data, &raster);

    let num_non_zeros_bin = (num_non_zeros_7x7 + 3) / 7;

    // horizontal edge, predicted using eob_x
    encode_one_edge::<W, ALL_PRESENT, true>(
        here_tr,
        model_per_color,
        bool_writer,
        &curr_horiz_pred.to_array(),
        qt,
        pt,
        num_non_zeros_bin,
        eob_x,
    )
    .context()?;

    // vertical edge, predicted using eob_y
    encode_one_edge::<W, ALL_PRESENT, false>(
        here_tr,
        model_per_color,
        bool_writer,
        &curr_vert_pred.to_array(),
        qt,
        pt,
        num_non_zeros_bin,
        eob_y,
    )
    .context()?;

    // predictors for the edge coefficients of the blocks below and to the right of this one
    let (next_horiz_pred, next_vert_pred) = ProbabilityTables::predict_next_edges(&raster);

    Ok((raster, next_horiz_pred, next_vert_pred))
}

/// Returns 1 if `v` is non-zero and 0 otherwise.
fn count_non_zero(v: i16) -> u8 {
    u8::from(v != 0)
}

/// Writes one edge of `block`: first the number of non-zero coefficients on that edge,
/// then the coefficients themselves until all non-zero ones have been emitted.
///
/// With `HORIZONTAL = true` the coefficients at indices 8, 16, ..., 56 are written and the
/// zig15 offset starts at 0. Otherwise the coefficients at indices 1..=7 are written and
/// the zig15 offset starts at 7.
fn encode_one_edge<W: Write, const ALL_PRESENT: bool, const HORIZONTAL: bool>(
    block: &AlignedBlock,
    model_per_color: &mut ModelPerColor,
    bool_writer: &mut VPXBoolWriter<W>,
    pred: &[i32; 8],
    qt: &QuantizationTables,
    pt: &ProbabilityTables,
    num_non_zeros_bin: u8,
    est_eob: u8,
) -> Result<()> {
    // distance between two consecutive edge coefficients, and the first zig15 offset
    let (delta, mut zig15offset): (usize, _) = if HORIZONTAL { (8, 0) } else { (1, 7) };

    // count the non-zero coefficients among the seven positions of this edge
    let mut num_non_zeros_edge: u8 = (delta..8 * delta)
        .step_by(delta)
        .map(|coord_tr| count_non_zero(block.get_coefficient(coord_tr)))
        .sum();

    model_per_color
        .write_non_zero_edge_count::<W, HORIZONTAL>(
            bool_writer,
            est_eob,
            num_non_zeros_bin,
            num_non_zeros_edge,
        )
        .context()?;

    for coord_tr in (delta..8 * delta).step_by(delta) {
        // once every non-zero coefficient is out, the rest is implicitly zero
        if num_non_zeros_edge == 0 {
            break;
        }

        let best_prior =
            pt.calc_coefficient_context8_lak::<ALL_PRESENT, HORIZONTAL>(qt, coord_tr, pred)?;

        let coef = block.get_coefficient(coord_tr);

        model_per_color
            .write_edge_coefficient(
                bool_writer,
                qt,
                coef,
                zig15offset,
                num_non_zeros_edge,
                best_prior,
            )
            .context()?;

        if coef != 0 {
            num_non_zeros_edge -= 1;
        }

        zig15offset += 1;
    }

    Ok(())
}

/// Simplest possible case: every block is all zeros.
///
/// The roundtrip tests are ordered from simplest to most complicated, so that when tests
/// start failing you have some idea of where to start looking.
#[test]
fn roundtrip_zeros() {
    let all_zero = AlignedBlock::new([0; 64]);
    let features = EnabledFeatures::compat_lepton_vector_read();

    roundtrip_read_write_coefficients(
        &all_zero,
        &all_zero,
        &all_zero,
        &all_zero,
        [1; 64],
        0x4154B63BDE6F2912,
        &features,
    );
}

/// Roundtrips blocks in which only the DC coefficient is set (to -100).
#[test]
fn roundtrip_dc_only() {
    let mut dc_only = AlignedBlock::new([0; 64]);
    dc_only.set_dc(-100);

    let features = EnabledFeatures::compat_lepton_vector_read();

    roundtrip_read_write_coefficients(
        &dc_only,
        &dc_only,
        &dc_only,
        &dc_only,
        [1; 64],
        0x2556719DE605BB41,
        &features,
    );
}

/// Roundtrips blocks in which only edge coefficients are set.
///
/// Indices 1 through 6 are set to -100 and indices 8, 16, ..., 48 are set to 100.
/// Everything else (including the DC) stays zero.
#[test]
fn roundtrip_edges_only() {
    let mut edges_only = AlignedBlock::new([0; 64]);
    (1..7).for_each(|lane| {
        edges_only.set_coefficient(lane, -100);
        edges_only.set_coefficient(lane * 8, 100);
    });

    let features = EnabledFeatures::compat_lepton_vector_read();

    roundtrip_read_write_coefficients(
        &edges_only,
        &edges_only,
        &edges_only,
        &edges_only,
        [1; 64],
        0x91061AE0FBE7C626,
        &features,
    );
}

/// Roundtrips blocks in which only the inner 7x7 coefficients are set.
///
/// Each inner coefficient at column `x` and row `y` (both in 1..=7) holds `x * y`.
#[test]
fn roundtrip_ac_only() {
    let mut inner_only = AlignedBlock::new([0; 64]);
    for y in 1..8 {
        for x in 1..8 {
            inner_only.set_coefficient(y * 8 + x, (x * y) as i16);
        }
    }

    let features = EnabledFeatures::compat_lepton_vector_read();

    roundtrip_read_write_coefficients(
        &inner_only,
        &inner_only,
        &inner_only,
        &inner_only,
        [1; 64],
        0x9F5637364D41FE11,
        &features,
    );
}

/// Roundtrips blocks in which every coefficient (including the DC) is 1.
#[test]
fn roundtrip_ones() {
    let all_ones = AlignedBlock::new([1; 64]);
    let features = EnabledFeatures::compat_lepton_vector_read();

    roundtrip_read_write_coefficients(
        &all_ones,
        &all_ones,
        &all_ones,
        &all_ones,
        [1; 64],
        0x6B2A9E7E1DA9A4B3,
        &features,
    );
}

/// Roundtrips large coefficients, which could overflow unpredictably if the way the math
/// operations are performed ever changes (for example overflow behavior or bitness).
#[test]
fn roundtrip_large_coef() {
    // largest coefficient that doesn't cause a DC overflow
    let large = AlignedBlock::new([-1010; 64]);
    let features = EnabledFeatures::compat_lepton_vector_read();

    // first with the smallest quantization table
    roundtrip_read_write_coefficients(
        &large,
        &large,
        &large,
        &large,
        [1; 64],
        0x95CBDD4F7D7B72EB,
        &features,
    );

    // Then with the maximum quantization table. In theory this is legal according to the
    // JPEG format, and there is no code preventing the encoder from attempting it.
    roundtrip_read_write_coefficients(
        &large,
        &large,
        &large,
        &large,
        [65535; 64],
        0xE514715BD531D80E,
        &features,
    );
}

/// Roundtrips a seeded "random" set of blocks and quantization table to make sure that
/// all ranges of coefficients work properly, with both the 32 bit and the 16 bit math.
#[test]
fn roundtrip_random_seed() {
    use rand::Rng;

    // The 127 seed is a choice that doesn't overflow the DC coefficient (the encoder is
    // somewhat picky if the DC estimate overflows). It also produces different behavior
    // for the 32 and 16 bit codepaths.
    let mut rng = crate::helpers::get_rand_from_seed([127; 32]);

    // the order in which the blocks are drawn from the generator matters for the hashes
    let mut random_block = || AlignedBlock::new([0i16; 64].map(|_| rng.gen_range(-2047..=2047)));

    let left = random_block();
    let above = random_block();
    let here = random_block();
    let above_left = random_block();
    let qt = [0i16; 64].map(|_| rng.gen_range(1u16..=65535));

    // using 32 bit math (test emulating both scalar and vector C++ code)
    let scalar_hash = roundtrip_read_write_coefficients(
        &left,
        &above,
        &above_left,
        &here,
        qt,
        0x146C568A90EB0F14,
        &EnabledFeatures::compat_lepton_scalar_read(),
    );

    // using 16 bit math
    let vector_hash = roundtrip_read_write_coefficients(
        &left,
        &above,
        &above_left,
        &here,
        qt,
        0x12ECA3C71A29300C,
        &EnabledFeatures::compat_lepton_vector_read(),
    );

    assert!(scalar_hash != vector_hash);
}

/// Roundtrips a pattern in which every coefficient of every block is unique, to make
/// sure nothing gets mixed up.
#[test]
fn roundtrip_unique() {
    // 0, 1, 2, ..., 63
    let base: [i16; 64] = std::array::from_fn(|i| i as i16);

    // shift each block so that no value appears twice across the four blocks
    let shifted_by = |offset: i16| AlignedBlock::new(base.map(|v| v + offset));

    let left = AlignedBlock::new(base);
    let above = shifted_by(64);
    let above_left = shifted_by(128);
    let here = shifted_by(256);

    roundtrip_read_write_coefficients(
        &left,
        &above,
        &above_left,
        &here,
        [1; 64],
        0x8FA72ED7E5961A1C,
        &EnabledFeatures::compat_lepton_vector_read(),
    );
}

/// Roundtrips a pattern designed to exercise the non-zero counting.
#[test]
fn roundtrip_non_zeros_counts() {
    // The upper left corner (x < 4, y < 3) is all 50 and the rest is 0. This should
    // result in 3 or 4 non-zero coefficients depending on vertical/horizontal. The
    // pattern is deliberately non-symmetrical to catch mixups.
    let pattern: [i16; 64] = std::array::from_fn(|i| {
        let (x, y) = (i & 7, i >> 3);
        if x < 4 && y < 3 { 50 } else { 0 }
    });

    let corner = AlignedBlock::new(pattern);
    let features = EnabledFeatures::compat_lepton_vector_read();

    roundtrip_read_write_coefficients(
        &corner,
        &corner,
        &corner,
        &corner,
        [1; 64],
        0x6C93F3EF5495440B,
        &features,
    );
}

/// Builds a model whose branches have been given pseudo-random counts from a fixed seed.
///
/// Starting from a model where all branches are in the same state could hide a
/// misalignment of the model state between reading and writing, so the branches are
/// randomized (deterministically) first.
#[cfg(test)]
fn make_random_model() -> Box<Model> {
    use rand::Rng;

    let mut rng = crate::helpers::get_rand_from_seed([2u8; 32]);
    let mut randomized = Model::default_boxed();

    // two draws per branch: the first is scaled by 256, the second is added on top
    randomized.walk(|branch| {
        branch.set_count(rng.gen_range(0x01..=0xff) * 256 + rng.gen_range(0x01..=0xff));
    });

    randomized
}

/// Tests the roundtrip of writing and then reading the coefficients of four blocks.
///
/// The four blocks are laid out as a 2x2 grid (`above_left`, `above`, `left`, `here`) and
/// are written, then read back, in that order so that every combination of present
/// neighbors is exercised. After each block is read, both the block and its neighbor
/// summary are compared against what the writer saw.
///
/// The tests are done with a seeded random model so that the tests are deterministic.
///
/// In addition, we check that everything ran as expected by comparing a hash against
/// `verified_output`. The hash covers the encoded bytes plus the checksum of the model
/// after writing. Passing `0` as `verified_output` skips this comparison.
///
/// The decode side is fed only data that was itself decoded (blocks and neighbor
/// summaries), so the read path does not depend on any writer-side state.
///
/// Returns the computed hash.
#[cfg(test)]
fn roundtrip_read_write_coefficients(
    left: &AlignedBlock,
    above: &AlignedBlock,
    above_left: &AlignedBlock,
    here: &AlignedBlock,
    qt: [u16; 64],
    verified_output: u64,
    features: &EnabledFeatures,
) -> u64 {
    use std::hash::Hasher;
    use std::io::{Cursor, Read};

    // use the Sip hasher directly since that's guaranteed not to change implementation vs the default hasher
    use siphasher::sip::SipHasher13;

    use crate::jpeg::block_based_image::EMPTY_BLOCK;
    use crate::structs::lepton_decoder::read_coefficient_block;
    use crate::structs::neighbor_summary::NEIGHBOR_DATA_EMPTY;
    use crate::structs::vpx_bool_reader::VPXBoolReader;

    let mut write_model = make_random_model();

    let mut buffer = Vec::new();

    let mut bool_writer = VPXBoolWriter::new(&mut buffer).unwrap();

    let qt = QuantizationTables::new_from_table(&qt);

    /// This is a helper function to avoid having to duplicate the code for the different cases.
    ///
    /// Missing neighbors are replaced by the empty block / empty neighbor summary, and all
    /// blocks are transposed before being handed to `write_coefficient_block`.
    fn call_write_coefficient_block<W: Write>(
        left: Option<(&AlignedBlock, &NeighborSummary)>,
        above: Option<(&AlignedBlock, &NeighborSummary)>,
        above_left: Option<&AlignedBlock>,
        color_index: usize,
        here: &AlignedBlock,
        write_model: &mut Model,
        bool_writer: &mut VPXBoolWriter<W>,
        qt: &QuantizationTables,
        features: &EnabledFeatures,
    ) -> NeighborSummary {
        let pt = ProbabilityTables::new(left.is_some(), above.is_some());
        let n = NeighborData {
            above: &above.map(|x| x.0).unwrap_or(&EMPTY_BLOCK).transpose(),
            left: &left.map(|x| x.0).unwrap_or(&EMPTY_BLOCK).transpose(),
            above_left: &above_left.unwrap_or(&EMPTY_BLOCK).transpose(),
            neighbor_context_above: above.map(|x| x.1).unwrap_or(&NEIGHBOR_DATA_EMPTY),
            neighbor_context_left: left.map(|x| x.1).unwrap_or(&NEIGHBOR_DATA_EMPTY),
        };

        let here_tr = here.transpose();

        // call the right version depending on if we have all neighbors or not
        if left.is_some() && above.is_some() {
            write_coefficient_block::<true, _>(
                &pt,
                color_index,
                &n,
                &here_tr,
                write_model,
                bool_writer,
                qt,
                features,
            )
            .unwrap()
        } else {
            write_coefficient_block::<false, _>(
                &pt,
                color_index,
                &n,
                &here_tr,
                write_model,
                bool_writer,
                qt,
                features,
            )
            .unwrap()
        }
    }

    /// This is a helper function to avoid having to duplicate the code for the different cases.
    ///
    /// Mirrors `call_write_coefficient_block`: neighbors are transposed on the way in and
    /// the decoded block is transposed back before it is returned.
    fn call_read_coefficient_block<R: Read>(
        left: Option<(&AlignedBlock, &NeighborSummary)>,
        above: Option<(&AlignedBlock, &NeighborSummary)>,
        above_left: Option<&AlignedBlock>,
        color_index: usize,
        read_model: &mut Model,
        bool_reader: &mut VPXBoolReader<R>,
        qt: &QuantizationTables,
        features: &EnabledFeatures,
    ) -> (AlignedBlock, NeighborSummary) {
        let pt = ProbabilityTables::new(left.is_some(), above.is_some());
        let n = NeighborData {
            above: &above.map(|x| x.0).unwrap_or(&EMPTY_BLOCK).transpose(),
            left: &left.map(|x| x.0).unwrap_or(&EMPTY_BLOCK).transpose(),
            above_left: &above_left.unwrap_or(&EMPTY_BLOCK).transpose(),
            neighbor_context_above: above.map(|x| x.1).unwrap_or(&NEIGHBOR_DATA_EMPTY),
            neighbor_context_left: left.map(|x| x.1).unwrap_or(&NEIGHBOR_DATA_EMPTY),
        };

        // call the right version depending on if we have all neighbors or not
        let r = if left.is_some() && above.is_some() {
            read_coefficient_block::<true, _>(
                &pt,
                color_index,
                &n,
                read_model,
                bool_reader,
                qt,
                features,
            )
            .unwrap()
        } else {
            read_coefficient_block::<false, _>(
                &pt,
                color_index,
                &n,
                read_model,
                bool_reader,
                qt,
                features,
            )
            .unwrap()
        };

        (r.0.transpose(), r.1)
    }

    // overall idea here is to call write and read on all possible permutations of neighbors
    // the grid looks like this:
    //
    // [ above_left ] [ above ]
    // [ left       ] [ here  ]
    //
    // first: above_left (with no neighbors)

    let color_index = 0;

    let w_above_left_ns = call_write_coefficient_block(
        None,
        None,
        None,
        color_index,
        above_left,
        &mut write_model,
        &mut bool_writer,
        &qt,
        features,
    );

    // now above, with above_left as its left neighbor
    let w_above_ns = call_write_coefficient_block(
        Some((above_left, &w_above_left_ns)),
        None,
        None,
        color_index,
        above,
        &mut write_model,
        &mut bool_writer,
        &qt,
        features,
    );

    // now left, with above_left as its above neighbor
    let w_left_ns = call_write_coefficient_block(
        None,
        Some((above_left, &w_above_left_ns)),
        None,
        color_index,
        left,
        &mut write_model,
        &mut bool_writer,
        &qt,
        features,
    );

    // now here with above and left as neighbors
    let w_here_ns = call_write_coefficient_block(
        Some((left, &w_left_ns)),
        Some((above, &w_above_ns)),
        Some(above_left),
        color_index,
        here,
        &mut write_model,
        &mut bool_writer,
        &qt,
        features,
    );

    bool_writer.finish().unwrap();

    // now re-read the model and make sure everything matches
    // (the read model starts from the same seeded state as the write model did)
    let mut read_model = make_random_model();
    let mut bool_reader = VPXBoolReader::new(Cursor::new(&buffer)).unwrap();

    let (r_above_left_block, r_above_left_ns) = call_read_coefficient_block(
        None,
        None,
        None,
        color_index,
        &mut read_model,
        &mut bool_reader,
        &qt,
        features,
    );

    assert_eq!(
        r_above_left_block.get_block(),
        above_left.get_block(),
        "above_left"
    );
    assert_eq!(r_above_left_ns, w_above_left_ns, "above_left_ns");

    // from here on the reader is only given decoded blocks and decoded summaries
    let (r_above_block, r_above_ns) = call_read_coefficient_block(
        Some((&r_above_left_block, &r_above_left_ns)),
        None,
        None,
        color_index,
        &mut read_model,
        &mut bool_reader,
        &qt,
        features,
    );

    assert_eq!(r_above_block.get_block(), above.get_block(), "above");
    assert_eq!(r_above_ns, w_above_ns, "above_ns");

    let (r_left_block, r_left_ns) = call_read_coefficient_block(
        None,
        Some((&r_above_left_block, &r_above_left_ns)),
        None,
        color_index,
        &mut read_model,
        &mut bool_reader,
        &qt,
        features,
    );

    assert_eq!(r_left_block.get_block(), left.get_block(), "left");
    assert_eq!(r_left_ns, w_left_ns, "left_ns");

    let (r_here, r_here_ns) = call_read_coefficient_block(
        Some((&r_left_block, &r_left_ns)),
        Some((&r_above_block, &r_above_ns)),
        Some(&r_above_left_block),
        color_index,
        &mut read_model,
        &mut bool_reader,
        &qt,
        features,
    );

    assert_eq!(r_here.get_block(), here.get_block(), "here");
    assert_eq!(r_here_ns, w_here_ns, "here_ns");

    // reader and writer must have adapted their models in exactly the same way
    assert_eq!(write_model.model_checksum(), read_model.model_checksum());

    let mut h = SipHasher13::new();
    h.write(&buffer);
    h.write_u64(write_model.model_checksum());
    let hash = h.finish();

    // printed so that a new verified value can be copied into a test when needed
    println!("0x{:x?},", hash);

    if verified_output != 0 {
        assert_eq!(
            verified_output, hash,
            "Hash mismatch. Unexpected change in model behavior/output format"
        );
    }

    hash
}

/// Builds the micro-benchmark closure for the coefficient writing and reading code.
///
/// Every call of the returned closure encodes one block with
/// `write_coefficient_block` into a fresh buffer and immediately decodes it
/// again with `read_coefficient_block`, always against the same synthetic
/// neighbor data. The write and read models are owned by the closure and keep
/// adapting across calls.
#[cfg(any(test, feature = "micro_benchmark"))]
#[inline(never)]
pub fn benchmark_roundtrip_coefficient() -> Box<dyn FnMut()> {
    use crate::structs::lepton_decoder::read_coefficient_block;
    use crate::structs::vpx_bool_reader::VPXBoolReader;
    use std::{hint::black_box, io::Cursor};
    use wide::i16x8;

    /// Fills a block with 64 consecutive values taken from `next`, leaving
    /// `next` pointing just past the last value used.
    fn sequential_block(next: &mut i16) -> AlignedBlock {
        let mut coefficients = [0; 64];
        for slot in coefficients.iter_mut() {
            *slot = *next;
            *next += 1;
        }
        AlignedBlock::new(coefficients)
    }

    let mut next_value = 1;

    let mut write_model = Model::default_boxed();
    let mut read_model = Model::default_boxed();

    // every quantization table entry is 1
    let qt = QuantizationTables::new_from_table(&[1; 64]);

    let above = sequential_block(&mut next_value);
    let left = sequential_block(&mut next_value);
    let above_left = sequential_block(&mut next_value);

    // arbitrary but fixed neighbor context, shared by both neighbors
    let edge_pixels_h = i16x8::from([4; 8]);
    let edge_pixels_v = i16x8::from([5; 8]);
    let dc_deq = 6; // 6 multiplied by the quantization table value, which is 1
    let num_non_zeros_7x7 = 49;
    let horiz_pred = i32x8::from([7; 8]);
    let vert_pred = i32x8::from([8; 8]);

    let build_summary = || {
        NeighborSummary::new(
            edge_pixels_h,
            edge_pixels_v,
            dc_deq,
            num_non_zeros_7x7,
            horiz_pred,
            vert_pred,
        )
    };
    let summary_above = build_summary();
    let summary_left = build_summary();

    let pt = ProbabilityTables::new(true, true);
    let current = sequential_block(&mut next_value);
    let features = EnabledFeatures::compat_lepton_vector_read();

    Box::new(move || {
        let neighbors = NeighborData {
            above: &above,
            left: &left,
            above_left: &above_left,
            neighbor_context_above: &summary_above,
            neighbor_context_left: &summary_left,
        };

        // encode the block into a fresh buffer
        let mut encoded = Vec::with_capacity(100);
        let mut bool_writer = VPXBoolWriter::new(&mut encoded).unwrap();

        write_coefficient_block::<true, _>(
            &pt,
            0,
            &neighbors,
            &current,
            &mut write_model,
            &mut bool_writer,
            &qt,
            &features,
        )
        .unwrap();

        bool_writer.finish().unwrap();

        // decode it again from the bytes that were just produced
        let mut bool_reader = VPXBoolReader::new(Cursor::new(&encoded)).unwrap();
        let (decoded, decoded_summary) = read_coefficient_block::<true, _>(
            &pt,
            0,
            &neighbors,
            &mut read_model,
            &mut bool_reader,
            &qt,
            &features,
        )
        .unwrap();

        // keep the optimizer from discarding the decode
        black_box(decoded_summary);
        debug_assert_eq!(current.get_block(), decoded.get_block());
    })
}

/// Smoke test: the benchmark closure must survive 100 consecutive roundtrips.
#[test]
fn test_benchmark_roundtrip_coefficient() {
    let mut roundtrip = benchmark_roundtrip_coefficient();
    (0..100).for_each(|_| roundtrip());
}


================================================
FILE: lib/src/structs/lepton_file_reader.rs
================================================
/*---------------------------------------------------------------------------------------------
 *  Copyright (c) Microsoft Corporation. All rights reserved.
 *  Licensed under the Apache License, Version 2.0. See LICENSE.txt in the project root for license information.
 *  This software incorporates material from third parties. See NOTICE.txt for details.
 *--------------------------------------------------------------------------------------------*/

use std::collections::VecDeque;
use std::io::{BufRead, Cursor, Read, Write};
use std::mem;
use std::sync::mpsc::Sender;

use default_boxed::DefaultBoxed;
#[cfg(feature = "detailed_tracing")]
use log::info;
use log::warn;

use crate::enabled_features::EnabledFeatures;
use crate::jpeg::block_based_image::BlockBasedImage;
use crate::jpeg::jpeg_code;
use crate::jpeg::jpeg_header::{JpegHeader, ReconstructionInfo, RestartSegmentCodingInfo};
use crate::jpeg::jpeg_write::{JpegIncrementalWriter, jpeg_write_entire_scan};
use crate::lepton_error::{AddContext, ExitCode, Result, err_exit_code};
use crate::metrics::{CpuTimeMeasure, Metrics};
use crate::structs::lepton_decoder::lepton_decode_row_range;
use crate::structs::lepton_header::{FIXED_HEADER_SIZE, LeptonHeader};
use crate::structs::multiplexer::{
    MultiplexReadResult, MultiplexReader, MultiplexReaderState, multiplex_read,
};
use crate::structs::partial_buffer::PartialBuffer;
use crate::structs::quantization_tables::QuantizationTables;
use crate::structs::simple_threadpool::ThreadPoolHolder;
use crate::structs::thread_handoff::ThreadHandoff;
use crate::{LeptonThreadPool, consts::*};

/// Decodes a complete Lepton stream from `reader` and writes the resulting JPEG to `writer`.
///
/// The input is pulled chunk by chunk via `fill_buf` and pushed through a
/// [`LeptonFileReader`]; an empty chunk signals the end of the input.
///
/// # Parameters
///
/// - `reader`: A buffered reader from which the Lepton-encoded data is read.
/// - `writer`: A writer to which the decoded JPEG image is written.
/// - `enabled_features`: A set of toggles for enabling/disabling decoding features/restrictions.
/// - `thread_pool`: The thread pool used for parallel processing, e.g. `DEFAULT_THREAD_POOL`.
///
/// # Returns
///
/// The metrics collected by the decoder.
pub fn decode_lepton<R: BufRead, W: Write>(
    reader: &mut R,
    writer: &mut W,
    enabled_features: &EnabledFeatures,
    thread_pool: &dyn LeptonThreadPool,
) -> Result<Metrics> {
    let mut decoder =
        LeptonFileReader::new(enabled_features.clone(), ThreadPoolHolder::Dyn(thread_pool));

    loop {
        let chunk = reader.fill_buf().context()?;
        let chunk_len = chunk.len();

        // an empty chunk means the reader is exhausted; the decoder still has to
        // be told so that it can flush its remaining output
        let at_end = chunk_len == 0;

        decoder.process_buffer(chunk, at_end, writer).context()?;

        if at_end {
            return Ok(decoder.take_metrics());
        }

        reader.consume(chunk_len);
    }
}

/// Debug helper, only called by the utility EXE code, used to dump the contents of a file.
///
/// Reads the fixed header, the compressed header and the completion marker
/// directly from `reader`, then runs the progressive decoding threads over the
/// rest of the input and merges their per-thread results into one
/// `BlockBasedImage` per component.
///
/// # Returns
///
/// The parsed header together with the merged coefficient images.
///
/// # Errors
///
/// Fails if the headers cannot be read, the completion marker is missing, any
/// decoder thread fails, or no image data was produced at all.
#[allow(dead_code)]
pub fn decode_lepton_file_image<R: BufRead>(
    reader: &mut R,
    enabled_features: &EnabledFeatures,
    thread_pool: &dyn LeptonThreadPool,
) -> Result<(Box<LeptonHeader>, Vec<BlockBasedImage>)> {
    let mut lh = LeptonHeader::default_boxed();
    let mut enabled_features = enabled_features.clone();

    let mut fixed_header_buffer = [0; FIXED_HEADER_SIZE];
    reader.read_exact(&mut fixed_header_buffer).context()?;

    let compressed_header_size = lh
        .read_lepton_fixed_header(&fixed_header_buffer, &mut enabled_features)
        .context()?;

    lh.read_compressed_lepton_header(reader, &mut enabled_features, compressed_header_size)
        .context()?;

    let mut buf = [0; 3];
    reader.read_exact(&mut buf).context()?;

    if buf != LEPTON_HEADER_COMPLETION_MARKER {
        return err_exit_code(ExitCode::BadLeptonFile, "CMP marker not found");
    }

    let mut state = LeptonFileReader::run_lepton_decoder_threads(
        &lh,
        &enabled_features,
        4,
        thread_pool,
        progressive_decoding_thread,
    )
    .context()?;

    let mut results = Vec::new();

    // process the rest of the file (except for the 4 byte EOF marker)
    let mut extra_buffer = Vec::new();
    loop {
        let b = reader.fill_buf().context()?;
        let b_len = b.len();
        if b_len == 0 {
            break;
        }
        state.process_buffer(&mut PartialBuffer::new(b, &mut extra_buffer))?;
        reader.consume(b_len);

        // pick up anything that is already finished without blocking
        if let Some(r) = state.retrieve_result(false)? {
            results.push(r);
        }
    }

    // the input is exhausted, so now wait for all remaining results
    while let Some(r) = state.retrieve_result(true)? {
        results.push(r);
    }

    // without at least one result there is nothing to merge; report that as a
    // bad file instead of panicking on the index below
    if results.is_empty() {
        return err_exit_code(
            ExitCode::BadLeptonFile,
            "no image data was decoded from the Lepton file",
        );
    }

    // merge the corresponding components so that we get a single set of coefficient maps (since each thread did a piece of the work)
    let num_components = results[0].len();

    let mut block_image = Vec::with_capacity(num_components);
    for i in 0..num_components {
        block_image.push(BlockBasedImage::merge(&mut results, i).context()?);
    }

    Ok((lh, block_image))
}

/// The stages that `LeptonFileReader::process_buffer` moves through while
/// consuming a Lepton file.
enum DecoderState {
    /// Waiting for the `FIXED_HEADER_SIZE` bytes of the fixed header.
    FixedHeader(),
    /// Waiting for the compressed header; the payload is its size as returned
    /// by `read_lepton_fixed_header`.
    CompressedHeader(usize),
    /// Waiting for the 3 byte `LEPTON_HEADER_COMPLETION_MARKER`.
    CMP(),
    /// Decoding a JPEG that is not single-scan: the worker results are block
    /// images, and JPEG output is only produced once the input is complete.
    ScanProgressive(MultiplexReaderState<Vec<BlockBasedImage>>),
    /// Decoding a single-scan JPEG: the worker results are byte buffers that
    /// are written to the output as they become available.
    ScanBaseline(MultiplexReaderState<Vec<u8>>),
    /// Terminal state, entered after the tail and garbage data were written.
    EOI,
}

/// A `Write` adapter with a byte budget: once the budget is used up, further
/// data is silently dropped while still being reported as written.
///
/// This is used to ensure that we do not write more data than the expected JPEG file size during decoding.
struct LimitedOutputWriter<'a, W: Write> {
    /// the writer that receives the data that fits into the budget
    inner: &'a mut W,
    /// remaining budget in bytes, reduced by every byte passed on to `inner`
    amount_left: &'a mut u64,
}

impl<W: Write> Write for LimitedOutputWriter<'_, W> {
    fn write(&mut self, buf: &[u8]) -> std::io::Result<usize> {
        // never hand more than the remaining budget to the inner writer
        let allowed = (buf.len() as u64).min(*self.amount_left) as usize;
        let accepted = self.inner.write(&buf[..allowed])?;

        *self.amount_left -= accepted as u64;

        Ok(if accepted < allowed {
            // short write below the limit: report it so that the caller retries the rest
            accepted
        } else {
            // everything allowed went out; claim the whole buffer so that the
            // part beyond the budget is silently truncated
            buf.len()
        })
    }

    fn flush(&mut self) -> std::io::Result<()> {
        self.inner.flush()
    }
}

/// A `Write` implementation over a fixed size output slice. Whatever does
/// not fit into the slice is appended to a queue so that it can be written
/// out first on the next call.
struct FixedBufferOuputWriter<'a> {
    /// number of bytes of `output_buffer` that are already filled
    amount_written: usize,
    /// destination slice, filled front to back
    output_buffer: &'a mut [u8],
    /// receives every byte that no longer fits into `output_buffer`
    extra_queue: &'a mut VecDeque<u8>,
}

impl Write for FixedBufferOuputWriter<'_> {
    fn write(&mut self, buf: &[u8]) -> std::io::Result<usize> {
        // split the input into the part that still fits and the overflow
        let room = self.output_buffer.len() - self.amount_written;
        let (fits, overflow) = buf.split_at(buf.len().min(room));

        let end = self.amount_written + fits.len();
        self.output_buffer[self.amount_written..end].copy_from_slice(fits);
        self.amount_written = end;

        if !overflow.is_empty() {
            self.extra_queue.extend(overflow);
        }

        // between the slice and the queue everything is always accepted
        Ok(buf.len())
    }

    fn flush(&mut self) -> std::io::Result<()> {
        // there is no intermediate buffering, so nothing needs to be flushed
        Ok(())
    }
}

/// This is the state machine for the decoder for reading lepton files. The
/// data is pushed into the state machine and processed in chunks. Once
/// the calculations are done the data is retrieved from the output buffers.
pub struct LeptonFileReader<'a> {
    /// current stage of the decoding state machine
    state: DecoderState,
    /// the Lepton header, filled in while the header stages are processed
    lh: Box<LeptonHeader>,
    /// feature toggles; passed mutably to the header readers
    enabled_features: EnabledFeatures,
    /// backing storage handed to `PartialBuffer` on every `process_buffer` call
    /// (presumably holds input carried over between calls; confirm in `PartialBuffer`)
    extra_buffer: Vec<u8>,
    /// metrics merged from the decoder threads once decoding has finished
    metrics: Metrics,
    /// total number of input bytes passed to `process_buffer` so far
    total_read_size: u64,
    /// remaining output budget in bytes, used as the limit of `LimitedOutputWriter`
    jpeg_file_size_left: u64,
    /// set once a caller has marked the input as complete
    input_complete: bool,
    /// thread pool used to run the decoder threads
    thread_pool: ThreadPoolHolder<'a>,
}

impl<'a> LeptonFileReader<'a> {
    /// Creates a new LeptonFileReader.
    pub fn new(features: EnabledFeatures, thread_pool: ThreadPoolHolder<'a>) -> Self {
        LeptonFileReader {
            state: DecoderState::FixedHeader(),
            lh: LeptonHeader::default_boxed(),
            enabled_features: features,
            extra_buffer: Vec::new(),
            metrics: Metrics::default(),
            total_read_size: 0,
            input_complete: false,
            jpeg_file_size_left: 0,
            thread_pool,
        }
    }

    /// Processes a buffer of data of the file, which can be a slice of 0 or more characters.
    /// If the input is complete, then input_complete should be set to true.
    ///
    /// Any available output is written to the output buffer, which can be zero if the
    /// input is not yet complete. Once the input has been marked as complete, the
    /// call will return all remaining output.
    ///
    /// # Arguments
    /// * `input` - The input buffer to process.
    /// * `input_complete` - True if the input is complete and no more data will be provided.
    /// * `writer` - The writer to write the output to.
    pub fn process_buffer(
        &mut self,
        in_buffer: &[u8],
        input_complete: bool,
        output: &mut impl Write,
    ) -> Result<()> {
        if self.input_complete && in_buffer.len() > 0 {
            return err_exit_code(
                ExitCode::SyntaxError,
                "ERROR: input was marked as complete but more data was provided",
            );
        }

        self.total_read_size += in_buffer.len() as u64;

        let mut in_buffer = PartialBuffer::new(in_buffer, &mut self.extra_buffer);
        while in_buffer.continue_processing() {
            match &mut self.state {
                DecoderState::FixedHeader() => {
                    if let Some(v) = in_buffer.take(FIXED_HEADER_SIZE, 0) {
                        let compressed_header_size = self
                            .lh
                            .read_lepton_fixed_header(
                                &v.try_into().unwrap(),
                                &mut self.enabled_features,
                            )
                            .context()?;

                        self.state = DecoderState::CompressedHeader(compressed_header_size);

                        self.jpeg_file_size_left = u64::from(self.lh.jpeg_file_size);
                    }
                }
                DecoderState::CompressedHeader(compressed_length) => {
                    if let Some(v) = in_buffer.take(*compressed_length, 0) {
                        self.lh
                            .read_compressed_lepton_header(
                                &mut Cursor::new(v),
                                &mut self.enabled_features,
                                *compressed_length,
                            )
                            .context()?;

                        // we need to truncate our file to the size minus the garbage data so
                        // that when we hit the garbage data, we have room to write it all out
                        if !self.lh.bad_truncation_version() {
                            self.jpeg_file_size_left -= self.lh.rinfo.garbage_data.len() as u64;
                        }

                        self.state = DecoderState::CMP();
                    }
                }
                DecoderState::CMP() => {
                    if let Some(v) = in_buffer.take(3, 0) {
                        let mut limited_output = LimitedOutputWriter {
                            inner: output,
                            amount_left: &mut self.jpeg_file_size_left,
                        };

                        self.state = Self::process_cmp(
                            v,
                            &self.lh,
                            &self.enabled_features,
                            &self.thread_pool,
                            &mut limited_output,
                        )?;
                    }
                }

                DecoderState::ScanProgressive(state) => {
                    state.process_buffer(&mut in_buffer)?;

                    if input_complete {
                        Self::verify_eof_file_size(self.total_read_size, &mut in_buffer)?;

                        // complete the operation and merge the metrics
                        // progressive JPEGs cannot return partial results
                        let mut results = Vec::new();
                        while let Some(r) = state.retrieve_result(true)? {
                            results.push(r);
                        }

                        let mut limited_output = LimitedOutputWriter {
                            inner: output,
                            amount_left: &mut self.jpeg_file_size_left,
                        };

                        Self::process_progressive(
                            &mut self.lh,
                            &self.enabled_features,
                            results,
                            &mut limited_output,
                        )?;

                        write_tail(&mut self.lh, &mut limited_output)?;

                        // here we write out any garbage data verbatim without truncating it
                        write_garbage_data(&self.lh, limited_output)?;

                        self.metrics.merge_from(state.take_metrics());

                        self.state = DecoderState::EOI;
                    }
                }
                DecoderState::ScanBaseline(state) => {
                    state.process_buffer(&mut in_buffer)?;

                    let mut limited_output = LimitedOutputWriter {
                        inner: output,
                        amount_left: &mut self.jpeg_file_size_left,
                    };

                    // baseline images can return partial results as decoding progresses,
                    // send these to the output by querying the state machine with complete
                    // set to false, which means we won't block if nothing is available
                    while let Some(r) = state.retrieve_result(false)? {
                        limited_output.write_all(&r)?;
                    }

                    if input_complete {
                        Self::verify_eof_file_size(self.total_read_size, &mut in_buffer)?;

                        // once we've complete the input, block for all remaining results
                        while let Some(r) = state.retrieve_result(true)? {
                            limited_output.write_all(&r)?;
                        }

                        // Injection of restart codes for RST errors supports JPEGs with trailing RSTs.
                        // Run this logic even if early_eof_encountered to be compatible with C++ version.
                        //
                        // This logic is no longer needed for Rust generated Lepton files, since we just use the garbage
                        // data to store any extra RST codes or whatever else might be at the end of the file.
                        if self.lh.rinfo.rst_err.len() > 0 {
                            let cumulative_reset_markers = if self.lh.jpeg_header.rsti != 0 {
                                (self.lh.jpeg_header.mcuc - 1) / self.lh.jpeg_header.rsti
                            } else {
                                0
                            } as u8;

                            for i in 0..self.lh.rinfo.rst_err[0] {
                                let rst = jpeg_code::RST0 + ((cumulative_reset_markers + i) & 7);

                                limited_output.write_all(&[0xff, rst])?;
                            }
                        }

                        write_tail(&mut self.lh, &mut limited_output)?;

                        write_garbage_data(&self.lh, limited_output)?;

                        self.metrics.merge_from(state.take_metrics());

                        self.state = DecoderState::EOI;
                    }
                }
                DecoderState::EOI => {
                    break;
                }
            }
        }

        if input_complete {
            self.input_complete = true;
            match self.state {
                DecoderState::EOI => {
                    // all good, we don't need any more data to continue decoding
                }
                _ => {
                    return err_exit_code(ExitCode::SyntaxError,
                    format!("ERROR: input was marked as complete, but the decoder in state {:?} still needs more data",
                    std::mem::discriminant(&self.state)).as_str());
                }
            }
        }

        Ok(())
    }

    /// Variant of `process_buffer` for callers that provide a fixed size output buffer.
    ///
    /// Output that does not fit into `output_buffer` is parked in `output_extra` and is
    /// copied out first on the next call. This is necessary because in the unmanaged
    /// wrapper we cannot expand the buffer that was given to us, and it keeps the main
    /// processing loop free of the case where the output buffer is too small.
    ///
    /// Returns a tuple `(complete, amount_written)`: `amount_written` is the number of
    /// bytes placed into `output_buffer`, and `complete` is true only if
    /// `input_complete` was set and nothing is left waiting in `output_extra`.
    pub fn process_limited_buffer(
        &mut self,
        input: &[u8],
        input_complete: bool,
        output_buffer: &mut [u8],
        output_extra: &mut VecDeque<u8>,
    ) -> std::io::Result<(bool, usize)> {
        // output left over from the previous call goes out before anything new;
        // a single read may return less than what is queued, hence the loop
        let mut filled = 0;
        while filled < output_buffer.len() && !output_extra.is_empty() {
            let copied = output_extra.read(&mut output_buffer[filled..]).unwrap();
            filled += copied;
        }

        // whatever space is left is available for newly decoded data
        let mut sink = FixedBufferOuputWriter {
            amount_written: filled,
            output_buffer,
            extra_queue: output_extra,
        };

        self.process_buffer(input, input_complete, &mut sink)?;

        let everything_delivered = input_complete && sink.extra_queue.is_empty();
        Ok((everything_delivered, sink.amount_written))
    }

    /// destructively reads the metrics
    pub fn take_metrics(&mut self) -> Metrics {
        mem::take(&mut self.metrics)
    }

    /// Returns a reference to the metrics collected so far, without resetting
    /// them (see `take_metrics` for the destructive variant).
    pub fn metrics(&self) -> &Metrics {
        &self.metrics
    }

    fn process_progressive(
        lh: &mut LeptonHeader,
        enabled_features: &EnabledFeatures,
        mut image_segments: Vec<Vec<BlockBasedImage>>,
        output: &mut impl Write,
    ) -> Result<()> {
        let num_components = image_segments[0].len();
        let mut merged = Vec::new();
        for i in 0..num_components {
            merged.push(BlockBasedImage::merge(&mut image_segments, i).context()?);
        }

        output.write_all(&SOI)?;
        output
            .write_all(&lh.rinfo.raw_jpeg_header[0..lh.raw_jpeg_header_read_index])
            .context()?;

        let mut scnc = 0;

        loop {
            // progressive JPEG consists of scans followed by headers
            let scan =
                jpeg_write_entire_scan(&merged[..], &lh.jpeg_header, &lh.rinfo, scnc).context()?;

            output.write_all(&scan).context()?;

            // read the next headers (DHT, etc) while mirroring it back to the writer
            let old_pos = lh.raw_jpeg_header_read_index;
            let result = lh.advance_next_header_segment(enabled_features).context()?;

            output
                .write_all(&lh.rinfo.raw_jpeg_header[old_pos..lh.raw_jpeg_header_read_index])
                .context()?;

            if !result {
                break;
            }

            // advance to next scan
            scnc += 1;
        }

        Ok(())
    }

    /// Checks the header completion marker in `v`, starts the decoder threads
    /// and returns the scan state the decoder continues in.
    ///
    /// For single-scan JPEGs, `SOI` and the raw JPEG header read so far are
    /// written to `output` right away and the baseline threads are started.
    /// Everything else goes through the progressive threads and writes nothing yet.
    fn process_cmp(
        v: Vec<u8>,
        lh: &LeptonHeader,
        enabled_features: &EnabledFeatures,
        thread_pool: &dyn LeptonThreadPool,
        output: &mut impl Write,
    ) -> Result<DecoderState> {
        if v[..] != LEPTON_HEADER_COMPLETION_MARKER {
            return err_exit_code(ExitCode::BadLeptonFile, "CMP marker not found");
        }

        if lh.jpeg_header.is_single_scan() {
            // the header can go out immediately, the scan data follows as the
            // threads produce it
            output.write_all(&SOI)?;
            output
                .write_all(&lh.rinfo.raw_jpeg_header[0..lh.raw_jpeg_header_read_index])
                .context()?;

            let mux = Self::run_lepton_decoder_threads(
                lh,
                enabled_features,
                4, /*retain 4 bytes for the end for the file size that is appended */
                thread_pool,
                baseline_decoding_thread,
            )?;

            return Ok(DecoderState::ScanBaseline(mux));
        }

        // use progressive logic, which reads the entire block into memory and then performs
        // the jpeg decoding. This permits multiple scans that are each encoded in two cases:
        //  - progressive images
        //  - baseline multiscan images (rare but permitted)
        let mux = Self::run_lepton_decoder_threads(
            lh,
            enabled_features,
            4, /* retain the last 4 bytes for the very end, since that is the file size, and shouldn't be parsed */
            thread_pool,
            progressive_decoding_thread,
        )
        .context()?;

        Ok(DecoderState::ScanProgressive(mux))
    }

    /// Takes the 4 byte little-endian size field from `in_buffer` and checks
    /// that it equals `total_read_size`, the number of input bytes seen so far.
    ///
    /// Returns a `VerificationLengthMismatch` error if the field is missing or
    /// holds a different value.
    fn verify_eof_file_size(total_read_size: u64, in_buffer: &mut PartialBuffer<'_>) -> Result<()> {
        match in_buffer.take_n::<4>(0) {
            None => err_exit_code(
                ExitCode::VerificationLengthMismatch,
                "Missing EOF file size",
            ),
            Some(raw) => {
                let size = u32::from_le_bytes(raw);
                if u64::from(size) == total_read_size {
                    Ok(())
                } else {
                    err_exit_code(
                        ExitCode::VerificationLengthMismatch,
                        format!(
                            "ERROR mismatch input_len = {0}, decoded_len = {1}",
                            size, total_read_size
                        ),
                    )
                }
            }
        }
    }

    /// Starts the decoder threads, one per thread handoff in the header.
    ///
    /// Everything a thread needs (features, quantization tables, handoffs, JPEG
    /// header and reconstruction info) is cloned up front and moved into the
    /// closure given to `multiplex_read`, which calls `process` for each
    /// thread id. `retention_bytes` is passed through to `multiplex_read`.
    fn run_lepton_decoder_threads<P: Send + 'static>(
        lh: &LeptonHeader,
        features: &EnabledFeatures,
        retention_bytes: usize,
        thread_pool: &dyn LeptonThreadPool,
        process: fn(
            reader: &mut MultiplexReader,
            features: &EnabledFeatures,
            qt: &[QuantizationTables],
            thread_handoff: &ThreadHandoff,
            jpeg_header: &JpegHeader,
            rinfo: &ReconstructionInfo,
            is_last_thread: bool,
            sender: &Sender<MultiplexReadResult<P>>,
        ) -> Result<()>,
    ) -> Result<MultiplexReaderState<P>> {
        let qt = QuantizationTables::construct_quantization_tables(&lh.jpeg_header)?;

        // owned copies, since the closure below has to own whatever it uses
        let features = features.clone();
        let handoffs = lh.thread_handoff.clone();
        let jpeg_header = lh.jpeg_header.clone();
        let rinfo = lh.rinfo.clone();

        let thread_count = handoffs.len();
        let max_processor_threads = features.max_processor_threads as usize;

        Ok(multiplex_read(
            thread_count,
            max_processor_threads,
            thread_pool,
            retention_bytes,
            move |thread_id, reader, result_tx| {
                let is_last_thread = thread_id == handoffs.len() - 1;

                process(
                    reader,
                    &features,
                    &qt,
                    &handoffs[thread_id],
                    &jpeg_header,
                    &rinfo,
                    is_last_thread,
                    result_tx,
                )
            },
        ))
    }
}

fn write_tail(lh: &mut LeptonHeader, output: &mut impl Write) -> Result<()> {
    output
        .write_all(&lh.rinfo.raw_jpeg_header[lh.raw_jpeg_header_read_index..])
        .context()?;
    Ok(())
}

/// The thread function for progressive decoding.
///
/// A progressive JPEG makes several passes over the same image data, so the
/// threads can only decode their share of the coefficients in parallel. Each
/// thread sends its block images as a single result, followed by its metrics;
/// the caller merges the images and writes the actual scans.
fn progressive_decoding_thread(
    reader: &mut MultiplexReader,
    features: &EnabledFeatures,
    qt: &[QuantizationTables],
    thread_handoff: &ThreadHandoff,
    jpeg_header: &JpegHeader,
    rinfo: &ReconstructionInfo,
    is_last_thread: bool,
    sender: &Sender<MultiplexReadResult<Vec<BlockBasedImage>>>,
) -> Result<()> {
    let timer: CpuTimeMeasure = CpuTimeMeasure::new();

    let first_row = thread_handoff.luma_y_start;
    let end_row = thread_handoff.luma_y_end;

    // nothing to do per row, the images are handed over as a whole at the end
    let (mut worker_metrics, decoded_images) = lepton_decode_row_range(
        qt,
        jpeg_header,
        &rinfo.truncate_components,
        reader,
        first_row,
        end_row,
        is_last_thread,
        true,
        features,
        |_, _| Ok(()),
    )?;

    worker_metrics.record_cpu_worker_time(timer.elapsed());

    sender.send(MultiplexReadResult::Result(decoded_images))?;
    sender.send(MultiplexReadResult::Complete(worker_metrics))?;

    Ok(())
}

/// The thread function for baseline decoding.
///
/// For baseline images both the coefficient decoding and the JPEG writing can
/// run in parallel: each thread decodes its own segment, turns it into JPEG
/// bytes row by row and sends those bytes out in chunks. The total amount sent
/// is capped at the segment size recorded in the thread handoff.
fn baseline_decoding_thread(
    reader: &mut MultiplexReader,
    features: &EnabledFeatures,
    qt: &[QuantizationTables],
    thread_handoff: &ThreadHandoff,
    jpeg_header: &JpegHeader,
    rinfo: &ReconstructionInfo,
    is_last_thread: bool,
    sender: &Sender<MultiplexReadResult<Vec<u8>>>,
) -> Result<()> {
    /// size at which buffered JPEG data is sent to the consumer
    const BUFFER_SIZE: usize = 128 * 1024;

    /// Shortens `chunk` to `budget` bytes, logging a warning if anything gets cut off.
    fn clamp_to_budget(chunk: &mut Vec<u8>, budget: usize) {
        if chunk.len() > budget {
            warn!(
                "Truncating output buffer from {} to {}",
                chunk.len(),
                budget
            );
            chunk.truncate(budget);
        }
    }

    let cpu_time: CpuTimeMeasure = CpuTimeMeasure::new();

    let restart_info = RestartSegmentCodingInfo {
        overhang_byte: thread_handoff.overhang_byte,
        num_overhang_bits: thread_handoff.num_overhang_bits,
        luma_y_start: thread_handoff.luma_y_start,
        luma_y_end: thread_handoff.luma_y_end,
        last_dc: thread_handoff.last_dc,
    };

    // number of bytes this segment is still allowed to produce
    let mut amount_left = thread_handoff.segment_size as usize;

    let mut inc_writer =
        JpegIncrementalWriter::new(BUFFER_SIZE, rinfo, Some(&restart_info), jpeg_header, 0);

    let (mut metrics, _decoded_rows) = lepton_decode_row_range(
        qt,
        jpeg_header,
        &rinfo.truncate_components,
        reader,
        thread_handoff.luma_y_start,
        thread_handoff.luma_y_end,
        is_last_thread,
        true,
        features,
        |row_spec, image_data| {
            inc_writer.process_row(row_spec, image_data).context()?;

            // keep buffering until a full chunk has accumulated
            if inc_writer.amount_buffered() < BUFFER_SIZE {
                return Ok(());
            }

            let mut chunk = inc_writer.detach_buffer();
            clamp_to_budget(&mut chunk, amount_left);
            amount_left -= chunk.len();

            sender.send(MultiplexReadResult::Result(chunk))?;

            Ok(())
        },
    )?;

    metrics.record_cpu_worker_time(cpu_time.elapsed());

    // whatever is still buffered goes out as the final chunk
    let mut last_chunk = inc_writer.detach_buffer();
    clamp_to_budget(&mut last_chunk, amount_left);

    sender.send(MultiplexReadResult::Result(last_chunk))?;

    sender.send(MultiplexReadResult::Complete(metrics))?;

    Ok(())
}

/// Writes the garbage bytes stored in the Lepton header (`lh.rinfo.garbage_data`)
/// to the output.
///
/// Files produced by an encoder for which `bad_truncation_version()` is true are
/// written through `limited_output` itself; all other files are written straight
/// to `limited_output.inner`.
fn write_garbage_data(
    lh: &LeptonHeader,
    mut limited_output: LimitedOutputWriter<'_, impl Write>,
) -> Result<()> {
    if lh.bad_truncation_version() {
        // the bad encoder wrote the garbage data in such a way that it could
        // be truncated (see DecoderState::CompressedHeader case above), so it
        // has to go through the limiting writer
        limited_output.write_all(&lh.rinfo.garbage_data).context()?;
        return Ok(());
    }

    // normal case: the garbage data is emitted verbatim, without truncation
    // (the max file size was already shrunk accordingly)
    let unlimited = &mut limited_output.inner;
    unlimited.write_all(&lh.rinfo.garbage_data).context()?;

    Ok(())
}

// Unit tests for the Lepton file reader: header round-tripping, decoding of the
// checked-in `.lep` fixtures back to their `.jpg` originals, streaming output and
// error handling for undersized output buffers.
#[cfg(test)]
mod tests {
    use super::*;
    use std::io::{BufWriter, Cursor};

    use default_boxed::DefaultBoxed;

    use crate::{
        DEFAULT_THREAD_POOL, EnabledFeatures, SingleThreadPool, decode_lepton,
        helpers::read_file,
        structs::{
            lepton_header::{FIXED_HEADER_SIZE, LeptonHeader},
            thread_handoff::ThreadHandoff,
        },
    };

    // test serializing and deserializing header
    #[test]
    fn parse_and_write_header() {
        use crate::jpeg::jpeg_read::read_jpeg_file;
        use std::io::Read;

        let min_jpeg = read_file("tiny", ".jpg");

        let mut lh = LeptonHeader::default_boxed();
        let enabled_features = EnabledFeatures::compat_lepton_vector_read();

        lh.jpeg_file_size = min_jpeg.len() as u32;
        lh.uncompressed_lepton_header_size = Some(752);

        // populate the JPEG header and reconstruction info from a real (tiny) JPEG
        let (_image_data, _partitions, _end_scan) = read_jpeg_file(
            &mut Cursor::new(min_jpeg),
            &mut lh.jpeg_header,
            &mut lh.rinfo,
            &enabled_features,
            |_, _| {},
        )
        .unwrap();

        // a single hand-written handoff is enough for the header to be serialized
        lh.thread_handoff.push(ThreadHandoff {
            luma_y_start: 0,
            luma_y_end: 1,
            segment_offset_in_file: 0,
            segment_size: 1000,
            overhang_byte: 0,
            num_overhang_bits: 1,
            last_dc: [1, 2, 3, 4],
        });

        let mut serialized = Vec::new();
        lh.write_lepton_header(&mut Cursor::new(&mut serialized), &enabled_features)
            .unwrap();

        // read the serialized header back into a fresh LeptonHeader
        let mut other = LeptonHeader::default_boxed();
        let mut other_reader = Cursor::new(&serialized);

        let mut fixed_buffer = [0; FIXED_HEADER_SIZE];
        other_reader.read_exact(&mut fixed_buffer).unwrap();

        let mut other_enabled_features = EnabledFeatures::compat_lepton_vector_read();

        let compressed_header_size = other
            .read_lepton_fixed_header(&fixed_buffer, &mut other_enabled_features)
            .unwrap();
        other
            .read_compressed_lepton_header(
                &mut other_reader,
                &mut other_enabled_features,
                compressed_header_size,
            )
            .unwrap();

        // only the uncompressed header size is compared between the two headers
        assert_eq!(
            lh.uncompressed_lepton_header_size,
            other.uncompressed_lepton_header_size
        );
    }

    /// decodes the `androidprogressive` fixture and compares it to the original JPEG
    #[test]
    fn test_simple_parse_progressive() {
        test_file("androidprogressive")
    }

    /// decodes the `android` fixture and compares it to the original JPEG
    #[test]
    fn test_simple_parse_baseline() {
        test_file("android")
    }

    /// decodes the `androidtrail` fixture and compares it to the original JPEG
    #[test]
    fn test_simple_parse_trailing() {
        test_file("androidtrail")
    }

    /// decodes the `zeros_in_dqt_tables` fixture and compares it to the original JPEG
    #[test]
    fn test_zero_dqt() {
        test_file("zeros_in_dqt_tables")
    }

    /// truncated progressive JPEG. We don't support creating these, but we can read them
    #[test]
    fn test_pixelated() {
        test_file("pixelated")
    }

    /// requires that the last segment be truncated by 1 byte.
    /// This is for compatibility with the C++ version
    #[test]
    fn test_truncate4() {
        test_file("truncate4")
    }

    /// decodes `iphone.lep` using `SingleThreadPool` instead of the default thread pool
    /// and checks the output is identical to `iphone.jpg`
    #[test]
    fn test_decode_single_threaded() {
        let filename = "iphone";
        let file = read_file(filename, ".lep");
        let original = read_file(filename, ".jpg");

        let enabled_features = EnabledFeatures::compat_lepton_vector_read();

        let mut output = Vec::new();
        decode_lepton(
            &mut Cursor::new(&file),
            &mut output,
            &enabled_features,
            &SingleThreadPool::default(),
        )
        .unwrap();

        // compare lengths first so that a size mismatch is reported with actual numbers
        assert_eq!(output.len(), original.len());
        assert!(output == original);
    }

    /// encodes `iphone.jpg` using `SingleThreadPool`; only checks that encoding succeeds
    #[test]
    fn test_encode_single_threaded() {
        let filename = "iphone";
        let file = read_file(filename, ".jpg");

        let enabled_features = EnabledFeatures::compat_lepton_vector_read();

        let mut output = Vec::new();
        crate::encode_lepton(
            &mut Cursor::new(&file),
            &mut Cursor::new(&mut output),
            &enabled_features,
            &SingleThreadPool::default(),
        )
        .unwrap();
    }

    /// Shared helper: reads `<filename>.lep` and `<filename>.jpg`, checks that
    /// `decode_lepton_file_image` succeeds on the Lepton file, then decodes it with
    /// `decode_lepton` and asserts the result is byte-identical to the JPEG.
    fn test_file(filename: &str) {
        let file = read_file(filename, ".lep");
        let original = read_file(filename, ".jpg");

        let enabled_features = EnabledFeatures::compat_lepton_vector_read();

        // result is discarded; this only verifies that the call does not fail
        let _ = decode_lepton_file_image(
            &mut Cursor::new(&file),
            &enabled_features,
            &DEFAULT_THREAD_POOL,
        )
        .unwrap();

        let mut output = Vec::new();

        decode_lepton(
            &mut Cursor::new(&file),
            &mut output,
            &enabled_features,
            &DEFAULT_THREAD_POOL,
        )
        .unwrap();

        assert_eq!(output.len(), original.len());
        assert!(output == original);
    }

    /// Test writer that forwards to an inner writer one byte per `write` call and
    /// keeps a running count of the bytes the inner writer accepted.
    struct RecordStreamPosition<W: Write> {
        writer: W,
        position: u64,
    }

    impl<W: Write> Write for RecordStreamPosition<W> {
        fn write(&mut self, buf: &[u8]) -> std::io::Result<usize> {
            if buf.len() == 0 {
                return Ok(0);
            }

            // only accept one byte at a time to test position tracking
            let n = self.writer.write(&[buf[0]])?;
            self.position += n as u64;
            Ok(n)
        }

        fn flush(&mut self) -> std::io::Result<()> {
            self.writer.flush()
        }
    }

    /// decodes `hq.lep` through a writer that only accepts a single byte per call
    /// and checks the output still matches `hq.jpg`
    #[test]
    fn test_streaming_results() {
        let data = read_file("hq", ".lep");
        let original_data = read_file("hq", ".jpg");

        let mut cursor = Cursor::new(&data);

        let mut output_vector = Vec::new();

        let mut output = RecordStreamPosition {
            writer: BufWriter::new(&mut output_vector),
            position: 0,
        };

        let enabled_features = EnabledFeatures::compat_lepton_vector_read();
        let thread_pool = &DEFAULT_THREAD_POOL;

        decode_lepton(&mut cursor, &mut output, &enabled_features, thread_pool).unwrap();

        // dropping the wrapper (and its BufWriter) releases the borrow of output_vector
        drop(output);

        assert_eq!(output_vector.len(), original_data.len());
        assert!(output_vector == original_data);
    }

    /// ensure we fail if the output buffer is too small
    #[test]
    fn test_too_small_output() {
        let original = read_file("slrcity", ".lep");

        // fixed-size output slice that is only half as large as the Lepton input
        let mut output = Vec::new();
        output.resize(original.len() / 2, 0u8);

        let r = decode_lepton(
            &mut Cursor::new(&original),
            &mut Cursor::new(&mut output[..]),
            &EnabledFeatures::compat_lepton_vector_read(),
            &DEFAULT_THREAD_POOL,
        );

        assert!(r.is_err() && r.err().unwrap().exit_code() == ExitCode::OsError);
    }

    /// Shared helper: decodes `<filename>.lep` with `decode_lepton` and asserts the
    /// result is byte-identical to `<filename>.jpg`.
    fn verifydecode(filename: &str) {
        let original = read_file(filename, ".lep");

        let mut output = Vec::new();

        let _ = decode_lepton(
            &mut Cursor::new(&original),
            &mut Cursor::new(&mut output),
            &EnabledFeatures::compat_lepton_vector_read(),
            &DEFAULT_THREAD_POOL,
        )
        .unwrap();

        let jpg = read_file(filename, ".jpg");

        assert_eq!(jpg.len(), output.len());
        assert!(output == jpg);
    }

    /// test we can decode an invalid file generated by a regression in the 5.5 version,
    /// which triggers the bad_truncation_version() check in the LeptonHeader.
    #[test]
    fn test_truncated_with_bad_truncation_version() {
        verifydecode("half_scan_rust55");
    }

    /// test we can decode the same file as above, but encoded by a
    /// correctly behaving encoder
    #[test]
    fn test_truncated_with_ok_truncation_version() {
        verifydecode("half_scan");
    }

    /// tests corner case where we have garbage data due to the truncation of the file,
    /// but the garbage data is not actually valid JPEG data. So basically what happened
    /// was that the file got truncated mid-byte and the remaining bits are just random.
    #[test]
    fn test_truncated_with_bad_garbage_data() {
        verifydecode("truncbad");
    }
}


================================================
FILE: lib/src/structs/lepton_file_writer.rs
================================================
/*---------------------------------------------------------------------------------------------
 *  Copyright (c) Microsoft Corporation. All rights reserved.
 *  Licensed under the Apache License, Version 2.0. See LICENSE.txt in the project root for license information.
 *  This software incorporates material from third parties. See NOTICE.txt for details.
 *--------------------------------------------------------------------------------------------*/

use std::cmp;
use std::io::{BufRead, Cursor, Seek, Write};
use std::time::Instant;

use byteorder::{LittleEndian, WriteBytesExt};
use default_boxed::DefaultBoxed;
use log::info;

use crate::enabled_features::EnabledFeatures;
use crate::jpeg::block_based_image::BlockBasedImage;
use crate::jpeg::jpeg_header::JpegHeader;
use crate::jpeg::jpeg_read::read_jpeg_file;
use crate::jpeg::truncate_components::TruncateComponents;
use crate::lepton_error::{AddContext, ExitCode, Result, err_exit_code};
use crate::metrics::{CpuTimeMeasure, Metrics};
use crate::structs::lepton_encoder::lepton_encode_row_range;
use crate::structs::lepton_file_reader::decode_lepton;
use crate::structs::lepton_header::LeptonHeader;
use crate::structs::multiplexer::multiplex_write;
use crate::structs::quantization_tables::QuantizationTables;
use crate::structs::thread_handoff::ThreadHandoff;
use crate::{LeptonThreadPool, StreamPosition, consts::*};

/// Converts a JPEG stream into a Lepton stream.
///
/// The Lepton header is written first, followed by the output of the encoder
/// threads, followed by a little-endian `u32` holding the total number of bytes
/// written (the 4 bytes of that trailing size field included).
///
/// # Parameters
/// - `reader`: A buffered, seekable reader from which the JPEG data is read.
/// - `writer`: A position-aware writer to which the Lepton-encoded data is written.
/// - `enabled_features`: A set of toggles for enabling/disabling encoding features/restrictions.
/// - `thread_pool`: The thread pool used for parallel processing, for example
///   `DEFAULT_THREAD_POOL`.
///
/// # Returns
/// The metrics reported by the encoder threads.
pub fn encode_lepton<R: BufRead + Seek, W: Write + StreamPosition>(
    reader: &mut R,
    writer: &mut W,
    enabled_features: &EnabledFeatures,
    thread_pool: &dyn LeptonThreadPool,
) -> Result<Metrics> {
    // parse the whole JPEG up front; the header callback is a no-op here
    let (header, image_data) = read_jpeg(reader, enabled_features, |_jh, _ri| {})?;

    // the writer may not start at zero, so sizes are measured relative to this point
    let start_position = writer.position();

    header
        .write_lepton_header(writer, enabled_features)
        .context()?;

    let encoder_metrics = run_lepton_encoder_threads(
        &header.jpeg_header,
        &header.rinfo.truncate_components,
        writer,
        header.thread_handoff.as_slice(),
        image_data,
        enabled_features,
        thread_pool,
    )
    .context()?;

    // total size of the Lepton stream, counting the size field that is about to be appended
    let bytes_written_so_far = writer.position() - start_position;
    let final_file_size = bytes_written_so_far + 4;

    writer
        .write_u32::<LittleEndian>(final_file_size as u32)
        .context()?;

    Ok(encoder_metrics)
}

/// Encodes JPEG as compressed Lepton format, verifies roundtrip in buffer. Requires everything to be buffered
/// since we need to pass through the data multiple times.
///
/// # Parameters
/// - `input_data`: the complete JPEG file.
/// - `enabled_features`: feature toggles used for both the encode and the verification decode.
/// - `thread_pool`: thread pool used for both passes.
///
/// # Returns
/// The Lepton-encoded bytes together with the merged metrics of the encode and
/// the verification decode.
///
/// # Errors
/// - any error from `encode_lepton` or `decode_lepton`,
/// - `ExitCode::VerificationLengthMismatch` if the decoded output has a different length
///   than the input,
/// - `ExitCode::VerificationContentMismatch` if the lengths agree but the bytes differ.
pub fn encode_lepton_verify(
    input_data: &[u8],
    enabled_features: &EnabledFeatures,
    thread_pool: &dyn LeptonThreadPool,
) -> Result<(Vec<u8>, Metrics)> {
    let mut output_data = Vec::with_capacity(input_data.len());

    info!("compressing to Lepton format");

    let mut reader = Cursor::new(input_data);
    let mut writer = Cursor::new(&mut output_data);

    let mut metrics =
        encode_lepton(&mut reader, &mut writer, enabled_features, thread_pool).context()?;

    // decode and compare to original in order to ensure we encoded correctly

    let mut verify_buffer = Vec::with_capacity(input_data.len());
    let mut verify_reader = Cursor::new(&output_data[..]);

    info!("decompressing to verify contents");

    // the decoder only needs shared access to the features, so no copy is required
    metrics.merge_from(
        decode_lepton(
            &mut verify_reader,
            &mut verify_buffer,
            enabled_features,
            thread_pool,
        )
        .context()?,
    );

    // report a length mismatch separately so the two failure modes can be told apart
    if input_data.len() != verify_buffer.len() {
        return err_exit_code(
            ExitCode::VerificationLengthMismatch,
            format!(
                "ERROR mismatch input_len = {0}, decoded_len = {1}",
                input_data.len(),
                verify_buffer.len()
            ),
        );
    }

    if input_data != &verify_buffer[..] {
        return err_exit_code(
            ExitCode::VerificationContentMismatch,
            "ERROR mismatching data (but same size)",
        );
    }

    Ok((output_data, metrics))
}

/// reads JPEG and returns corresponding header and image vector. This encapsulate all
/// JPEG reading code, including baseline and progressive images.
///
/// The callback is called for each jpeg header that is parsed, which
/// is currently only used by the dump utility for debugging purposes.
///
/// # Parameters
/// - `reader`: seekable reader positioned at the start of the JPEG; all offsets stored in the
///   returned header are relative to that starting position.
/// - `enabled_features`: supplies `max_partitions` (upper bound on the number of thread
///   handoffs) and `max_jpeg_file_size` (upper bound on the accepted input size).
/// - `callback`: forwarded to `read_jpeg_file`.
///
/// # Errors
/// Returns any error from reading the stream or from `read_jpeg_file`, and
/// `ExitCode::UnsupportedJpeg` if the number of bytes consumed exceeds
/// `enabled_features.max_jpeg_file_size`.
///
/// # Panics
/// Panics if a segment offset or segment size does not fit the corresponding
/// `ThreadHandoff` field.
pub fn read_jpeg<R: BufRead + Seek>(
    reader: &mut R,
    enabled_features: &EnabledFeatures,
    callback: fn(&JpegHeader, &[u8]),
) -> Result<(Box<LeptonHeader>, Vec<BlockBasedImage>)> {
    let mut lp = LeptonHeader::default_boxed();

    let stream_start_position = reader.stream_position().context()?;

    get_git_revision(&mut lp);

    let (image_data, partitions, end_scan) = read_jpeg_file(
        reader,
        &mut lp.jpeg_header,
        &mut lp.rinfo,
        enabled_features,
        callback,
    )?;

    let mut thread_handoff = Vec::<ThreadHandoff>::with_capacity(partitions.len());

    for (i, (segment_offset, r)) in partitions.iter().enumerate() {
        // a segment runs up to the start of the next one, or to the end of the scan for the last
        let segment_end = if i + 1 == partitions.len() {
            end_scan
        } else {
            partitions[i + 1].0
        };
        let segment_size = segment_end - segment_offset;

        thread_handoff.push(ThreadHandoff {
            segment_offset_in_file: (*segment_offset - stream_start_position)
                .try_into()
                .unwrap(),
            luma_y_start: r.luma_y_start,
            luma_y_end: r.luma_y_end,
            overhang_byte: r.overhang_byte,
            num_overhang_bits: r.num_overhang_bits,
            last_dc: r.last_dc,
            segment_size: segment_size.try_into().unwrap(),
        });

        #[cfg(feature = "detailed_tracing")]
        info!(
            "Crystalize: s:{0} ls: {1} le: {2} o: {3} nb: {4}",
            thread_handoff[i].segment_offset_in_file,
            thread_handoff[i].luma_y_start,
            thread_handoff[i].luma_y_end,
            thread_handoff[i].overhang_byte,
            thread_handoff[i].num_overhang_bits
        );
    }

    let merged_handoffs = split_row_handoffs_to_threads(
        &thread_handoff[..],
        enabled_features.max_partitions as usize,
    );
    lp.thread_handoff = merged_handoffs;

    // Compare against the limit using the full 64-bit length. Narrowing to u32 first
    // would wrap for inputs over 4 GiB and could let them slip past the check.
    let jpeg_file_size = reader.stream_position().context()? - stream_start_position;

    if jpeg_file_size > u64::from(enabled_features.max_jpeg_file_size) {
        return err_exit_code(
            ExitCode::UnsupportedJpeg,
            "file is too large to encode, increase max_jpeg_file_size",
        );
    }

    // lossless: the value was just checked against a u32 limit
    lp.jpeg_file_size = jpeg_file_size as u32;

    Ok((lp, image_data))
}

/// Parses a string of ASCII decimal digits into a `u8`.
///
/// Written with a manual `while` loop so that it can stay a `const fn`.
/// An empty string yields 0.
///
/// The input is expected to consist solely of the characters `0`-`9` and to
/// denote a value no larger than 255; anything else overflows the `u8`
/// arithmetic (a panic when overflow checks are enabled).
const fn string_to_int(s: &str) -> u8 {
    let bytes = s.as_bytes();
    let mut result: u8 = 0;
    let mut i = 0;
    while i < bytes.len() {
        // Turn the ASCII character into its digit value *before* accumulating.
        // Adding the raw character code first overflows u8 for representable
        // inputs such as "210" or "255".
        let digit = bytes[i] - b'0';
        result = result * 10 + digit;
        i += 1;
    }
    result
}

// Version string produced at build time by the `git_version!` macro with the
// arguments listed below. `--dirty=M` adds an `M` suffix when files were modified,
// and the `fallback` value "0" is used when no version can be obtained
// (presumably when building outside of a git checkout).
static GIT_VERSION: &str = git_version::git_version!(
    args = ["--abbrev=40", "--always", "--dirty=M"],
    fallback = "0"
);

/// Returns the git version used to build this library as a static string.
///
/// This is the raw `GIT_VERSION` value, so it may carry the `M` suffix of a
/// modified working tree or be the fallback value `"0"`.
pub fn get_git_version() -> &'static str {
    GIT_VERSION
}

/// Returns the crate version packed into a single byte as
/// `major * 100 + minor * 10 + patch` (for example version 0.5.5 yields 55).
///
/// The components are taken from the `CARGO_PKG_VERSION_*` environment
/// variables at compile time.
pub fn get_cargo_pkg_version() -> u8 {
    let major = string_to_int(env!("CARGO_PKG_VERSION_MAJOR"));
    let minor = string_to_int(env!("CARGO_PKG_VERSION_MINOR"));
    let patch = string_to_int(env!("CARGO_PKG_VERSION_PATCH"));

    major * 100 + minor * 10 + patch
}

/// Stamps the header with information about the encoder that is producing the file:
/// the first 8 hex digits of the git revision (as 4 big-endian bytes) and the
/// packed crate version.
///
/// The revision prefix is only written when `GIT_VERSION` is a clean revision, i.e.
/// consists solely of hex digits. The `--dirty=M` suffix means that some files were
/// modified, so the version is not a clean git version and is not recorded.
fn get_git_revision(lp: &mut LeptonHeader) {
    let hex_str = GIT_VERSION;

    let is_clean_revision =
        !hex_str.is_empty() && hex_str.bytes().all(|b| b.is_ascii_hexdigit());

    if is_clean_revision {
        // GIT_VERSION is generated with --abbrev=40, and parsing all 40 hex digits as a
        // u32 always overflows, so only the leading 8 digits (32 bits) are parsed.
        // Slicing by byte index is safe because every byte is an ASCII hex digit.
        let prefix = &hex_str[..hex_str.len().min(8)];

        if let Ok(v) = u32::from_str_radix(prefix, 16) {
            lp.git_revision_prefix = v.to_be_bytes();
        }
    }

    lp.encoder_version = get_cargo_pkg_version();
}

/// Runs one encoding job per thread handoff through `multiplex_write` and returns
/// the metrics of all jobs merged together (which includes the CPU time recorded
/// by each worker).
///
/// # Parameters
/// - `jpeg_header`: used to construct the quantization tables.
/// - `colldata`: truncation information, cloned into the worker closure.
/// - `writer`: destination handed to `multiplex_write`.
/// - `thread_handoffs`: one entry per job; supplies the luma row range of each job.
/// - `image_data`: the image coefficients, moved into the worker closure.
/// - `features`: feature toggles, cloned into the worker closure.
/// - `thread_pool`: pool handed to `multiplex_write`.
///
/// # Panics
/// Panics if there are more handoffs than `MAX_THREADS_SUPPORTED_BY_LEPTON_FORMAT`.
fn run_lepton_encoder_threads<W: Write>(
    jpeg_header: &JpegHeader,
    colldata: &TruncateComponents,
    writer: &mut W,
    thread_handoffs: &[ThreadHandoff],
    image_data: Vec<BlockBasedImage>,
    features: &EnabledFeatures,
    thread_pool: &dyn LeptonThreadPool,
) -> Result<Metrics> {
    let wall_time = Instant::now();

    // The number of jobs must be at most MAX_THREADS and fit in 4 bits for serialization.
    assert!(
        thread_handoffs.len() <= MAX_THREADS_SUPPORTED_BY_LEPTON_FORMAT,
        "Too many thread handoffs"
    );

    let quantization_tables = QuantizationTables::construct_quantization_tables(jpeg_header)?;

    // owned copies that are moved into the worker closure below
    let owned_colldata = colldata.clone();
    let owned_handoffs = thread_handoffs.to_vec();
    let owned_features = features.clone();

    let job_count = owned_handoffs.len();
    let max_processor_threads = owned_features.max_processor_threads as usize;

    let thread_results = multiplex_write(
        writer,
        job_count,
        max_processor_threads,
        thread_pool,
        move |thread_writer, thread_id| {
            let cpu_time = CpuTimeMeasure::new();

            let handoff = &owned_handoffs[thread_id];
            let is_last_thread = thread_id == owned_handoffs.len() - 1;

            let mut range_metrics = lepton_encode_row_range(
                &quantization_tables,
                &image_data,
                thread_writer,
                thread_id as i32,
                &owned_colldata,
                handoff.luma_y_start,
                handoff.luma_y_end,
                is_last_thread,
                true,
                &owned_features,
            )
            .context()?;

            range_metrics.record_cpu_worker_time(cpu_time.elapsed());

            Ok(range_metrics)
        },
    )?;

    // fold the per-job metrics into a single result
    let mut merged_metrics = Metrics::default();
    for job_metrics in thread_results {
        merged_metrics.merge_from(job_metrics);
    }

    info!(
        "worker threads {0}ms of CPU time in {1}ms of wall time",
        merged_metrics.get_cpu_time_worker_time().as_millis(),
        wall_time.elapsed().as_millis()
    );

    Ok(merged_metrics)
}

/// Merges the per-row handoffs into one handoff per encoding thread.
///
/// The number of threads is chosen by `get_number_of_threads_for_encoding` from the
/// number of rows, the combined segment size of the whole range and
/// `max_threads_to_use`. With a single thread the whole range is combined into one
/// handoff; otherwise the rows are cut into contiguous ranges of (nearly) equal
/// length and each range is combined into one handoff.
///
/// # Panics
/// Panics if `thread_handoffs` is empty.
fn split_row_handoffs_to_threads(
    thread_handoffs: &[ThreadHandoff],
    max_threads_to_use: usize,
) -> Vec<ThreadHandoff> {
    let last_row = thread_handoffs.last().unwrap();
    let first_row = thread_handoffs.first().unwrap();

    let framebuffer_byte_size =
        ThreadHandoff::get_combine_thread_range_segment_size(first_row, last_row);

    // determine how many threads we need for compression
    let num_rows = thread_handoffs.len();
    let num_threads =
        get_number_of_threads_for_encoding(num_rows, framebuffer_byte_size, max_threads_to_use);

    info!("Number of threads: {0}", num_threads);

    let mut selected_splits = Vec::with_capacity(num_threads);

    if num_threads == 1 {
        // Single thread execution - no split, run on the whole range
        selected_splits.push(ThreadHandoff::combine_thread_ranges(first_row, last_row));
        return selected_splits;
    }

    // Note: rows_per_thread is a floating point value to ensure equal splits
    let rows_per_thread = num_rows as f32 / num_threads as f32;

    assert!(rows_per_thread >= 1f32, "rowsPerThread >= 1");

    // index of the last row of every range except the final one
    let split_indices: Vec<usize> = (0..num_threads - 1)
        .map(|i| (rows_per_thread * (i as f32 + 1f32)) as usize)
        .collect();

    for thread_index in 0..num_threads {
        // each range starts right after the previous split point
        let beginning_of_range = match thread_index {
            0 => 0,
            _ => split_indices[thread_index - 1] + 1,
        };

        // the final range always extends to the last row
        let end_of_range = if thread_index == num_threads - 1 {
            num_rows - 1
        } else {
            split_indices[thread_index]
        };
        assert!(end_of_range < num_rows, "endOfRange < numRows");

        selected_splits.push(ThreadHandoff::combine_thread_ranges(
            &thread_handoffs[beginning_of_range],
            &thread_handoffs[end_of_range],
        ));
    }

    selected_splits
}

fn get_number_of_threads_for_encoding(
    num_rows: usize,
    framebuffer_byte_size: usize,
    max_threads_to_use: usize,
) -> usize {
    let mut num_threads = cmp::min(max_threads_to_use, MAX_THREADS_SUPPORTED_BY_LEPTON_FORMAT);

    if num_rows / 2 < num_threads {
        num_threads = cmp::max(num_rows / 2, 1);
    }

    if framebuffer_byte_size < SMALL_FILE_BYTES_PER_ENCDOING_THREAD {
        num_threads = 1;
    } else if framebuffer_byte_size < SMALL_FILE_BYTES_PER_ENCDOING_THREAD * 2 {
        num_threads = cmp::min(2, num_threads);
    } else if framebuffer_byte_size < SMALL_FILE_BYTES_PER_ENCDOING_THREAD * 4 {
        num_threads = cmp::min(4, num_threads);
    }

    return num_threads;
}

// Unit tests for the Lepton file writer: git revision stamping, error handling for
// undersized output buffers and an encode/decode round trip.
#[cfg(test)]
mod tests {
    use super::*;

    use crate::{DEFAULT_THREAD_POOL, helpers::read_file};

    /// smoke test: stamps a default header and prints the resulting revision prefix
    /// (nothing is asserted)
    #[test]
    fn test_get_git_revision() {
        let mut lh = LeptonHeader::default_boxed();
        get_git_revision(&mut lh);

        println!("{:x?}", lh.git_revision_prefix);
    }

    /// ensure we fail if the output buffer is too small
    #[test]
    fn test_too_small_output() {
        let original = read_file("slrcity", ".jpg");

        // fixed-size output slice that is only half as large as the JPEG input
        let mut output = Vec::new();
        output.resize(original.len() / 2, 0u8);

        let r = encode_lepton(
            &mut Cursor::new(&original),
            &mut Cursor::new(&mut output[..]),
            &EnabledFeatures::compat_lepton_vector_write(),
            &DEFAULT_THREAD_POOL,
        );

        assert!(r.is_err() && r.err().unwrap().exit_code() == ExitCode::OsError);
    }

    /// round-trips the `slrcity` JPEG through encode and decode
    #[test]
    fn test_slrcity() {
        test_file("slrcity")
    }

    /// Shared helper: encodes `<filename>.jpg` with `max_partitions` set to 2, decodes
    /// the result again and asserts that the original bytes are reproduced exactly.
    fn test_file(filename: &str) {
        let original = read_file(filename, ".jpg");

        let mut enabled_features = EnabledFeatures::compat_lepton_vector_write();
        enabled_features.max_partitions = 2;

        let mut output = Vec::new();

        let _ = encode_lepton(
            &mut Cursor::new(&original),
            &mut Cursor::new(&mut output),
            &enabled_features,
            &DEFAULT_THREAD_POOL,
        )
        .unwrap();

        println!(
            "Original size: {0}, compressed size: {1}",
            original.len(),
            output.len()
        );

        let mut recreate = Vec::new();

        decode_lepton(
            &mut Cursor::new(&output),
            &mut recreate,
            &enabled_features,
            &DEFAULT_THREAD_POOL,
        )
        .unwrap();

        // compare lengths first so that a size mismatch is reported with actual numbers
        assert_eq!(original.len(), recreate.len());
        assert!(original == recreate);
    }
}


================================================
FILE: lib/src/structs/lepton_header.rs
================================================
use std::cmp::min;
use std::io::{Cursor, ErrorKind, Read, Seek, Write};

use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt};
use default_boxed::DefaultBoxed;
use flate2::Compression;
use flate2::read::ZlibDecoder;
use flate2::write::ZlibEncoder;

use crate::EnabledFeatures;
use crate::consts::*;
use crate::helpers::buffer_prefix_matches_marker;
use crate::jpeg::jpeg_header::{JpegHeader, ReconstructionInfo};
use crate::lepton_error::{AddContext, ExitCode, Result, err_exit_code};
use crate::structs::thread_handoff::ThreadHandoff;

/// Size in bytes of the fixed-length header buffer that
/// `LeptonHeader::read_lepton_fixed_header` parses.
pub const FIXED_HEADER_SIZE: usize = 28;

/// In-memory representation of the header of a Lepton file: the JPEG header and
/// reconstruction information of the original image, the per-thread handoffs and
/// some metadata about the encoder that wrote the file.
#[derive(Debug, DefaultBoxed)]
pub struct LeptonHeader {
    /// how far we have read into the raw header, since the header is divided
    /// into multiple chunks for each scan. For example, a progressive image
    /// would start with the jpeg image segments, followed by a SOS (start of scan)
    /// after which comes the encoded jpeg coefficients, and once thats over
    /// we get another header segment until the next SOS, etc
    pub raw_jpeg_header_read_index: usize,

    /// handoff information for each partition of the image that is coded independently
    pub thread_handoff: Vec<ThreadHandoff>,

    /// the parsed JPEG header of the original image
    pub jpeg_header: JpegHeader,

    /// additional information needed to reconstruct the original JPEG file
    pub rinfo: ReconstructionInfo,

    /// size in bytes of the original JPEG file
    pub jpeg_file_size: u32,

    /// on decompression, uncompressed lepton header size. This is only
    /// saved by this encoder for historical reasons. It is not used by
    /// the decoder.
    pub uncompressed_lepton_header_size: Option<u32>,

    /// the git revision of the encoder that created this file (first 8 hex characters)
    pub git_revision_prefix: [u8; 4],

    /// writer version
    pub encoder_version: u8,
}

impl LeptonHeader {
    /// Reports whether this file was produced by an encoder version that handled
    /// truncation and corruption incorrectly.
    ///
    /// Correct behavior: truncate the generated JPEG data to the file size minus the
    /// garbage data, then write out the garbage data.
    ///
    /// Incorrect behavior (affected versions): write out the JPEG data, append the
    /// garbage data, and only then truncate.
    pub fn bad_truncation_version(&self) -> bool {
        // the only encoder version known to have the incorrect truncation behavior
        const BAD_TRUNCATION_ENCODER_VERSION: u8 = 55;

        self.encoder_version == BAD_TRUNCATION_ENCODER_VERSION
    }

    /// Parses the fixed-size prefix of a Lepton file and returns the size in bytes
    /// of the zlib-compressed header that follows it.
    ///
    /// Layout of `header` as interpreted here:
    /// - `[0..2]`   magic bytes, compared against `LEPTON_FILE_HEADER`
    /// - `[2]`      format version, must equal `LEPTON_VERSION`
    /// - `[3]`      JPEG type (baseline or progressive)
    /// - `[4]`      number of streams/threads (ignored)
    /// - `[5..8]`   reserved (ignored)
    /// - `[8..20]`  originally the git revision. If it starts with `MS` it holds:
    ///   uncompressed header size (`[10..14]`, LE u32), flags (`[14]`),
    ///   encoder version (`[15]`) and git revision prefix (`[16..20]`).
    ///   Otherwise the first 4 bytes are taken as the git revision prefix.
    /// - `[20..24]` size of the original JPEG (LE u32)
    /// - `[24..28]` size of the compressed header (LE u32)
    ///
    /// `enabled_features` is only modified when the flags byte has bit `0x80` set,
    /// in which case the 16 bit DC estimate / advanced predict settings are taken
    /// from the file.
    ///
    /// # Errors
    /// - `ExitCode::BadLeptonFile` if the magic bytes or the JPEG type are not recognized
    /// - `ExitCode::VersionUnsupported` if the format version does not match
    pub fn read_lepton_fixed_header(
        &mut self,
        header: &[u8; FIXED_HEADER_SIZE],
        enabled_features: &mut EnabledFeatures,
    ) -> Result<usize> {
        if header[0..2] != LEPTON_FILE_HEADER[0..2] {
            return err_exit_code(ExitCode::BadLeptonFile, "header doesn't match");
        }
        if header[2] != LEPTON_VERSION {
            // report the byte that was actually checked (the version lives at index 2)
            return err_exit_code(
                ExitCode::VersionUnsupported,
                format!("incompatible file with version {0}", header[2]),
            );
        }
        if header[3] != LEPTON_HEADER_BASELINE_JPEG_TYPE[0]
            && header[3] != LEPTON_HEADER_PROGRESSIVE_JPEG_TYPE[0]
        {
            // report the byte that was actually checked (the file type lives at index 3)
            return err_exit_code(
                ExitCode::BadLeptonFile,
                format!("Unknown filetype in header {0}", header[3]),
            );
        }

        // header[4] is the number of streams/threads, but we don't care about that
        // header[5..8] is reserved

        // header[8..20] 12 bytes were the GIT revision, but for historical reasons we
        // also use this space to store the uncompressed lepton header size plus some
        // flags to detect the SIMD flavor that was used to encode, since
        // previously the encoder would generate different incompatible files depending on
        // whether SIMD or scalar was selected by the build options.
        if header[8] == b'M' && header[9] == b'S' {
            self.uncompressed_lepton_header_size =
                Some(u32::from_le_bytes(header[10..14].try_into().unwrap()));

            // read the flag bits to know how we should decode this file
            // (bit 0x80 marks the flags as valid, otherwise the caller's settings are kept)
            let flags = header[14];
            if (flags & 0x80) != 0 {
                enabled_features.use_16bit_dc_estimate = (flags & 0x01) != 0;
                enabled_features.use_16bit_adv_predict = (flags & 0x02) != 0;
            }

            self.encoder_version = header[15];
            self.git_revision_prefix = header[16..20].try_into().unwrap();
        } else {
            // take first bytes for git revision prefix
            self.git_revision_prefix = header[8..12].try_into().unwrap();
        }

        // total size of original JPEG
        self.jpeg_file_size = u32::from_le_bytes(header[20..24].try_into().unwrap());

        let compressed_header_size =
            u32::from_le_bytes(header[24..28].try_into().unwrap()) as usize;

        Ok(compressed_header_size)
    }

    /// Reads and parses the zlib-compressed header section that follows the fixed
    /// header of a Lepton file.
    ///
    /// At most `compressed_header_size` bytes are consumed from `reader`. On success
    /// the raw JPEG header is stored in `self.rinfo.raw_jpeg_header`, its first
    /// segment has been parsed into `self.jpeg_header`, and the thread handoffs have
    /// been clamped to the (possibly truncated) image height.
    ///
    /// # Errors
    /// Returns `ExitCode::BadLeptonFile` if the compressed header or the JPEG size
    /// exceed `enabled_features.max_jpeg_file_size`, or if the header does not
    /// contain any thread handoff data. Errors from decompressing or parsing the
    /// header are propagated.
    pub fn read_compressed_lepton_header<R: Read>(
        &mut self,
        reader: &mut R,
        enabled_features: &mut EnabledFeatures,
        compressed_header_size: usize,
    ) -> Result<()> {
        if compressed_header_size > enabled_features.max_jpeg_file_size as usize {
            return err_exit_code(ExitCode::BadLeptonFile, "Too big compressed header");
        }
        if self.jpeg_file_size > enabled_features.max_jpeg_file_size {
            return err_exit_code(
                ExitCode::BadLeptonFile,
                format!(
                    "Only support images < {} megs",
                    enabled_features.max_jpeg_file_size / (1024 * 1024)
                ),
            );
        }

        // limit reading to the compressed header
        let mut compressed_reader = reader.take(compressed_header_size as u64);

        self.rinfo.raw_jpeg_header = self
            .read_lepton_compressed_header(&mut compressed_reader)
            .context()?;

        self.raw_jpeg_header_read_index = 0;

        {
            let mut header_data_cursor = Cursor::new(&self.rinfo.raw_jpeg_header[..]);
            self.jpeg_header
                .parse(&mut header_data_cursor, &enabled_features)
                .context()?;
            self.raw_jpeg_header_read_index = header_data_cursor.position() as usize;
        }

        self.rinfo.truncate_components.init(&self.jpeg_header);

        if self.rinfo.early_eof_encountered {
            self.rinfo
                .truncate_components
                .set_truncation_bounds(&self.jpeg_header, self.rinfo.max_dpos);
        }

        // The handoffs come from the (untrusted) file. Without a luma split section
        // the list is empty and indexing the last entry below would panic.
        let num_threads = self.thread_handoff.len();
        if num_threads == 0 {
            return err_exit_code(
                ExitCode::BadLeptonFile,
                "no thread handoff data found in header",
            );
        }

        // luma_y_end of the last thread is not serialized/deserialized, fill it here
        let max_luma = self.rinfo.truncate_components.get_block_height(0);

        for handoff in self.thread_handoff.iter_mut() {
            handoff.luma_y_start = min(handoff.luma_y_start, max_luma);
            handoff.luma_y_end = min(handoff.luma_y_end, max_luma);
        }
        self.thread_handoff[num_threads - 1].luma_y_end = max_luma;

        // if the last segment was too big to fit with the garbage data taken into account, shorten it
        // (a bit of broken logic in the encoder, but can't change it without breaking the file format)
        if self.rinfo.early_eof_encountered {
            // NOTE(review): these subtractions are unchecked, so inconsistent sizes in
            // the file can underflow (panic when overflow checks are enabled, wrap
            // otherwise). Left as-is because changing it could alter how existing
            // truncated files decode - confirm before tightening.
            let mut max_last_segment_size = self.jpeg_file_size
                - u32::try_from(self.rinfo.garbage_data.len())?
                - u32::try_from(self.raw_jpeg_header_read_index)?
                - u32::try_from(SOI.len())?;

            // subtract the segment sizes of all the previous segments (except for the last)
            for handoff in &self.thread_handoff[..num_threads - 1] {
                max_last_segment_size -= handoff.segment_size;
            }

            let last = &mut self.thread_handoff[num_threads - 1];

            if last.segment_size > max_last_segment_size {
                // re-adjust the last segment size
                last.segment_size = max_last_segment_size;
            }
        }

        Ok(())
    }

    /// Parses the next segment of `rinfo.raw_jpeg_header` into `jpeg_header`,
    /// starting at `raw_jpeg_header_read_index`, and moves the read index past
    /// the bytes that were consumed.
    ///
    /// Returns whatever `JpegHeader::parse` reports for the segment.
    pub fn advance_next_header_segment(
        &mut self,
        enabled_features: &EnabledFeatures,
    ) -> Result<bool> {
        // only look at the part of the raw header that has not been parsed yet
        let unread = &self.rinfo.raw_jpeg_header[self.raw_jpeg_header_read_index..];
        let mut segment_cursor = Cursor::new(unread);

        let parsed = self
            .jpeg_header
            .parse(&mut segment_cursor, enabled_features)
            .context()?;

        // the cursor position is relative to the start of the unread part
        let consumed = segment_cursor.stream_position()? as usize;
        self.raw_jpeg_header_read_index += consumed;

        Ok(parsed)
    }

    /// helper for read_compressed_lepton_header. Uncompresses and parses the contents of the
    /// compressed header, storing the recovery information in `self.rinfo` and the
    /// thread handoffs in `self.thread_handoff`. Returns the raw JPEG header.
    ///
    /// The section starts with the HDR marker, a little-endian u32 length and the raw
    /// JPEG header bytes, followed by any number of marker-prefixed recovery records
    /// until the end of the zlib stream.
    ///
    /// All length fields come from the (untrusted) file, so sized payloads are read
    /// via `read_sized_payload`, which never allocates more than the stream delivers.
    ///
    /// # Errors
    /// Returns `ExitCode::BadLeptonFile` if the HDR marker is missing, an unknown
    /// marker is found, or data remains after the records. I/O and decompression
    /// errors (including a payload shorter than its declared length) are propagated.
    fn read_lepton_compressed_header<R: Read>(&mut self, src: &mut R) -> Result<Vec<u8>> {
        let mut header_reader = ZlibDecoder::new(src);

        let mut hdr_buf: [u8; 3] = [0; 3];
        header_reader.read_exact(&mut hdr_buf)?;

        if !buffer_prefix_matches_marker(hdr_buf, LEPTON_HEADER_MARKER) {
            return err_exit_code(ExitCode::BadLeptonFile, "HDR marker not found");
        }

        let hdrs = header_reader.read_u32::<LittleEndian>()? as usize;
        let hdr_data = Self::read_sized_payload(&mut header_reader, hdrs)?;

        if self.rinfo.garbage_data.is_empty() {
            // if we don't have any garbage, assume 0xFF 0xD9 EOI (end of image marker)

            // Kind of broken logic since this assumes a EOI even if the file was
            // truncated at the EOI, but this is what the file format is.
            // In this case, this marker will be chopped off later by the
            // overall JPEG file size limit, so this is not a correctness problem.
            self.rinfo.garbage_data.extend(EOI);
        }

        // beginning here: recovery information (needed for exact JPEG recovery)
        // read further recovery information if any
        loop {
            let mut current_lepton_marker = [0u8; 3];
            if let Err(e) = header_reader.read_exact(&mut current_lepton_marker) {
                if e.kind() == ErrorKind::UnexpectedEof {
                    // end of the compressed stream: no more records
                    break;
                }
                return Err(e.into());
            }

            if buffer_prefix_matches_marker(current_lepton_marker, LEPTON_HEADER_PAD_MARKER) {
                self.rinfo.pad_bit = Some(header_reader.read_u8()?);
            } else if buffer_prefix_matches_marker(
                current_lepton_marker,
                LEPTON_HEADER_JPG_RESTARTS_MARKER,
            ) {
                // CRS marker
                self.rinfo.rst_cnt_set = true;
                let rst_count = header_reader.read_u32::<LittleEndian>()?;

                // entries are pushed one at a time, so a bogus count fails at the end
                // of the stream instead of causing a large up-front allocation
                for _i in 0..rst_count {
                    self.rinfo
                        .rst_cnt
                        .push(header_reader.read_u32::<LittleEndian>()?);
                }
            } else if buffer_prefix_matches_marker(
                current_lepton_marker,
                LEPTON_HEADER_LUMA_SPLIT_MARKER,
            ) {
                // HH markup (the third marker byte is passed on to the deserializer)
                let mut thread_handoffs =
                    ThreadHandoff::deserialize(current_lepton_marker[2], &mut header_reader)?;

                self.thread_handoff.append(&mut thread_handoffs);
            } else if buffer_prefix_matches_marker(
                current_lepton_marker,
                LEPTON_HEADER_JPG_RESTART_ERRORS_MARKER,
            ) {
                // Marker FRS
                // read number of false set RST markers per scan from file
                let rst_err_count = header_reader.read_u32::<LittleEndian>()? as usize;

                let mut rst_err_data = Self::read_sized_payload(&mut header_reader, rst_err_count)?;

                self.rinfo.rst_err.append(&mut rst_err_data);
            } else if buffer_prefix_matches_marker(
                current_lepton_marker,
                LEPTON_HEADER_GARBAGE_MARKER,
            ) {
                // GRB marker
                // read garbage (data after end of JPG) from file
                let garbage_size = header_reader.read_u32::<LittleEndian>()? as usize;

                self.rinfo.garbage_data =
                    Self::read_sized_payload(&mut header_reader, garbage_size)?;
            } else if buffer_prefix_matches_marker(
                current_lepton_marker,
                LEPTON_HEADER_EARLY_EOF_MARKER,
            ) {
                self.rinfo.max_cmp = header_reader.read_u32::<LittleEndian>()?;
                self.rinfo.max_bpos = header_reader.read_u32::<LittleEndian>()?;
                self.rinfo.max_sah = u8::try_from(header_reader.read_u32::<LittleEndian>()?)?;
                self.rinfo.max_dpos[0] = header_reader.read_u32::<LittleEndian>()?;
                self.rinfo.max_dpos[1] = header_reader.read_u32::<LittleEndian>()?;
                self.rinfo.max_dpos[2] = header_reader.read_u32::<LittleEndian>()?;
                self.rinfo.max_dpos[3] = header_reader.read_u32::<LittleEndian>()?;
                self.rinfo.early_eof_encountered = true;
            } else {
                return err_exit_code(ExitCode::BadLeptonFile, "unknown data found");
            }
        }

        // shouldn't be any more data; report it as a bad file rather than panicking
        // since this is driven by file contents
        let mut remaining_buf = Vec::new();
        let remaining = header_reader.read_to_end(&mut remaining_buf)?;
        if remaining != 0 {
            return err_exit_code(
                ExitCode::BadLeptonFile,
                "unexpected data after end of lepton header",
            );
        }

        Ok(hdr_data)
    }

    /// Reads exactly `len` bytes from `reader` into a new vector.
    ///
    /// `len` is taken from the file and cannot be trusted, so the buffer is grown as
    /// data arrives instead of being allocated up front. A declared length larger
    /// than the remaining stream therefore fails with `UnexpectedEof` without first
    /// allocating `len` bytes.
    fn read_sized_payload<R: Read>(reader: &mut R, len: usize) -> Result<Vec<u8>> {
        let mut data = Vec::new();
        let read = reader.by_ref().take(len as u64).read_to_end(&mut data)?;

        if read != len {
            // same error kind that `read_exact` reports for a short read
            return Err(std::io::Error::new(
                ErrorKind::UnexpectedEof,
                "failed to fill whole buffer",
            )
            .into());
        }

        Ok(data)
    }

    pub fn write_lepton_header<W: Write>(
        &self,
        writer: &mut W,
        enabled_features: &EnabledFeatures,
    ) -> Result<()> {
        let mut lepton_header = Vec::<u8>::new();

        {
            // Most of the Lepton header data that is compressed before storage
            // The data contains recovery information (needed for exact JPEG recovery)
            let mut mrw = Cursor::new(&mut lepton_header);

            self.write_lepton_jpeg_header(&mut mrw)?;
            self.write_lepton_pad_bit(&mut mrw)?;
            self.write_lepton_luma_splits(&mut mrw)?;
            self.write_lepton_jpeg_restarts_if_needed(&mut mrw)?;
            self.write_lepton_jpeg_restart_errors_if_needed(&mut mrw)?;
            self.write_lepton_early_eof_truncation_data_if_needed(&mut mrw)?;
            self.write_lepton_jpeg_garbage_if_needed(&mut mrw, false)?;
        }

        let mut compressed_header = Vec::<u8>::new(); // we collect a zlib compressed version of the header here
        {
            let mut c = Cursor::new(&mut compressed_header);
            let mut encoder = ZlibEncoder::new(&mut c, Compression::default());

            encoder.write_all(&lepton_header[..]).context()?;
            encoder.finish().context()?;
        }

        writer.write_all(&LEPTON_FILE_HEADER)?;
        writer.write_u8(LEPTON_VERSION)?;

        if self.jpeg_header.jpeg_type == JpegType::Progressive {
            writer.write_all(&LEPTON_HEADER_PROGRESSIVE_JPEG_TYPE)?;
        } else {
            writer.write_all(&LEPTON_HEADER_BASELINE_JPEG_TYPE)?;
        }

        writer.write_u8(self.thread_handoff.len() as u8)?;
        writer.write_all(&[0; 3])?;

        // Original lepton format reserves 12 bytes for git revision. We use this space for additional info
        // to store information about the version that wrote this.
        writer.write_u8('M' as u8)?;
        writer.write_u8('S' as u8)?;

        // write the uncompressed lepton header size
        // (historical, used by a previous version of the decoder)
        writer.write_u32::<LittleEndian>(lepton_header.len() as u32)?;

        // write the flags that were used to encode this file
        writer.write_u8(
            0x80 | if enabled_features.use_16bit_dc_estimate {
                1
            } else {
                0
            } | if enabled_features.use_16bit_adv_predict {
                2
            } else {
                0
            },
        )?;

        // version of the encoder
        writer.write_u8(self.encoder_version)?;

        // write the git revision prefix that was used to write this
        writer.write_all(&self.git_revision_prefix)?;

        writer.write_u32::<LittleEndian>(self.jpeg_file_size)?;
        writer.write_u32::<LittleEndian>(compressed_header.len() as u32)?;
        writer.write_all(&compressed_header[..])?;

        writer.write_all(&LEPTON_HEADER_COMPLETION_MARKER)?;

        Ok(())
    }

    /// Writes the raw JPEG header record: the HDR marker, the header length as a
    /// little-endian u32, then the header bytes themselves.
    fn write_lepton_jpeg_header<W: Write>(&self, mrw: &mut W) -> Result<()> {
        let raw_header = &self.rinfo.raw_jpeg_header;

        // marker: "HDR" + [size of header]
        mrw.write_all(&LEPTON_HEADER_MARKER)?;
        mrw.write_u32::<LittleEndian>(raw_header.len() as u32)?;

        // data: data from header
        mrw.write_all(&raw_header[..])?;

        Ok(())
    }

    /// Writes the pad bit record: the pad marker followed by one byte holding the
    /// pad bit, or 0 when no pad bit has been recorded.
    fn write_lepton_pad_bit<W: Write>(&self, mrw: &mut W) -> Result<()> {
        let pad_bit = self.rinfo.pad_bit.unwrap_or(0);

        // marker: P0D, data: the pad bit
        mrw.write_all(&LEPTON_HEADER_PAD_MARKER)?;
        mrw.write_u8(pad_bit)?;

        Ok(())
    }

    /// Writes the luma split record: the HH marker followed by the serialized
    /// thread handoffs.
    fn write_lepton_luma_splits<W: Write>(&self, mrw: &mut W) -> Result<()> {
        let handoffs = &self.thread_handoff;

        // luma splits markup HH comes first ...
        mrw.write_all(&LEPTON_HEADER_LUMA_SPLIT_MARKER)?;

        // ... then the serialized luma splits
        ThreadHandoff::serialize(handoffs, mrw)?;

        Ok(())
    }

    /// Writes the restart count record (CRS marker, number of entries, then each
    /// entry as a little-endian u32). Nothing is written when there are no entries.
    fn write_lepton_jpeg_restarts_if_needed<W: Write>(&self, mrw: &mut W) -> Result<()> {
        let restart_counts = &self.rinfo.rst_cnt;
        if restart_counts.is_empty() {
            return Ok(());
        }

        // marker: CRS
        mrw.write_all(&LEPTON_HEADER_JPG_RESTARTS_MARKER)?;
        mrw.write_u32::<LittleEndian>(restart_counts.len() as u32)?;

        for &count in restart_counts.iter() {
            mrw.write_u32::<LittleEndian>(count)?;
        }

        Ok(())
    }

    /// Writes the record holding the number of false set RST markers per scan
    /// (FRS marker, number of bytes, then the bytes). Nothing is written when no
    /// such data is available.
    fn write_lepton_jpeg_restart_errors_if_needed<W: Write>(&self, mrw: &mut W) -> Result<()> {
        let restart_errors = &self.rinfo.rst_err;
        if restart_errors.is_empty() {
            return Ok(());
        }

        // marker: "FRS" + [number of scans]
        mrw.write_all(&LEPTON_HEADER_JPG_RESTART_ERRORS_MARKER)?;
        mrw.write_u32::<LittleEndian>(restart_errors.len() as u32)?;
        mrw.write_all(&restart_errors[..])?;

        Ok(())
    }

    fn write_lepton_early_eof_truncation_data_if_needed<W: Write>(
        &self,
        mrw: &mut W,
    ) -> Result<()> {
        if self.rinfo.early_eof_encountered {
            // EEE marker
            mrw.write_all(&LEPTON_HEADER_EARLY_EOF_MARKER)?;

            mrw.write_u32::<LittleEndian>(self.rinfo.max_cmp)?;
            mrw.write_u32::<LittleEndian>(self.rinfo.max_bpos)?;
            mrw.write_u32::<LittleEndian>(u32::from(self.rinfo.max_sah))?;
            mrw.write_u32::<LittleEndian>(self.rinfo.max_dpos[0])?;
            mrw.write_u32::<LittleEndian>(self.rinfo.max_dpos[1])?;
            mrw.write_u32::<LittleEndian>(self.rinfo.max_dpos[2])?;
            mrw.write_u32::<LittleEndian>(self.rinfo.max_dpos[3])?;
        }

        Ok(())
    }

    /// Writes the garbage data record (marker, length as little-endian u32, then
    /// the bytes). `prefix_garbage` selects the prefix garbage marker instead of
    /// the regular garbage marker. Nothing is written when there is no garbage data.
    fn write_lepton_jpeg_garbage_if_needed<W: Write>(
        &self,
        mrw: &mut W,
        prefix_garbage: bool,
    ) -> Result<()> {
        let garbage = &self.rinfo.garbage_data;
        if garbage.is_empty() {
            return Ok(());
        }

        // marker: "PGR/GRB" + [size of garbage]
        let marker: &[u8] = if prefix_garbage {
            &LEPTON_HEADER_PREFIX_GARBAGE_MARKER
        } else {
            &LEPTON_HEADER_GARBAGE_MARKER
        };
        mrw.write_all(marker)?;

        mrw.write_u32::<LittleEndian>(garbage.len() as u32)?;
        mrw.write_all(&garbage[..])?;

        Ok(())
    }
}

#[test]
fn test_roundtrip_fixed_header() {
    // (flags byte, expected use_16bit_dc_estimate, expected use_16bit_adv_predict)
    // A flags byte without bit 0x80 leaves the reader's own settings untouched.
    let known_flag_cases = [
        (0, true, true),
        (128, false, false),
        (129, true, false),
        (130, false, true),
        (131, true, true),
    ];
    for (flags_byte, expect_dc_16_bit, expect_adv_16_bit) in known_flag_cases {
        // test known good version of the header so we can detect breaks
        let fixed_buffer = [
            207, 132, 1, 90, 1, 0, 0, 0, 77, 83, 140, 0, 0, 0, flags_byte, 187, 18, 52, 86, 120,
            123, 0, 0, 0, 122, 0, 0, 0,
        ];

        let mut parsed_features = EnabledFeatures::compat_lepton_vector_read();
        let mut parsed_header = LeptonHeader::default_boxed();

        let compressed_header_size = parsed_header
            .read_lepton_fixed_header(&fixed_buffer, &mut parsed_features)
            .unwrap();

        assert_eq!(compressed_header_size, 122);
        assert_eq!(parsed_features.use_16bit_dc_estimate, expect_dc_16_bit);
        assert_eq!(parsed_features.use_16bit_adv_predict, expect_adv_16_bit);
    }

    // test read/write all combinations of the flags
    let flag_combinations = [(false, false), (true, false), (false, true), (true, true)];
    for (dc_16_bit, adv_16_bit) in flag_combinations {
        let mut written = make_minimal_lepton_header();
        written.git_revision_prefix = [0x12, 0x34, 0x56, 0x78];
        written.encoder_version = 0xBB;

        let mut write_features = EnabledFeatures::compat_lepton_vector_write();
        write_features.use_16bit_dc_estimate = dc_16_bit;
        write_features.use_16bit_adv_predict = adv_16_bit;

        let (read_back, read_features) = verify_roundtrip(&written, &write_features);

        assert_eq!(read_features.use_16bit_dc_estimate, dc_16_bit);
        assert_eq!(read_features.use_16bit_adv_predict, adv_16_bit);
        assert_eq!(read_back.git_revision_prefix, written.git_revision_prefix);
        assert_eq!(read_back.encoder_version, written.encoder_version);
    }
}

// Serializes a minimal header, reads it back through the fixed header and
// compressed header readers, and checks that the fields survive the round trip.
#[test]
fn parse_and_write_header() {
    use crate::structs::lepton_header::FIXED_HEADER_SIZE;

    let original = make_minimal_lepton_header();

    // write
    let write_features = EnabledFeatures::compat_lepton_vector_write();
    let mut serialized = Vec::new();
    original
        .write_lepton_header(&mut Cursor::new(&mut serialized), &write_features)
        .unwrap();

    // read the fixed part first, it tells us how long the compressed part is
    let mut reader = Cursor::new(&serialized);
    let mut fixed_buffer = [0; FIXED_HEADER_SIZE];
    reader.read_exact(&mut fixed_buffer).unwrap();

    let mut read_features = EnabledFeatures::compat_lepton_vector_read();
    let mut decoded = LeptonHeader::default_boxed();

    let compressed_header_size = decoded
        .read_lepton_fixed_header(&fixed_buffer, &mut read_features)
        .unwrap();

    decoded
        .read_compressed_lepton_header(&mut reader, &mut read_features, compressed_header_size)
        .unwrap();

    // compare
    assert_eq!(
        original.uncompressed_lepton_header_size,
        decoded.uncompressed_lepton_header_size
    );

    assert_eq!(original.git_revision_prefix, decoded.git_revision_prefix);
    assert_eq!(original.encoder_version, decoded.encoder_version);

    assert_eq!(original.jpeg_file_size, decoded.jpeg_file_size);
    assert_eq!(original.rinfo.raw_jpeg_header, decoded.rinfo.raw_jpeg_header);
    assert_eq!(original.thread_handoff, decoded.thread_handoff);
}

#[cfg(test)]
/// Test helper: builds a small `LeptonHeader` whose JPEG header is parsed from an
/// embedded minimal JPEG and which carries two thread handoffs.
///
/// `jpeg_file_size` is set to 123 and `uncompressed_lepton_header_size` to
/// `Some(156)`; the fixed-header test vectors in this file rely on those values.
fn make_minimal_lepton_header() -> Box<LeptonHeader> {
    // minimal jpeg that will pass the validity read tests

    use crate::jpeg::jpeg_header::parse_jpeg_header;
    let min_jpeg = [
        0xffu8, 0xe0, // APP0
        0x00, 0x10, 0x4a, 0x46, 0x49, 0x46, 0x00, 0x01, 0x01, 0x01, 0x00, 0x48, 0x00, 0x48, 0x00,
        0x00, 0xff, 0xdb, // DQT
        0x00, 0x43, 0x00, 0x03, 0x02, 0x02, 0x02, 0x02, 0x02, 0x03, 0x02, 0x02, 0x02, 0x03, 0x03,
        0x03, 0x03, 0x04, 0x06, 0x04, 0x04, 0x04, 0x04, 0x04, 0x08, 0x06, 0x06, 0x05, 0x06, 0x09,
        0x08, 0x0a, 0x0a, 0x09, 0x08, 0x09, 0x09, 0x0a, 0x0c, 0x0f, 0x0c, 0x0a, 0x0b, 0x0e, 0x0b,
        0x09, 0x09, 0x0d, 0x11, 0x0d, 0x0e, 0x0f, 0x10, 0x10, 0x11, 0x10, 0x0a, 0x0c, 0x12, 0x13,
        0x12, 0x10, 0x13, 0x0f, 0x10, 0x10, 0x10, 0xff, 0xC1, 0x00, 0x0b, 0x08, 0x00,
        0x10, // width
        0x00, 0x10, // height
        0x01, // cmpc
        0x01, // Jid
        0x11, // sfv / sfh
        0x00, 0xff, 0xda, // SOS
        0x00, 0x08, 0x01, 0x01, 0x00, 0x00, 0x3f, 0x00, 0xd2, 0xcf, 0x20, 0xff, 0xd9, // EOI
    ];

    let enabled_features = EnabledFeatures::compat_lepton_vector_read();

    let mut lh = LeptonHeader::default_boxed();
    lh.jpeg_file_size = 123;
    lh.uncompressed_lepton_header_size = Some(156);

    // fills in both the parsed JPEG header and the reconstruction info
    parse_jpeg_header(
        &mut Cursor::new(min_jpeg),
        &enabled_features,
        &mut lh.jpeg_header,
        &mut lh.rinfo,
    )
    .unwrap();
    // two handoffs with distinct values so that a round trip can tell them apart
    lh.thread_handoff.push(ThreadHandoff {
        luma_y_start: 0,
        luma_y_end: 1,
        segment_offset_in_file: 0, // not serialized (computed based on segment size)
        segment_size: 500,
        overhang_byte: 0,
        num_overhang_bits: 1,
        last_dc: [1, 2, 3, 0],
    });
    lh.thread_handoff.push(ThreadHandoff {
        luma_y_start: 1,
        luma_y_end: 2,
        segment_offset_in_file: 0,
        segment_size: 600,
        overhang_byte: 1,
        num_overhang_bits: 2,
        last_dc: [2, 3, 4, 0],
    });

    lh
}

/// Test helper: writes `header` with `enabled_features`, reads the result back and
/// returns the decoded header together with the features recovered from the file.
///
/// The compressed header is read using the size stored in the fixed header, so the
/// round trip also verifies that the written size field is correct, and the bytes
/// following the compressed header are checked to be the completion marker.
#[cfg(test)]
fn verify_roundtrip(
    header: &LeptonHeader,
    enabled_features: &EnabledFeatures,
) -> (Box<LeptonHeader>, EnabledFeatures) {
    let mut output = Vec::new();
    header
        .write_lepton_header(&mut output, enabled_features)
        .unwrap();

    let mut read_header = LeptonHeader::default_boxed();
    let mut read_enabled_features = EnabledFeatures::compat_lepton_vector_read();

    println!("output: {:?}", &output[0..FIXED_HEADER_SIZE]);

    let compressed_header_size = read_header
        .read_lepton_fixed_header(
            &output[..FIXED_HEADER_SIZE].try_into().unwrap(),
            &mut read_enabled_features,
        )
        .unwrap();

    // use the size recorded in the file rather than "everything that is left",
    // which would also cover the trailing completion marker
    read_header
        .read_compressed_lepton_header(
            &mut Cursor::new(&output[FIXED_HEADER_SIZE..]),
            &mut read_enabled_features,
            compressed_header_size,
        )
        .unwrap();

    // whatever follows the compressed header must be exactly the completion marker
    assert_eq!(
        &output[FIXED_HEADER_SIZE + compressed_header_size..],
        &LEPTON_HEADER_COMPLETION_MARKER[..]
    );

    (read_header, read_enabled_features)
}


================================================
FILE: lib/src/structs/mod.rs
================================================
/*---------------------------------------------------------------------------------------------
 *  Copyright (c) Microsoft Corporation. All rights reserved.
 *  Licensed under the Apache License, Version 2.0. See LICENSE.txt in the project root for license information.
 *  This software incorporates material from third parties. See NOTICE.txt for details.
 *--------------------------------------------------------------------------------------------*/

mod block_context;
mod branch;
mod idct;
mod lepton_decoder;
mod lepton_encoder;
pub mod lepton_file_reader;
pub mod lepton_file_writer;
pub mod lepton_header;
mod model;
pub mod multiplexer;
mod neighbor_summary;
mod partial_buffer;
mod probability_tables;
mod quantization_tables;
mod simple_hash;

pub mod simple_threadpool;

mod thread_handoff;
mod vpx_bool_reader;
mod vpx_bool_writer;

#[cfg(feature = "micro_benchmark")]
pub use idct::benchmark_idct;
#[cfg(feature = "micro_benchmark")]
pub use lepton_encoder::benchmark_roundtrip_coefficient;


================================================
FILE: lib/src/structs/model.rs
================================================
/*---------------------------------------------------------------------------------------------
 *  Copyright (c) Microsoft Corporation. All rights reserved.
 *  Licensed under the Apache License, Version 2.0. See LICENSE.txt in the project root for license information.
 *  This software incorporates material from third parties. See NOTICE.txt for details.
 *--------------------------------------------------------------------------------------------*/

use std::cmp;
use std::io::{Read, Write};

use default_boxed::DefaultBoxed;

use crate::consts::*;
use crate::helpers::{calc_sign_index, u16_bit_length, u32_bit_length};
use crate::lepton_error::{AddContext, ExitCode, Result, err_exit_code};
use crate::metrics::{ModelComponent, ModelSubComponent};
use crate::structs::branch::Branch;
use crate::structs::quantization_tables::QuantizationTables;
use crate::structs::vpx_bool_reader::VPXBoolReader;
use crate::structs::vpx_bool_writer::VPXBoolWriter;

// number of per-color models held by `Model::per_color`
const BLOCK_TYPES: usize = 2; // setting this to 3 gives us ~1% savings.. 2/3 from BLOCK_TYPES=2

const NUMERIC_LENGTH_MAX: usize = 12;
pub const MAX_EXPONENT: usize = 11; // range from 0 to 1023 requires 11 bins to describe
const COEF_BITS: usize = MAX_EXPONENT - 1; // the MSB of the value is always 1

// number of bits needed to represent a count of up to 49: ilog2(49) + 1 = 6
const NON_ZERO_7X7_COUNT_BITS: usize = 49_usize.ilog2() as usize + 1;
// number of bits needed to represent a count of up to 7: ilog2(7) + 1 = 3
const NON_ZERO_EDGE_COUNT_BITS: usize = 7_usize.ilog2() as usize + 1;
// 0th bin corresponds to 0 non-zeros and therefore is not used for encoding/decoding.
const NUM_NON_ZERO_7X7_BINS: usize = 9;
const NUM_NON_ZERO_EDGE_BINS: usize = 7;

// shape shared by `num_non_zeros_counts1x8` and `num_non_zeros_counts8x1`
type NumNonZerosCountsT = [[[Branch; 1 << NON_ZERO_EDGE_COUNT_BITS]; 8]; 8];

// dimensions of `ModelPerColor::residual_threshold_counts` (D1 is the outermost)
const RESIDUAL_THRESHOLD_COUNTS_D1: usize = 1 << (1 + RESIDUAL_NOISE_FLOOR);
// The array was used only on indices [2,7] of [0,7]
const RESIDUAL_THRESHOLD_COUNTS_D2: usize = 1 + RESIDUAL_NOISE_FLOOR - 2;
const RESIDUAL_THRESHOLD_COUNTS_D3: usize = 1 << RESIDUAL_NOISE_FLOOR;

/// The set of `Branch` tables used by the codec: one `ModelPerColor` per block
/// type plus the tables for DC coefficients.
#[derive(DefaultBoxed)]
pub struct Model {
    // one entry per block type (`BLOCK_TYPES`)
    per_color: [ModelPerColor; BLOCK_TYPES],

    // DC tables, `NUMERIC_LENGTH_MAX` entries
    counts_dc: [CountsDC; NUMERIC_LENGTH_MAX],
}

impl Model {
    /// Visits every branch of the per-color tables, in a fixed order, and hands
    /// each one to `walker`. The DC tables (`counts_dc`) are not visited.
    ///
    /// Tests use this to randomize the model so that any mismatch in the way
    /// updates are handled can be detected. It plays no part in the normal
    /// operation of the codec.
    ///
    /// Note: the visiting order is part of the contract between the model and its
    /// walkers. Changing it will break the unit tests.
    #[cfg(test)]
    pub fn walk(&mut self, mut walker: impl FnMut(&mut Branch)) {
        for color in self.per_color.iter_mut() {
            // 1. non-zero counts for the 7x7 area
            color
                .num_non_zeros_counts7x7
                .iter_mut()
                .flatten()
                .for_each(&mut walker);

            // 2. 7x7 coefficient tables: exponents first, then residual noise
            for entry in color.counts.iter_mut().flatten() {
                entry
                    .exponent_counts
                    .iter_mut()
                    .flatten()
                    .for_each(&mut walker);
                entry.residual_noise_counts.iter_mut().for_each(&mut walker);
            }

            // 3. non-zero counts for the edges, 1x8 before 8x1
            color
                .num_non_zeros_counts1x8
                .iter_mut()
                .flatten()
                .flatten()
                .for_each(&mut walker);

            color
                .num_non_zeros_counts8x1
                .iter_mut()
                .flatten()
                .flatten()
                .for_each(&mut walker);

            // 4. edge coefficient tables: exponents first, then residual noise
            for entry in color.counts_x.iter_mut().flatten() {
                entry
                    .exponent_counts
                    .iter_mut()
                    .flatten()
                    .for_each(&mut walker);
                entry.residual_noise_counts.iter_mut().for_each(&mut walker);
            }

            // 5. residual thresholds
            color
                .residual_threshold_counts
                .iter_mut()
                .flatten()
                .flatten()
                .for_each(&mut walker);

            // 6. signs
            color
                .sign_counts
                .iter_mut()
                .flatten()
                .for_each(&mut walker);
        }
    }

    /// Hashes the count of every branch visited by `walk` into a single value,
    /// so that two models can be compared for equality.
    #[cfg(test)]
    pub fn model_checksum(&mut self) -> u64 {
        use std::hash::Hasher;

        use siphasher::sip::SipHasher13;

        let mut hasher = SipHasher13::new();
        self.walk(|branch| hasher.write_u16(branch.get_count()));
        hasher.finish()
    }
}

// The arrays are laid out more or less in the order in which they are accessed.
// Array `residual_noise_counts` is split into 7x7 and edge parts to save memory.
// Some dimensions are swapped so that the more slowly changing index is the outer
// one, which lowers the frequency of cache misses.

/// `Branch` tables for a single block type; `Model` holds `BLOCK_TYPES` of these.
#[derive(DefaultBoxed)]
pub struct ModelPerColor {
    // `num_non_zeros_context` cannot exceed 25, see `calc_non_zero_counts_context_7x7`
    num_non_zeros_counts7x7:
        [[Branch; 1 << NON_ZERO_7X7_COUNT_BITS]; 1 + NON_ZERO_TO_BIN[25] as usize],

    // 7x7 coefficient tables: 49 entries for each of the `NUM_NON_ZERO_7X7_BINS` bins
    counts: [[Counts7x7; 49]; NUM_NON_ZERO_7X7_BINS],

    num_non_zeros_counts1x8: NumNonZerosCountsT,
    num_non_zeros_counts8x1: NumNonZerosCountsT,

    // edge coefficient tables: 14 entries for each of the `NUM_NON_ZERO_EDGE_BINS` bins
    counts_x: [[CountsEdge; 14]; NUM_NON_ZERO_EDGE_BINS],

    residual_threshold_counts: [[[Branch; RESIDUAL_THRESHOLD_COUNTS_D3];
        RESIDUAL_THRESHOLD_COUNTS_D2]; RESIDUAL_THRESHOLD_COUNTS_D1],

    sign_counts: [[Branch; NUMERIC_LENGTH_MAX]; 3],
}

/// Exponent and residual noise branches for one entry of `ModelPerColor::counts`.
#[derive(DefaultBoxed)]
struct Counts7x7 {
    exponent_counts: [[Branch; MAX_EXPONENT]; NUMERIC_LENGTH_MAX],
    residual_noise_counts: [Branch; COEF_BITS],
}

/// Exponent and residual noise branches for one entry of `ModelPerColor::counts_x`.
#[derive(DefaultBoxed)]
struct CountsEdge {
    // predictors for exponents are max 11 bits wide, not 12 since they are clamped
    exponent_counts: [[Branch; MAX_EXPONENT]; MAX_EXPONENT],
    // size by possible range of `min_threshold - 1`
    // that is from 0 up to `bit_width(max(freq_max)) - RESIDUAL_NOISE_FLOOR - 1`
    residual_noise_counts: [Branch; 3],
}

/// Branches used to code a DC coefficient (one entry of `Model::counts_dc`, which
/// `get_dc_branches` selects by the clamped bit length of `uncertainty`).
#[derive(DefaultBoxed)]
struct CountsDC {
    // unary-coded bit length of the coefficient; the row is selected by the bit length
    // of `uncertainty2`, which for a 16-bit magnitude ranges over 0..=16 (17 rows)
    exponent_counts: [[Branch; MAX_EXPONENT]; 17],
    // branches for the bits that follow the leading bit of the coefficient
    residual_noise_counts: [Branch; COEF_BITS],
}

impl ModelPerColor {
    /// Decodes a single 7x7 coefficient from `bool_reader`.
    ///
    /// `num_non_zeros_bin`, `zig49` and `best_prior_bit_len` select the probability
    /// branches (see `get_coef_branches`, which panics if any of them is out of range).
    /// Returns the signed coefficient, or the I/O error reported by the reader.
    #[inline(never)]
    pub fn read_coef<R: Read>(
        &mut self,
        bool_reader: &mut VPXBoolReader<R>,
        zig49: usize,
        num_non_zeros_bin: usize,
        best_prior_bit_len: usize,
    ) -> std::io::Result<i16> {
        let (exponent_branches, sign_branch, noise_branches) =
            self.get_coef_branches(num_non_zeros_bin, zig49, best_prior_bit_len);

        Model::read_length_sign_coef(
            bool_reader,
            exponent_branches,
            sign_branch,
            noise_branches,
            ModelComponent::Coef(ModelSubComponent::Exp),
            ModelComponent::Coef(ModelSubComponent::Sign),
            ModelComponent::Coef(ModelSubComponent::Noise),
        )
    }

    /// Encodes a single 7x7 coefficient `coef` to `bool_writer`.
    ///
    /// `num_non_zeros_bin`, `zig49` and `best_prior_bit_len` select the probability
    /// branches (see `get_coef_branches`, which panics if any of them is out of range).
    /// Errors from `Model::write_length_sign_coef` are returned with context added.
    #[inline(never)]
    pub fn write_coef<W: Write>(
        &mut self,
        bool_writer: &mut VPXBoolWriter<W>,
        coef: i16,
        zig49: usize,
        num_non_zeros_bin: usize,
        best_prior_bit_len: usize,
    ) -> Result<()> {
        let (exponent_branches, sign_branch, noise_branches) =
            self.get_coef_branches(num_non_zeros_bin, zig49, best_prior_bit_len);

        let written = Model::write_length_sign_coef(
            bool_writer,
            coef,
            exponent_branches,
            sign_branch,
            noise_branches,
            ModelComponent::Coef(ModelSubComponent::Exp),
            ModelComponent::Coef(ModelSubComponent::Sign),
            ModelComponent::Coef(ModelSubComponent::Noise),
        );

        written.context()
    }

    /// Returns the exponent, sign and residual-noise branches used to code a 7x7
    /// coefficient in the given context.
    ///
    /// # Panics
    ///
    /// Panics if `num_non_zeros_bin >= NUM_NON_ZERO_7X7_BINS`, `zig49 >= 49` or
    /// `best_prior_bit_len >= NUMERIC_LENGTH_MAX`.
    #[inline(always)]
    fn get_coef_branches(
        &mut self,
        num_non_zeros_bin: usize,
        zig49: usize,
        best_prior_bit_len: usize,
    ) -> (
        &mut [Branch; MAX_EXPONENT],
        &mut Branch,
        &mut [Branch; COEF_BITS],
    ) {
        // these bounds checks happen anyway, but we can provide more helpful error messages
        // and it also means that the compiler can move the actual array references around
        // if it helps with performance
        assert!(
            num_non_zeros_bin < NUM_NON_ZERO_7X7_BINS,
            "num_non_zeros_bin {0} too high",
            num_non_zeros_bin
        );
        // report the offending value itself (previously this printed num_non_zeros_bin)
        assert!(zig49 < 49, "zig49 {0} too high", zig49);
        assert!(
            best_prior_bit_len < NUMERIC_LENGTH_MAX,
            "best_prior_bit_len {0} too high",
            best_prior_bit_len
        );

        let exp = &mut self.counts[num_non_zeros_bin][zig49].exponent_counts[best_prior_bit_len];
        // 7x7 coefficients all share a single sign branch
        let sign = &mut self.sign_counts[0][0];
        let bits = &mut self.counts[num_non_zeros_bin][zig49].residual_noise_counts;

        (exp, sign, bits)
    }

    /// Encodes `num_non_zeros_7x7` with the branch grid selected by
    /// `num_non_zeros_7x7_context_bin` (panics if the bin is out of range for the table).
    pub fn write_non_zero_7x7_count<W: Write>(
        &mut self,
        bool_writer: &mut VPXBoolWriter<W>,
        num_non_zeros_7x7_context_bin: u8,
        num_non_zeros_7x7: u8,
    ) -> Result<()> {
        let context_bin = usize::from(num_non_zeros_7x7_context_bin);
        let grid_branches = &mut self.num_non_zeros_counts7x7[context_bin];

        let written = bool_writer.put_grid(
            num_non_zeros_7x7,
            grid_branches,
            ModelComponent::NonZero7x7Count,
        );

        written.context()
    }

    /// Encodes `num_non_zeros_edge` with the branch grid selected by `est_eob` and
    /// `num_non_zeros_bin`; `HORIZONTAL` chooses which of the two edge tables is used.
    pub fn write_non_zero_edge_count<W: Write, const HORIZONTAL: bool>(
        &mut self,
        bool_writer: &mut VPXBoolWriter<W>,
        est_eob: u8,
        num_non_zeros_bin: u8,
        num_non_zeros_edge: u8,
    ) -> Result<()> {
        let grid_branches =
            self.get_non_zero_counts_edge_mut::<HORIZONTAL>(est_eob, num_non_zeros_bin);

        let written = bool_writer.put_grid(
            num_non_zeros_edge,
            grid_branches,
            ModelComponent::NonZeroEdgeCount,
        );

        written.context()
    }

    /// Decodes the 7x7 non-zero count using the branch grid selected by
    /// `num_non_zeros_7x7_context_bin` (panics if the bin is out of range for the table).
    /// The decoded value is truncated to `u8`.
    pub fn read_non_zero_7x7_count<R: Read>(
        &mut self,
        bool_reader: &mut VPXBoolReader<R>,
        num_non_zeros_7x7_context_bin: u8,
    ) -> Result<u8> {
        let context_bin = usize::from(num_non_zeros_7x7_context_bin);
        let grid_branches = &mut self.num_non_zeros_counts7x7[context_bin];

        let count = bool_reader
            .get_grid(grid_branches, ModelComponent::NonZero7x7Count)
            .context()?;

        Ok(count as u8)
    }

    /// Decodes an edge non-zero count using the branch grid selected by `est_eob` and
    /// `num_non_zeros_bin`; `HORIZONTAL` chooses which of the two edge tables is used.
    /// The decoded value is truncated to `u8`.
    pub fn read_non_zero_edge_count<R: Read, const HORIZONTAL: bool>(
        &mut self,
        bool_reader: &mut VPXBoolReader<R>,
        est_eob: u8,
        num_non_zeros_bin: u8,
    ) -> Result<u8> {
        let grid_branches =
            self.get_non_zero_counts_edge_mut::<HORIZONTAL>(est_eob, num_non_zeros_bin);

        let count = bool_reader
            .get_grid(grid_branches, ModelComponent::NonZeroEdgeCount)
            .context()?;

        Ok(count as u8)
    }

    /// Decodes one edge coefficient from `bool_reader`.
    ///
    /// The coefficient is coded as: a unary-coded bit length, then (if non-zero) a sign
    /// bit, then the bits below the leading one. Bits at positions `length - 2` down to
    /// `min_threshold` use the `residual_threshold_counts` branches; any remaining lower
    /// bits use the `residual_noise_counts` branches of the edge position.
    ///
    /// * `qt` - supplies the minimum noise threshold for `zig15offset`.
    /// * `zig15offset` - edge position, must be `< 14`.
    /// * `num_non_zeros_edge` - selects the edge bin as `num_non_zeros_edge - 1`.
    /// * `best_prior` - prediction used to select the exponent and sign contexts.
    ///
    /// # Panics
    ///
    /// Panics if `zig15offset >= 14` or the edge bin is `>= NUM_NON_ZERO_EDGE_BINS`.
    /// A `num_non_zeros_edge` of 0 underflows the subtraction (a panic when overflow
    /// checks are enabled, otherwise the wrapped value fails the assertion).
    pub fn read_edge_coefficient<R: Read>(
        &mut self,
        bool_reader: &mut VPXBoolReader<R>,
        qt: &QuantizationTables,
        zig15offset: usize,
        num_non_zeros_edge: u8,
        best_prior: i32,
    ) -> Result<i16> {
        let num_non_zeros_edge_bin = usize::from(num_non_zeros_edge) - 1;

        // bounds checks will test these anyway, so check here for better
        // error messages and also gives the optimizer more freedom to move code around
        assert!(
            num_non_zeros_edge_bin < NUM_NON_ZERO_EDGE_BINS,
            "num_non_zeros_edge_bin {0} too high",
            num_non_zeros_edge_bin
        );

        assert!(zig15offset < 14, "zig15offset {0} too high", zig15offset);

        // we cap the bit length since the prior prediction can be wonky
        let best_prior_abs = best_prior.unsigned_abs();
        let best_prior_bit_len =
            cmp::min(MAX_EXPONENT - 1, u32_bit_length(best_prior_abs) as usize);

        let length_branches = &mut self.counts_x[num_non_zeros_edge_bin][zig15offset]
            .exponent_counts[best_prior_bit_len];

        // bit length of the coefficient magnitude; 0 means the coefficient is zero
        let length = bool_reader
            .get_unary_encoded(
                length_branches,
                ModelComponent::Edge(ModelSubComponent::Exp),
            )
            .context()? as i32;

        let mut coef = 0;
        if length != 0 {
            // best_prior in the initial Lepton implementation is stored as i32,
            // but the sign here is taken from its truncated i16 value
            let sign =
                &mut self.sign_counts[calc_sign_index(best_prior as i16)][best_prior_bit_len];

            // a set sign bit means non-negative (the writer emits `coef >= 0`)
            let neg = !bool_reader
                .get_bit(sign, ModelComponent::Edge(ModelSubComponent::Sign))
                .context()?;

            // the leading bit is implied by the length
            coef = 1;

            if length > 1 {
                let min_threshold: i32 = qt.get_min_noise_threshold(zig15offset).into();
                // position of the next bit to decode (the one below the leading bit)
                let mut i: i32 = length - 2;

                if i >= min_threshold {
                    let thresh_prob = self.get_residual_threshold_counts_mut(
                        best_prior_abs,
                        min_threshold,
                        length,
                    );

                    // the branch index is the value decoded so far (starting at the leading 1)
                    let mut decoded_so_far = 1;
                    while i >= min_threshold {
                        let cur_bit = bool_reader.get_bit(
                            &mut thresh_prob[decoded_so_far],
                            ModelComponent::Edge(ModelSubComponent::Residual),
                        )? as i16;

                        coef <<= 1;
                        coef |= cur_bit;

                        // since we are not strict about rejecting jpegs with out of range coefs
                        // we just make those less efficient by reusing the same probability bucket
                        decoded_so_far = cmp::min(coef as usize, thresh_prob.len() - 1);

                        i -= 1;
                    }
                }

                // the remaining `i + 1` low bits are coded with the noise branches
                if i >= 0 {
                    let res_prob = &mut self.counts_x[num_non_zeros_edge_bin][zig15offset]
                        .residual_noise_counts;

                    coef <<= i + 1;
                    coef |= bool_reader.get_n_bits(
                        i as usize + 1,
                        res_prob,
                        ModelComponent::Edge(ModelSubComponent::Noise),
                    )? as i16;
                }
            }

            if neg {
                coef = -coef;
            }
        }
        Ok(coef)
    }

    /// Encodes one edge coefficient `coef` to `bool_writer`.
    ///
    /// The coefficient is coded as: a unary-coded bit length, then (if non-zero) a sign
    /// bit, then the bits below the leading one. Bits at positions `length - 2` down to
    /// `min_threshold` use the `residual_threshold_counts` branches; any remaining lower
    /// bits use the `residual_noise_counts` branches of the edge position.
    ///
    /// * `qt` - supplies the minimum noise threshold for `zig15offset`.
    /// * `zig15offset` - edge position, must be `< 14`.
    /// * `num_non_zeros_edge` - selects the edge bin as `num_non_zeros_edge - 1`.
    /// * `best_prior` - prediction used to select the exponent and sign contexts.
    ///
    /// # Errors
    ///
    /// Returns `ExitCode::CoefficientOutOfRange` if the bit length of `|coef|` exceeds
    /// `MAX_EXPONENT`, or any error reported by the writer.
    ///
    /// # Panics
    ///
    /// Panics if `zig15offset >= 14` or the edge bin is `>= NUM_NON_ZERO_EDGE_BINS`.
    /// A `num_non_zeros_edge` of 0 underflows the subtraction (a panic when overflow
    /// checks are enabled, otherwise the wrapped value fails the assertion).
    pub fn write_edge_coefficient<W: Write>(
        &mut self,
        bool_writer: &mut VPXBoolWriter<W>,
        qt: &QuantizationTables,
        coef: i16,
        zig15offset: usize,
        num_non_zeros_edge: u8,
        best_prior: i32,
    ) -> Result<()> {
        let num_non_zeros_edge_bin = usize::from(num_non_zeros_edge) - 1;

        // bounds checks will test these anyway, so check here for better
        // error messages and also gives the optimizer more freedom to move code around
        assert!(
            num_non_zeros_edge_bin < NUM_NON_ZERO_EDGE_BINS,
            "num_non_zeros_edge_bin {0} too high",
            num_non_zeros_edge_bin
        );

        assert!(zig15offset < 14, "zig15offset {0} too high", zig15offset);

        // we cap the bit length since the prior prediction can be wonky
        let best_prior_abs = best_prior.unsigned_abs();
        let best_prior_bit_len =
            cmp::min(MAX_EXPONENT - 1, u32_bit_length(best_prior_abs) as usize);

        // bit length of the coefficient magnitude; 0 means the coefficient is zero
        let abs_coef = coef.unsigned_abs();
        let length = u16_bit_length(abs_coef) as usize;

        let exp_array = &mut self.counts_x[num_non_zeros_edge_bin][zig15offset].exponent_counts
            [best_prior_bit_len];

        if length > MAX_EXPONENT {
            return err_exit_code(ExitCode::CoefficientOutOfRange, "CoefficientOutOfRange");
        }

        bool_writer.put_unary_encoded(
            length,
            exp_array,
            ModelComponent::Edge(ModelSubComponent::Exp),
        )?;

        if coef != 0 {
            // best_prior in the initial Lepton implementation is stored as i32,
            // but the sign here is taken from its truncated i16 value
            let sign =
                &mut self.sign_counts[calc_sign_index(best_prior as i16)][best_prior_bit_len];

            // a set sign bit means non-negative (coef is known to be non-zero here)
            bool_writer.put_bit(
                coef >= 0,
                sign,
                ModelComponent::Edge(ModelSubComponent::Sign),
            )?;

            if length > 1 {
                let min_threshold = i32::from(qt.get_min_noise_threshold(zig15offset));
                // position of the next bit to encode (the one below the leading bit)
                let mut i: i32 = length as i32 - 2;

                if i >= min_threshold {
                    let thresh_prob = self.get_residual_threshold_counts_mut(
                        best_prior_abs,
                        min_threshold,
                        length as i32,
                    );

                    // the branch index is the value encoded so far (starting at the leading 1)
                    let mut encoded_so_far = 1;
                    while i >= min_threshold {
                        let cur_bit = (abs_coef & (1 << i)) != 0;
                        bool_writer.put_bit(
                            cur_bit,
                            &mut thresh_prob[encoded_so_far],
                            ModelComponent::Edge(ModelSubComponent::Residual),
                        )?;

                        encoded_so_far <<= 1;
                        if cur_bit {
                            encoded_so_far |= 1;
                        }

                        // since we are not strict about rejecting jpegs with out of range coefs
                        // we just make those less efficient by reusing the same probability bucket
                        encoded_so_far = cmp::min(encoded_so_far, thresh_prob.len() - 1);

                        i -= 1;
                    }
                }

                // the remaining `i + 1` low bits are coded with the noise branches
                if i >= 0 {
                    let res_prob = &mut self.counts_x[num_non_zeros_edge_bin][zig15offset]
                        .residual_noise_counts;

                    bool_writer
                        .put_n_bits(
                            abs_coef as usize,
                            i as usize + 1,
                            res_prob,
                            ModelComponent::Edge(ModelSubComponent::Noise),
                        )
                        .context()?;
                }
            }
        }

        Ok(())
    }

    /// Selects the row of `residual_threshold_counts` for the given prediction,
    /// threshold and coefficient bit length. Both indices are clamped to the last
    /// valid entry of their dimension.
    fn get_residual_threshold_counts_mut(
        &mut self,
        best_prior_abs: u32,
        min_threshold: i32,
        length: i32,
    ) -> &mut [Branch; RESIDUAL_THRESHOLD_COUNTS_D3] {
        let table = &mut self.residual_threshold_counts;

        // Need to & 0xffff since C++ version casts to a uint16_t in the array lookup
        // and we need to match that behavior. It's unlikely that this will be a problem
        // since it would require an extremely large best_prior, which is difficult
        // due to the range limits of 2047 of the coefficients but still in the
        // interest of correctness we should match the C++ behavior.
        let prior_index = cmp::min(
            ((best_prior_abs & 0xffff) >> min_threshold) as usize,
            table.len() - 1,
        );

        // This function was invoked only with `length - 2 >= min_threshold`,
        // then 2nd array index range can be shortened by 2.
        let length_index = cmp::min(
            (length - min_threshold - 2) as usize,
            table[0].len() - 1,
        );

        &mut table[prior_index][length_index]
    }

    /// Returns the branch grid for an edge non-zero count: the 8x1 table when
    /// `HORIZONTAL` is true, the 1x8 table otherwise, indexed by `est_eob` and
    /// `num_nonzeros_bin`.
    fn get_non_zero_counts_edge_mut<const HORIZONTAL: bool>(
        &mut self,
        est_eob: u8,
        num_nonzeros_bin: u8,
    ) -> &mut [Branch; 8] {
        let table = if HORIZONTAL {
            &mut self.num_non_zeros_counts8x1
        } else {
            &mut self.num_non_zeros_counts1x8
        };

        &mut table[est_eob as usize][num_nonzeros_bin as usize]
    }
}

impl Model {
    /// Returns the per-color model at `color_index` (panics if the index is out of range).
    pub fn get_per_color(&mut self, color_index: usize) -> &mut ModelPerColor {
        let per_color = &mut self.per_color;
        &mut per_color[color_index]
    }

    /// Decodes a DC coefficient from `bool_reader`.
    ///
    /// `uncertainty`, `uncertainty2` and `color_index` select the probability branches
    /// (see `get_dc_branches`). Reader errors are returned with context added.
    pub fn read_dc<R: Read>(
        &mut self,
        bool_reader: &mut VPXBoolReader<R>,
        color_index: usize,
        uncertainty: i16,
        uncertainty2: i16,
    ) -> Result<i16> {
        let (exponent_branches, sign_branch, noise_branches) =
            self.get_dc_branches(uncertainty, uncertainty2, color_index);

        let decoded = Model::read_length_sign_coef(
            bool_reader,
            exponent_branches,
            sign_branch,
            noise_branches,
            ModelComponent::DC(ModelSubComponent::Exp),
            ModelComponent::DC(ModelSubComponent::Sign),
            ModelComponent::DC(ModelSubComponent::Noise),
        );

        decoded.context()
    }

    /// Encodes the DC coefficient `coef` to `bool_writer`.
    ///
    /// `uncertainty`, `uncertainty2` and `color_index` select the probability branches
    /// (see `get_dc_branches`). Writer errors are returned with context added.
    pub fn write_dc<W: Write>(
        &mut self,
        bool_writer: &mut VPXBoolWriter<W>,
        color_index: usize,
        coef: i16,
        uncertainty: i16,
        uncertainty2: i16,
    ) -> Result<()> {
        let (exponent_branches, sign_branch, noise_branches) =
            self.get_dc_branches(uncertainty, uncertainty2, color_index);

        let written = Model::write_length_sign_coef(
            bool_writer,
            coef,
            exponent_branches,
            sign_branch,
            noise_branches,
            ModelComponent::DC(ModelSubComponent::Exp),
            ModelComponent::DC(ModelSubComponent::Sign),
            ModelComponent::DC(ModelSubComponent::Noise),
        );

        written.context()
    }

    /// Returns the exponent, sign and residual-noise branches used to code a DC
    /// coefficient.
    ///
    /// The `counts_dc` entry is chosen by the bit length of `|uncertainty|` (clamped to
    /// the last entry), the exponent row by the bit length of `|uncertainty2|`, and the
    /// sign branch comes from the per-color model selected by `color_index`.
    #[inline(always)]
    fn get_dc_branches(
        &mut self,
        uncertainty: i16,
        uncertainty2: i16,
        color_index: usize,
    ) -> (
        &mut [Branch; MAX_EXPONENT],
        &mut Branch,
        &mut [Branch; COEF_BITS],
    ) {
        let mxm_bits = u16_bit_length(uncertainty.unsigned_abs()) as usize;
        let edge_offset_bits = u16_bit_length(uncertainty2.unsigned_abs()) as usize;

        let dc_index = cmp::min(mxm_bits, self.counts_dc.len() - 1);
        let dc_counts = &mut self.counts_dc[dc_index];

        let exp = &mut dc_counts.exponent_counts[edge_offset_bits];
        let bits = &mut dc_counts.residual_noise_counts;

        // +1 to separate from sign_counts[0][0]
        let sign_index = calc_sign_index(uncertainty2) + 1;
        let sign = &mut self.per_color[color_index].sign_counts[0][sign_index];

        (exp, sign, bits)
    }

    /// Decodes a coefficient stored as a unary-coded bit length, a sign bit (only when
    /// the length is non-zero) and the `length - 1` bits below the implied leading bit.
    ///
    /// The three `ModelComponent` arguments are passed through to the reader calls for
    /// the magnitude, sign and remaining bits respectively.
    #[inline(always)]
    fn read_length_sign_coef<const A: usize, const B: usize, R: Read>(
        bool_reader: &mut VPXBoolReader<R>,
        magnitude_branches: &mut [Branch; A],
        sign_branch: &mut Branch,
        bits_branch: &mut [Branch; B],
        mag_cmp: ModelComponent,
        sign_cmp: ModelComponent,
        bits_cmp: ModelComponent,
    ) -> std::io::Result<i16> {
        debug_assert!(
            A - 1 <= B,
            "A (max mag) should be not more than B+1 (max bits). A={0} B={1} from {2:?}",
            A,
            B,
            mag_cmp
        );

        let length = bool_reader.get_unary_encoded(magnitude_branches, mag_cmp)?;
        if length == 0 {
            // a zero length encodes a zero coefficient; no sign or bits follow
            return Ok(0);
        }

        // a set sign bit means the coefficient is positive
        let neg = !bool_reader.get_bit(sign_branch, sign_cmp)?;

        let low_bits: i16 = if length > 1 {
            bool_reader.get_n_bits(length - 1, bits_branch, bits_cmp)? as i16
        } else {
            0
        };

        // restore the implied leading bit
        let magnitude = low_bits | (1 << (length - 1)) as i16;

        Ok(if neg { -magnitude } else { magnitude })
    }

    /// Encodes `coef` as a unary-coded bit length, a sign bit (only when `coef` is
    /// non-zero) and the `length - 1` bits below the leading bit.
    ///
    /// Returns `ExitCode::CoefficientOutOfRange` if the bit length of `|coef|` exceeds
    /// `A`. The three `ModelComponent` arguments are passed through to the writer calls
    /// for the magnitude, sign and remaining bits respectively.
    fn write_length_sign_coef<const A: usize, const B: usize, W: Write>(
        bool_writer: &mut VPXBoolWriter<W>,
        coef: i16,
        magnitude_branches: &mut [Branch; A],
        sign_branch: &mut Branch,
        bits_branch: &mut [Branch; B],
        mag_cmp: ModelComponent,
        sign_cmp: ModelComponent,
        bits_cmp: ModelComponent,
    ) -> Result<()> {
        debug_assert!(
            A - 1 <= B,
            "A (max mag) should be not more than B+1 (max bits). A={0} B={1} from {2:?}",
            A,
            B,
            mag_cmp,
        );

        let magnitude = coef.unsigned_abs();
        let bit_len = u16_bit_length(magnitude);

        if bit_len > A as u8 {
            return err_exit_code(
                ExitCode::CoefficientOutOfRange,
                "coefficient > MAX_EXPONENT",
            );
        }

        bool_writer.put_unary_encoded(bit_len as usize, magnitude_branches, mag_cmp)?;

        // zero has no sign, so the sign bit is only written for non-zero coefficients
        if coef != 0 {
            let is_positive = coef > 0;
            bool_writer.put_bit(is_positive, sign_branch, sign_cmp)?;
        }

        // the leading bit is implied by the length, only the lower bits are written
        if bit_len > 1 {
            debug_assert!(
                (magnitude & (1 << (bit_len - 1))) != 0,
                "Biggest bit must be set"
            );
            debug_assert!(
                (magnitude & (1 << bit_len)) == 0,
                "Beyond Biggest bit must be zero"
            );

            let low_bit_count = bit_len as usize - 1;
            bool_writer.put_n_bits(magnitude as usize, low_bit_count, bits_branch, bits_cmp)?;
        }

        Ok(())
    }
}


================================================
FILE: lib/src/structs/multiplexer.rs
================================================
//! Implements a multiplexer that reads and writes blocks to a stream from multiple partitions. Each
//! partition can run on its own thread to allow for increased parallelism when processing large images.
//!
//! The write implementation identifies the blocks by partition_id and tries to write in 64K blocks. The file
//! ends up with an interleaved stream of blocks from each partition.
//!
//! The read implementation reads the blocks from the file and sends them to the appropriate worker thread
//! for the partition.

use std::cmp;
use std::collections::VecDeque;
use std::io::{Cursor, Read, Write};
use std::mem::swap;
use std::sync::mpsc::{Receiver, Sender, TryRecvError, channel};
use std::sync::{Arc, Mutex};

use byteorder::WriteBytesExt;

use super::simple_threadpool::LeptonThreadPool;

use crate::lepton_error::{AddContext, ExitCode, Result};
use crate::{LeptonError, Metrics};
use crate::{helpers::*, lepton_error::err_exit_code, structs::partial_buffer::PartialBuffer};

/// The message that is sent between the threads
enum Message {
    /// Signals that no more blocks will follow; carries the partition id of the sender.
    Eof(usize),
    /// A block of bytes together with the partition id it belongs to.
    WriteBlock(usize, Vec<u8>),
}

/// Writer handed to a partition's processor. Written bytes are buffered and sent
/// through a channel as `Message::WriteBlock` messages tagged with the partition id.
pub struct MultiplexWriter {
    /// id attached to every block sent by this writer
    partition_id: usize,
    /// channel on which the buffered blocks are sent
    sender: Sender<Message>,
    /// bytes written so far that have not yet been sent
    buffer: Vec<u8>,
}

/// Number of bytes a `MultiplexWriter` buffers before the block is sent; this is
/// therefore also the largest block a writer produces.
const WRITE_BUFFER_SIZE: usize = 65536;

impl Write for MultiplexWriter {
    /// Appends `buf` to the internal buffer, sending a block each time the buffer
    /// reaches `WRITE_BUFFER_SIZE` bytes. All of `buf` is always consumed.
    ///
    /// Returns an error if a full block could not be sent (see `flush`).
    fn write(&mut self, buf: &[u8]) -> std::io::Result<usize> {
        let mut copy_start = 0;
        while copy_start < buf.len() {
            // copy as much as still fits into the current block
            let amount_to_copy = cmp::min(
                WRITE_BUFFER_SIZE - self.buffer.len(),
                buf.len() - copy_start,
            );
            self.buffer
                .extend_from_slice(&buf[copy_start..copy_start + amount_to_copy]);

            if self.buffer.len() == WRITE_BUFFER_SIZE {
                self.flush()?;
            }

            copy_start += amount_to_copy;
        }

        Ok(buf.len())
    }

    /// Sends any buffered bytes as a single block. Does nothing when the buffer is
    /// empty, so an empty block is never sent.
    ///
    /// Returns a `BrokenPipe` error if the receiving end of the channel has been
    /// dropped, instead of panicking on the worker thread.
    fn flush(&mut self) -> std::io::Result<()> {
        if !self.buffer.is_empty() {
            // hand the filled buffer over and continue with a fresh one
            let mut new_buffer = Vec::with_capacity(WRITE_BUFFER_SIZE);
            swap(&mut new_buffer, &mut self.buffer);

            self.sender
                .send(Message::WriteBlock(self.partition_id, new_buffer))
                .map_err(|_| {
                    std::io::Error::new(
                        std::io::ErrorKind::BrokenPipe,
                        "multiplex receiver was dropped before all blocks were written",
                    )
                })?;
        }
        Ok(())
    }
}

/// Collects the thread results and errors and returns them as a vector
struct ThreadResults<RESULT> {
    /// one receiver per closure created by `send_results`, in creation order
    results: Vec<Receiver<Result<RESULT>>>,
}

impl<RESULT> ThreadResults<RESULT> {
    /// Creates a collector with no registered work.
    fn new() -> Self {
        Self {
            results: Vec::new(),
        }
    }

    /// Wraps `f` in a closure that runs it through `catch_unwind_result` and sends the
    /// outcome on a new channel whose receiving end is kept for `receive_results`.
    fn send_results<T: FnOnce() -> Result<RESULT> + Send + 'static>(
        &mut self,
        f: T,
    ) -> impl FnOnce() + use<RESULT, T> {
        let (result_tx, result_rx) = channel();
        self.results.push(result_rx);

        move || {
            let outcome = catch_unwind_result(f);
            // the collector may already be gone, in which case nobody wants the result
            let _ = result_tx.send(outcome);
        }
    }

    /// Waits on every registered receiver and returns the results in registration
    /// order, or an error if any closure failed or its channel was broken.
    fn receive_results(&mut self) -> Result<Vec<RESULT>> {
        let mut collected = Vec::new();
        let mut failure = None;

        // every receiver is drained even after a failure has been seen
        for receiver in self.results.drain(..) {
            match receiver.recv() {
                Ok(Ok(value)) => collected.push(value),
                Ok(Err(e)) => {
                    failure = Some(e);
                }
                Err(e) => {
                    // prefer real errors over broken channel errors
                    if failure.is_none() {
                        failure = Some(e.into());
                    }
                }
            }
        }

        match failure {
            Some(e) => Err(e),
            None => Ok(collected),
        }
    }
}

/// Given an arbitrary writer, this function will launch the given number of partitions and call the processor function
/// on each of them, and collect the output written by each partition to the writer in blocks identified by the partition_id.
///
/// This output stream can be processed by `multiplex_read` to get the data back, using the same number of partitions.
///
/// * `writer` - destination of the interleaved block stream.
/// * `num_partitions` - number of partitions to run; with zero partitions nothing is
///   written and an empty result vector is returned.
/// * `max_processor_threads` - upper bound on worker threads when the pool is parallel.
/// * `thread_pool` - runs the workers; if its `max_parallelism()` is not greater than 1
///   all partitions are processed inline on the calling thread before anything is written.
/// * `processor` - called once per partition with that partition's writer and id.
///
/// Returns the processors' results in partition order, or an error if writing failed
/// or any processor failed.
pub fn multiplex_write<WRITE, FN, RESULT>(
    writer: &mut WRITE,
    num_partitions: usize,
    max_processor_threads: usize,
    thread_pool: &dyn LeptonThreadPool,
    processor: FN,
) -> Result<Vec<RESULT>>
where
    WRITE: Write,
    FN: Fn(&mut MultiplexWriter, usize) -> Result<RESULT> + Send + Sync + 'static,
    RESULT: Send + 'static,
{
    let mut thread_results = ThreadResults::new();

    // receives packets from threads as they are generated
    let mut packet_receivers = Vec::new();

    let arc_processor = Arc::new(Box::new(processor));

    let mut work: VecDeque<Box<dyn FnOnce() + Send>> = VecDeque::new();

    for partition_id in 0..num_partitions {
        let (tx, rx) = channel();

        let mut thread_writer = MultiplexWriter {
            partition_id,
            sender: tx,
            buffer: Vec::with_capacity(WRITE_BUFFER_SIZE),
        };

        let processor_clone = arc_processor.clone();

        let f = Box::new(thread_results.send_results(move || {
            let r = processor_clone(&mut thread_writer, partition_id)?;

            // send whatever is left in the buffer, followed by the end marker
            thread_writer.flush().context()?;

            thread_writer
                .sender
                .send(Message::Eof(partition_id))
                .context()?;
            Ok(r)
        }));
        work.push_back(f);

        packet_receivers.push(rx);
    }

    drop(arc_processor);

    if thread_pool.max_parallelism() > 1 {
        spawn_processor_threads(thread_pool, max_processor_threads, work);
    } else {
        // single threaded, just run all the work inline, which will
        // build up the receiver queues that are used to write the image
        for f in work.drain(..) {
            f();
        }
    }

    // now that all the work has been started, we can write the data to the writer:
    // carousel through the partitions, taking one block from each in turn, so that
    // the blocks get written in a deterministic order.
    //
    // Looping on non-emptiness (rather than indexing unconditionally) also makes
    // `num_partitions == 0` return an empty result instead of panicking.
    let mut current_thread_writer = 0;
    while !packet_receivers.is_empty() {
        match packet_receivers[current_thread_writer].recv() {
            Ok(Message::WriteBlock(partition_id, b)) => {
                // block length and partition header
                // NOTE(review): the partition id shares the header byte with the length
                // flags and the reader masks it with 0xf, so ids above 15 would not
                // round-trip - confirm callers cap the number of partitions.
                let tid = partition_id as u8;
                // blocks are never empty (flush only sends non-empty buffers)
                let l = b.len() - 1;
                if l == 4095 || l == 16383 || l == 65535 {
                    // length is a special power of 2 - standard block length is 2^16
                    // the flag value (1, 2 or 3) is stored in bits 4-5 of the header
                    writer.write_u8(tid | (((l.ilog2() as u8 >> 1) - 4) << 4))?;
                } else {
                    // explicit length: (length - 1) as 16 bits, low byte first
                    writer.write_u8(tid)?;
                    writer.write_u8((l & 0xff) as u8)?;
                    writer.write_u8(((l >> 8) & 0xff) as u8)?;
                }
                // block itself
                writer.write_all(&b[..])?;

                // go to next thread
                current_thread_writer = (current_thread_writer + 1) % packet_receivers.len();
            }
            Ok(Message::Eof(_)) | Err(_) => {
                // this partition is finished (or its sender is gone), stop polling it
                packet_receivers.remove(current_thread_writer);

                if !packet_receivers.is_empty() {
                    current_thread_writer %= packet_receivers.len();
                }
            }
        }
    }

    thread_results.receive_results()
}

/// Used by the processor thread to read data in a blocking way.
/// The partition_id is used only to assert (in debug builds) that we
/// are only getting the data that we are expecting.
pub struct MultiplexReader {
    /// the multiplexed thread stream we are processing
    partition_id: usize,

    /// the receiver part of the channel to get more buffers
    receiver: Receiver<Message>,

    /// the block we are currently reading from. When reading from it returns
    /// zero bytes, we try to refill it from the channel if we haven't reached
    /// the end of the stream
    current_buffer: Cursor<Vec<u8>>,

    /// once we get told we are at the end of the stream, we just
    /// always return 0 bytes
    end_of_file: bool,
}

impl Read for MultiplexReader {
    /// Fast path: serve the read from the current block. Only when the block has
    /// nothing left to give do we fall back to the slow path that waits for more data.
    #[inline(always)]
    fn read(&mut self, buf: &mut [u8]) -> std::io::Result<usize> {
        match self.current_buffer.read(buf)? {
            0 => self.read_slow(buf),
            amount_read => Ok(amount_read),
        }
    }
}

impl MultiplexReader {
    /// Slow path for reads: blocks on the channel until a block with data arrives,
    /// or returns zero once the end of the stream has been signalled. A broken
    /// channel is reported as an I/O error.
    #[cold]
    #[inline(never)]
    fn read_slow(&mut self, buf: &mut [u8]) -> std::io::Result<usize> {
        loop {
            if self.end_of_file {
                // nothing if we reached the end of file
                return Ok(0);
            }

            let amount_read = self.current_buffer.read(buf)?;
            if amount_read > 0 {
                return Ok(amount_read);
            }

            // current block is exhausted, wait for the next message
            match self.receiver.recv() {
                Ok(Message::Eof(_tid)) => {
                    self.end_of_file = true;
                }
                Ok(Message::WriteBlock(tid, block)) => {
                    debug_assert_eq!(
                        tid, self.partition_id,
                        "incoming thread must be equal to processing thread"
                    );
                    self.current_buffer = Cursor::new(block);
                }
                Err(e) => {
                    return Err(std::io::Error::new(std::io::ErrorKind::Other, e));
                }
            }
        }
    }
}

/// Reads data in multiplexed format and sends it to the appropriate processor, each
/// running on its own thread. The processor function is called with the partition_id and
/// a blocking reader that it can use to read its own data.
///
/// Once the multiplexed data is finished reading, we break the channel to the worker threads,
/// causing any processor that is still trying to read from the channel to error out and exit.
/// After all the readers have exited, we collect the results/errors from all the processors
/// and return a vector of the results back to the caller.
pub struct MultiplexReaderState<RESULT> {
    /// one sender per partition, used to pass incoming blocks to that partition's reader
    sender_channels: Vec<Sender<Message>>,
    /// one receiver per partition, on which the processor reports results and errors
    receiver_channels: Vec<Receiver<MultiplexReadResult<RESULT>>>,
    /// passed to the source buffer when taking bytes from it in `process_buffer`
    retention_bytes: usize,
    /// where we are in parsing the multiplexed block stream
    current_state: State,
    /// the queued work items when the thread pool is not parallel (they were not
    /// spawned by `multiplex_read`), `None` when they were handed to the thread pool
    single_thread_work: Option<VecDeque<Box<dyn FnOnce() + Send>>>,
    /// starts out as `Metrics::default()`
    /// NOTE(review): presumably accumulates the metrics reported by the partitions - confirm
    merged_metrics: Metrics,
}

/// Parser state of `MultiplexReaderState` while decoding the block stream.
enum State {
    /// Waiting for the one-byte block header, whose low 4 bits are the partition id.
    StartBlock,
    /// Entered when the header byte is below 16 (no length flags); carries the
    /// partition id. The writer follows such a header with the block length minus
    /// one as two bytes, low byte first.
    U16Length(u8),
    /// NOTE(review): presumably the partition id and the number of block bytes
    /// still to be read - confirm against `process_buffer`.
    Block(u8, usize),
}

/// Messages a partition processor sends back on its result channel.
pub enum MultiplexReadResult<RESULT> {
    /// A (possibly partial) result produced by the processor.
    Result(RESULT),
    /// Sent by the worker wrapper when the processor call fails.
    Error(LeptonError),
    /// NOTE(review): presumably sent by a processor when it has finished, carrying
    /// its metrics - confirm against the processor implementations.
    Complete(Metrics),
}

/// Sets up `num_partitions` partition readers and returns the state object that
/// feeds them with incoming multiplexed data.
///
/// For every partition a work item is queued that calls `processor` with the
/// partition_id, a blocking reader for that partition's data, and a sender on which
/// it can report results or errors. Partial results can be reported by sending
/// several results before the end of the data is reached. If the processor fails,
/// the error is sent on the same channel.
///
/// When the thread pool is parallel the work items are handed to it right away
/// (using at most `max_processor_threads` workers); otherwise they are stored in the
/// returned state.
///
/// The returned state is used to process incoming data and to retrieve the
/// results/errors from the processors.
pub fn multiplex_read<FN, RESULT>(
    num_partitions: usize,
    max_processor_threads: usize,
    thread_pool: &dyn LeptonThreadPool,
    retention_bytes: usize,
    processor: FN,
) -> MultiplexReaderState<RESULT>
where
    FN: Fn(usize, &mut MultiplexReader, &Sender<MultiplexReadResult<RESULT>>) -> Result<()>
        + Send
        + Sync
        + 'static,
    RESULT: Send + 'static,
{
    let shared_processor = Arc::new(Box::new(processor));

    let mut sender_channels = Vec::new();
    let mut receiver_channels = Vec::new();

    // the work items are queued first so that we can decide afterwards where they run
    let mut work: VecDeque<Box<dyn FnOnce() + Send>> = VecDeque::new();

    for partition_id in 0..num_partitions {
        // blocks travel to the partition on one channel ...
        let (block_tx, block_rx) = channel::<Message>();
        sender_channels.push(block_tx);

        let partition_processor = shared_processor.clone();

        // ... and results/errors travel back on another
        let (result_tx, result_rx) = channel::<MultiplexReadResult<RESULT>>();
        receiver_channels.push(result_rx);

        work.push_back(Box::new(move || {
            // the reader starts with an empty block and pulls data from its channel
            let mut partition_reader = MultiplexReader {
                partition_id,
                current_buffer: Cursor::new(Vec::new()),
                receiver: block_rx,
                end_of_file: false,
            };

            let outcome = catch_unwind_result(|| {
                partition_processor(partition_id, &mut partition_reader, &result_tx)
            });

            if let Err(e) = outcome {
                // the receiver may already be gone, nothing more we can do then
                let _ = result_tx.send(MultiplexReadResult::Error(e));
            }
        }));
    }

    let single_thread_work = if thread_pool.max_parallelism() > 1 {
        spawn_processor_threads(thread_pool, max_processor_threads, work);
        None
    } else {
        Some(work)
    };

    MultiplexReaderState {
        sender_channels,
        receiver_channels,
        current_state: State::StartBlock,
        retention_bytes,
        single_thread_work,
        merged_metrics: Metrics::default(),
    }
}

/// Starts pool workers that drain `work` until it is empty.
///
/// The number of workers is the smaller of `max_processor_threads` and the number of
/// work items, so a worker may execute several items one after the other. If that number
/// is zero nothing is started and the queue is dropped when this function returns.
fn spawn_processor_threads(
    thread_pool: &dyn LeptonThreadPool,
    max_processor_threads: usize,
    work: VecDeque<Box<dyn FnOnce() + Send>>,
) {
    let worker_count = max_processor_threads.min(work.len());
    let queue = Arc::new(Mutex::new(work));

    for _ in 0..worker_count {
        let queue = Arc::clone(&queue);

        thread_pool.run(Box::new(move || {
            loop {
                // Pop in a separate `let` statement so that the mutex guard (a temporary)
                // is released at the end of the statement, i.e. before the job is run.
                // Folding this into a `while let` would keep the queue locked for the
                // whole duration of the job.
                let next_job = queue.lock().unwrap().pop_front();

                match next_job {
                    Some(job) => job(),
                    None => break,
                }
            }
        }));
    }
}

impl<RESULT> MultiplexReaderState<RESULT> {
    /// process as much incoming data as we can and send it to the appropriate thread
    pub fn process_buffer(&mut self, source: &mut PartialBuffer<'_>) -> Result<()> {
        while source.continue_processing() {
            match self.current_state {
                State::StartBlock => {
                    if let Some(a) = source.take_n::<1>(self.retention_bytes) {
                        let thread_marker = a[0];

                        let partition_id = thread_marker & 0xf;

                        if usize::from(partition_id) >= self.sender_channels.len() {
                            return err_exit_code(
                                ExitCode::BadLeptonFile,
                                format!("invalid partition_id {0}", partition_id),
                            );
                        }

                        if thread_marker < 16 {
                            self.current_state = State::U16Length(partition_id);
                        } else {
                            let flags = (thread_marker >> 4) & 3;
                            self.current_state = State::Block(partition_id, 1024 << (2 * flags));
                        }
                    } else {
                        break;
                    }
                }
                State::U16Length(thread_marker) => {
                    if let Some(a) = source.take_n::<2>(self.retention_bytes) {
                        let b0 = usize::from(a[0]);
                        let b1 = usize::from(a[1]);

                        self.current_state = State::Block(thread_marker, (b1 << 8) + b0 + 1);
                    } else {
                        break;
                    }
                }
                State::Block(partition_id, data_length) => {
                    if let Some(a) = source.take(data_length, self.retention_bytes) {
                        // ignore if we get error sending because channel died since we will collect
                        // the error later. We don't want to interrupt the other threads that are processing
                        // so we only get the error from the thread that actually errored out.
                        let tid = usize::from(partition_id);
                        let _ = self.sender_channels[tid].send(Message::WriteBlock(tid, a));
                        self.current_state = State::StartBlock;
                    } else {
                        break;
                    }
                }
            }
        }

        Ok(())
    }

    /// retrieves the next available result from the threads. If complete is true, this function
    /// will block until all threads are complete and return the first result or error it finds.
    /// If complete is false, this function will return immediately if no results are available.
    pub fn retrieve_result(&mut self, complete: bool) -> Result<Option<RESULT>> {
        if let Some(value) =
            Self::try_get_result(&mut self.receiver_channels, &mut self.merged_metrics)?
        {
            return Ok(Some(value));
        }

        if complete {
            // if we are complete, send eof to all threads
            for partition_id in 0..self.sender_channels.len() {
                // send eof to all threads (ignore results since they might be dead already)
                let _ = self.sender_channels[partition_id].send(Message::Eof(partition_id));
            }
            self.sender_channels.clear();

            // if we are running single threaded, now do all the work since we've buffered up everything
            // and broken the sender channels, so there's no danger of deadlock
            if let Some(single_thread_work) = &mut self.single_thread_work {
                while let Some(f) = single_thread_work.pop_front() {
                    f();

                    if let Some(value) =
                        Self::try_get_result(&mut self.receiver_channels, &mut self.merged_metrics)?
                    {
                        return Ok(Some(value));
                    }
                }
            }

            // if we are complete, then walk through all the channels to get the first result by blocking
            while let Some(r) = self.receiver_channels.get_mut(0) {
                match r.recv() {
                    Ok(v) => match v {
                        MultiplexReadResult::Result(v) => return Ok(Some(v)),
                        MultiplexReadResult::Error(e) => return Err(e),
                        MultiplexReadResult::Complete(m) => {
                            // finished, so remove it and try the next one
                            self.merged_metrics.merge_from(m);
                            self.receiver_channels.remove(0);
                        }
                    },
                    Err(e) => {
                        // channel is closed unexpectedly, clear out all channels and return error
                        self.receiver_channels.clear();
                        return Err(e.into());
                    }
                }
            }
        }
        // nothing left to read
        Ok(None)
    }

    /// tries to get a result from the receiver channels without blocking
    fn try_get_result(
        receiver_channels: &mut Vec<Receiver<MultiplexReadResult<RESULT>>>,
        metrics: &mut Metrics,
    ) -> Result<Option<RESULT>> {
        // if we aren't complete, use non-blocking to try to get some results
        // from the first thread
        while let Some(r) = receiver_channels.get_mut(0) {
            match r.try_recv() {
                Ok(v) => match v {
                    MultiplexReadResult::Result(v) => return Ok(Some(v)),
                    MultiplexReadResult::Error(e) => return Err(e),
                    MultiplexReadResult::Complete(m) => {
                        // finished, so remove it and try the next one
                        metrics.merge_from(m);
                        receiver_channels.remove(0);
                    }
                },
                Err(TryRecvError::Disconnected) => {
                    // finished, so remove it and try the next one
                    return Err(LeptonError::new(
                        ExitCode::AssertionFailure,
                        "multiplexed reader channel disconnected unexpectedly",
                    ));
                }
                Err(TryRecvError::Empty) => {
                    // no result yet, exit loop without result
                    break;
                }
            }
        }
        Ok(None)
    }

    /// takes the merged metrics from all the threads
    pub fn take_metrics(&mut self) -> Metrics {
        std::mem::take(&mut self.merged_metrics)
    }
}

#[cfg(test)]
mod tests {
    use std::time::Duration;

    use byteorder::ReadBytesExt;

    use super::*;
    use crate::lepton_error::{ExitCode, LeptonError};
    use crate::{DEFAULT_THREAD_POOL, SingleThreadPool};

    /// Simple end to end test: every partition writes the sequence
    /// `partition_id..10000` as little endian u32 values, and the multiplexed output
    /// is then read back with every processor thread limit from 1 to 10.
    #[test]
    fn test_multiplex_end_to_end() {
        let mut output = Vec::new();

        let w = multiplex_write(
            &mut output,
            10,
            10,
            &DEFAULT_THREAD_POOL,
            |writer, partition_id| -> Result<usize> {
                for i in partition_id as u32..10000 {
                    writer.write_u32::<byteorder::LittleEndian>(i)?;
                }

                Ok(partition_id)
            },
        )
        .unwrap();

        // results come back in partition order
        assert_eq!(w[..], [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]);

        for max_processor_threads in 1..=10 {
            test_read(&output, &w, max_processor_threads);
        }
    }

    /// Reads `output` back through `multiplex_read` one byte at a time and checks that
    /// each partition sees the sequence it wrote, that the results match `w`, and that
    /// the metrics of all ten partitions were merged.
    fn test_read(output: &[u8], w: &[usize], max_processor_threads: usize) {
        let mut extra = Vec::new();
        let single = SingleThreadPool::default();

        let mut multiplex_state = multiplex_read(
            10,
            max_processor_threads,
            if max_processor_threads == 1 {
                // for a single thread we shouldn't spawn any threads
                &single
            } else {
                &DEFAULT_THREAD_POOL
            },
            0,
            |partition_id, reader, result_tx: &Sender<MultiplexReadResult<usize>>| {
                for i in partition_id as u32..10000 {
                    let read_partition_id = reader.read_u32::<byteorder::LittleEndian>()?;
                    assert_eq!(read_partition_id, i);
                }
                result_tx.send(MultiplexReadResult::Result(partition_id))?;

                // every partition reports one second so the merged total is checkable
                let mut metrics = Metrics::default();
                metrics.record_cpu_worker_time(Duration::new(1, 0));

                result_tx.send(MultiplexReadResult::Complete(metrics))?;
                Ok(())
            },
        );

        // do worst case, we are just given byte at a time
        let mut r = Vec::new();

        for i in 0..output.len() {
            let mut i = PartialBuffer::new(&output[i..=i], &mut extra);
            multiplex_state.process_buffer(&mut i).unwrap();

            if let Some(res) = multiplex_state.retrieve_result(false).unwrap() {
                r.push(res);
            }
        }

        // input is exhausted: block until every remaining result has been collected
        while let Some(res) = multiplex_state.retrieve_result(true).unwrap() {
            r.push(res);
        }

        // ten partitions, one second each
        let metrics = multiplex_state.take_metrics();
        assert_eq!(metrics.get_cpu_time_worker_time(), Duration::new(10, 0));

        assert_eq!(r[..], w[..]);
    }

    /// An error returned by the processor must come back out of `retrieve_result`.
    #[test]
    fn test_multiplex_read_error() {
        let mut multiplex_state = multiplex_read(
            10,
            10,
            &DEFAULT_THREAD_POOL,
            0,
            |_, _, _: &Sender<MultiplexReadResult<()>>| -> Result<()> {
                Err(LeptonError::new(ExitCode::FileNotFound, "test error"))?
            },
        );

        let e: LeptonError = multiplex_state.retrieve_result(true).unwrap_err().into();
        assert_eq!(e.exit_code(), ExitCode::FileNotFound);
        assert!(e.message().starts_with("test error"));
    }

    /// A panic inside the processor must be reported as an `AssertionFailure` error.
    #[test]
    fn test_multiplex_read_panic() {
        let mut multiplex_state = multiplex_read(
            10,
            10,
            &DEFAULT_THREAD_POOL,
            0,
            |_, _, _: &Sender<MultiplexReadResult<()>>| -> Result<()> {
                panic!();
            },
        );

        let e: LeptonError = multiplex_state.retrieve_result(true).unwrap_err().into();
        assert_eq!(e.exit_code(), ExitCode::AssertionFailure);
    }

    // test catching errors in the multiplex_write function
    #[test]
    fn test_multiplex_write_error() {
        let mut output = Vec::new();

        let e: LeptonError = multiplex_write(
            &mut output,
            10,
            10,
            &DEFAULT_THREAD_POOL,
            |_, partition_id| -> Result<usize> {
                if partition_id == 3 {
                    // have one partition fail
                    Err(LeptonError::new(ExitCode::FileNotFound, "test error"))?
                } else {
                    Ok(0)
                }
            },
        )
        .unwrap_err()
        .into();

        assert_eq!(e.exit_code(), ExitCode::FileNotFound);
        assert!(e.message().starts_with("test error"));
    }

    // test catching panics in the multiplex_write function
    #[test]
    fn test_multiplex_write_panic() {
        let mut output = Vec::new();

        let e: LeptonError = multiplex_write(
            &mut output,
            10,
            10,
            &DEFAULT_THREAD_POOL,
            |_, partition_id| -> Result<usize> {
                if partition_id == 5 {
                    // have one partition panic
                    panic!();
                }
                Ok(0)
            },
        )
        .unwrap_err()
        .into();

        assert_eq!(e.exit_code(), ExitCode::AssertionFailure);
    }
}


================================================
FILE: lib/src/structs/neighbor_summary.rs
================================================
/*---------------------------------------------------------------------------------------------
 *  Copyright (c) Microsoft Corporation. All rights reserved.
 *  Licensed under the Apache License, Version 2.0. See LICENSE.txt in the project root for license information.
 *  This software incorporates material from third parties. See NOTICE.txt for details.
 *--------------------------------------------------------------------------------------------*/

use std::num::Wrapping;

use wide::{i16x8, i32x8};

/// Small per-block summary holding edge pixels, edge coefficient predictors and a
/// non-zero count. The values are stored as produced by `NeighborSummary::new` and are
/// read back through the getters.
#[derive(Copy, Clone, PartialEq, Debug)]
pub struct NeighborSummary {
    // edge pixel vectors handed to `new`, with the DC value (truncated to i16) added
    // to every lane
    edge_pixels_h: i16x8,
    edge_pixels_v: i16x8,

    // edge coefficient predictors, stored exactly as handed to `new`
    edge_coefs_h: i32x8,
    edge_coefs_v: i32x8,

    // non-zero count handed to `new` (the parameter is named `num_non_zeros_7x7`)
    num_non_zeros: u8,
}

/// Summary with every field set to zero. This is also the value returned by
/// `NeighborSummary::default()`.
pub static NEIGHBOR_DATA_EMPTY: NeighborSummary = NeighborSummary {
    edge_pixels_h: i16x8::ZERO,
    edge_pixels_v: i16x8::ZERO,
    edge_coefs_h: i32x8::ZERO,
    edge_coefs_v: i32x8::ZERO,
    num_non_zeros: 0,
};

impl Default for NeighborSummary {
    /// Returns a copy of the all-zero `NEIGHBOR_DATA_EMPTY` summary.
    fn default() -> Self {
        NEIGHBOR_DATA_EMPTY
    }
}

impl NeighborSummary {
    /// Builds a summary from the given edge pixels, predictors and non-zero count.
    ///
    /// `dc_deq` is truncated to 16 bits and added to every lane of both edge pixel
    /// vectors; the remaining arguments are stored unchanged.
    #[inline(always)]
    pub fn new(
        edge_pixels_h: i16x8,
        edge_pixels_v: i16x8,
        dc_deq: i32,
        num_non_zeros_7x7: u8,
        horiz_pred: i32x8,
        vert_pred: i32x8,
    ) -> Self {
        // truncation to i16 is intentional here, same as in the pixel math itself
        let dc = dc_deq as i16;

        Self {
            edge_pixels_h: edge_pixels_h + dc,
            edge_pixels_v: edge_pixels_v + dc,
            edge_coefs_h: horiz_pred,
            edge_coefs_v: vert_pred,
            num_non_zeros: num_non_zeros_7x7,
        }
    }

    /// Non-zero count that was stored by `new`.
    pub fn get_num_non_zeros(&self) -> u8 {
        self.num_non_zeros
    }

    /// Vertical edge pixels (DC already added).
    pub fn get_vertical_pix(&self) -> i16x8 {
        self.edge_pixels_v
    }

    /// Horizontal edge pixels (DC already added).
    pub fn get_horizontal_pix(&self) -> i16x8 {
        self.edge_pixels_h
    }

    /// Vertical edge coefficient predictors.
    pub fn get_vertical_coef(&self) -> i32x8 {
        self.edge_coefs_v
    }

    /// Horizontal edge coefficient predictors.
    pub fn get_horizontal_coef(&self) -> i32x8 {
        self.edge_coefs_h
    }

    /// Wrapping sum of all edge pixels plus the non-zero count. The coefficient
    /// predictors are not part of the sum.
    // used for debugging
    #[allow(dead_code)]
    pub fn checksum(&self) -> u32 {
        let horizontal = Wrapping(i32x8::from_i16x8(self.edge_pixels_h).reduce_add() as u32);
        let vertical = Wrapping(i32x8::from_i16x8(self.edge_pixels_v).reduce_add() as u32);
        let non_zeros = Wrapping(self.num_non_zeros as u32);

        (horizontal + vertical + non_zeros).0
    }
}


================================================
FILE: lib/src/structs/partial_buffer.rs
================================================
use std::cmp::min;

/// This struct is used to handle the fact that we have to take buffers
/// as they arrive, and we might not have all the data we need.
///
/// It is used via the take functions, which attempt to grab
/// the amount of data specified, and if it isn't available, store
/// the partial data in the extra buffer and return None.
///
/// Next time around, the extra data will be prepended to the next
/// buffer, so eventually the amount of data requested will become
/// available.
///
/// The concept of retention_bytes is used to handle the case where we need
/// to leave a certain amount of data in the buffer, particularly
/// where Lepton files have the 32bit file size appended.
///
/// We don't want this to get parsed out, so in that case the caller asks for
/// 4 bytes to always be left in the buffer.
pub struct PartialBuffer<'a> {
    /// part of the current buffer that has not been consumed yet
    slice: &'a [u8],
    /// data carried over between buffers; logically comes before `slice`
    extra_buffer: &'a mut Vec<u8>,
    /// set to false the first time a take call cannot be satisfied
    continue_processing: bool,
}

impl<'a> PartialBuffer<'a> {
    /// Instantiates a new buffer with a slice and a place to store extra data
    /// between calls.
    ///
    /// Extra data is used both to remember extra data from the previous buffer
    /// and is updated with any data that is left over after a take call.
    pub fn new(slice: &'a [u8], extra_buffer: &'a mut Vec<u8>) -> PartialBuffer<'a> {
        PartialBuffer {
            slice,
            extra_buffer,
            continue_processing: true,
        }
    }

    /// returns true if we haven't yet run out of data (ie take returned empty)
    pub fn continue_processing(&self) -> bool {
        self.continue_processing
    }

    /// Attempts to get "size" bytes of data from the buffer. If that much
    /// is available (including the extra buffer from the previous call), it is
    /// returned as a vector exactly that size, otherwise the data is appended
    /// to the extra buffer and None is returned.
    ///
    /// retention_bytes (see comment at top of file) indicates that we should never
    /// consume the last x bytes of the buffer. This is useful because of the particular
    /// way that Lepton files are encoded, the file size is appended without any sort
    /// of header or marker, so the only way to know we are at the end is if there
    /// are only 4 bytes left.
    ///
    /// A request where `size + retention_bytes` does not fit into a `usize` can never
    /// be satisfied and is treated like any other request for more data than there is.
    pub fn take(&mut self, size: usize, retention_bytes: usize) -> Option<Vec<u8>> {
        if !self.ensure_available(size, retention_bytes) {
            return None;
        }

        // carried over data comes first...
        let amount_from_extra = min(self.extra_buffer.len(), size);
        let mut retval = Vec::with_capacity(size);
        retval.extend(self.extra_buffer.drain(..amount_from_extra));

        // ...and the rest comes from the front of the current slice
        let current: &'a [u8] = self.slice;
        let (head, rest) = current.split_at(size - amount_from_extra);
        retval.extend_from_slice(head);
        self.slice = rest;

        debug_assert_eq!(retval.len(), size);
        Some(retval)
    }

    /// Same as take, except returns a fixed size array instead of a vector.
    ///
    /// Useful when we are expecting a small fixed number of bytes like a header
    /// or signature.
    pub fn take_n<const N: usize>(&mut self, retention_bytes: usize) -> Option<[u8; N]> {
        if !self.ensure_available(N, retention_bytes) {
            return None;
        }

        let mut retval = [0u8; N];
        let amount_from_extra = min(self.extra_buffer.len(), N);
        let (dest_extra, dest_slice) = retval.split_at_mut(amount_from_extra);

        // carried over data comes first...
        dest_extra.copy_from_slice(&self.extra_buffer[..amount_from_extra]);
        self.extra_buffer.drain(..amount_from_extra);

        // ...and the rest comes from the front of the current slice
        let current: &'a [u8] = self.slice;
        let (head, rest) = current.split_at(dest_slice.len());
        dest_slice.copy_from_slice(head);
        self.slice = rest;

        Some(retval)
    }

    /// Checks that `size` bytes can be handed out while still leaving
    /// `retention_bytes` behind. If not, the unread slice is moved into the extra
    /// buffer, processing is flagged as stopped and false is returned.
    fn ensure_available(&mut self, size: usize, retention_bytes: usize) -> bool {
        let available = self.extra_buffer.len() + self.slice.len();

        // saturate rather than overflow: a plain `+` would panic in debug builds and
        // wrap around in release builds, letting an impossible request through
        if available >= size.saturating_add(retention_bytes) {
            return true;
        }

        self.extra_buffer.extend_from_slice(self.slice);
        self.slice = &[];
        self.continue_processing = false;
        false
    }
}

// taking exactly what is in the slice succeeds and leaves nothing in extra
#[test]
fn test_taking_simple() {
    let mut extra = Vec::new();
    let mut pb = PartialBuffer::new(&[1, 2, 3, 4], &mut extra);

    let taken = pb.take(4, 0).unwrap();
    assert_eq!(taken, vec![1, 2, 3, 4]);
    assert_eq!(&extra[..], []);
}

// same as test_taking_simple, but using the fixed size array variant
#[test]
fn test_taking_simple_n() {
    let mut extra = Vec::new();
    let mut pb = PartialBuffer::new(&[1, 2, 3, 4], &mut extra);

    let taken = pb.take_n::<4>(0).unwrap();
    assert_eq!(taken, [1, 2, 3, 4]);
    assert_eq!(&extra[..], []);
}

// data that could not be taken is carried over in extra and prepended to the next buffer
#[test]
fn test_taking_extra() {
    let mut extra = Vec::new();
    let mut pb = PartialBuffer::new(&[1, 2, 3, 4], &mut extra);

    // try to take 5 bytes, but there are only 4, so it should return None and
    // leave the data read in extra
    assert_eq!(pb.take(5, 0), None);
    assert_eq!(&extra, &vec![1, 2, 3, 4]);

    // with 4 more bytes supplied we should now be able to take 5: the 4 carried
    // over in extra followed by the first byte of the new slice
    let mut pb = PartialBuffer::new(&[5, 6, 7, 8], &mut extra);

    assert_eq!(pb.take(5, 0), Some(vec![1, 2, 3, 4, 5]));

    // try to take another 5, but only 3 are left
    assert_eq!(pb.take(5, 0), None);

    // the 3 bytes we couldn't get should be in extra
    assert!(!pb.continue_processing());
    assert_eq!(&extra, &vec![6, 7, 8]);
}

// same as test_taking_extra, but using the fixed size array variant
#[test]
fn test_taking_extra_n() {
    let mut extra = Vec::new();
    let mut pb = PartialBuffer::new(&[1, 2, 3, 4], &mut extra);

    // try to take 5 bytes, but there are only 4, so it should return None and
    // leave the data read in extra
    assert_eq!(pb.take_n::<5>(0), None);
    assert_eq!(&extra, &vec![1, 2, 3, 4]);

    // with 4 more bytes supplied we should now be able to take 5: the 4 carried
    // over in extra followed by the first byte of the new slice
    let mut pb = PartialBuffer::new(&[5, 6, 7, 8], &mut extra);

    assert_eq!(pb.take_n::<5>(0), Some([1, 2, 3, 4, 5]));

    // try to take another 5, but only 3 are left
    assert_eq!(pb.take_n::<5>(0), None);

    // the 3 bytes we couldn't get should be in extra
    assert!(!pb.continue_processing());
    assert_eq!(&extra, &vec![6, 7, 8]);
}

// retention_bytes must be left over after a take, otherwise the take fails
#[test]
fn test_taking_reserve() {
    let mut extra = Vec::new();
    let mut pb = PartialBuffer::new(&[1, 2, 3, 4, 5], &mut extra);

    // taking 5 should fail because we asked for 1 byte to be retained and
    // only 5 bytes are available
    assert_eq!(pb.take(5, 1), None);

    // without the retention requirement the 5 bytes carried over in extra can be taken
    let mut pb = PartialBuffer::new(&[], &mut extra);
    assert_eq!(pb.take(5, 0), Some(vec![1, 2, 3, 4, 5]));
}


================================================
FILE: lib/src/structs/probability_tables.rs
================================================
/*---------------------------------------------------------------------------------------------
 *  Copyright (c) Microsoft Corporation. All rights reserved.
 *  Licensed under the Apache License, Version 2.0. See LICENSE.txt in the project root for license information.
 *  This software incorporates material from third parties. See NOTICE.txt for details.
 *--------------------------------------------------------------------------------------------*/

use bytemuck::cast;
use wide::{i16x8, i32x8, u16x8};

use crate::consts::*;
use crate::enabled_features;
use crate::jpeg::block_based_image::AlignedBlock;
use crate::lepton_error::err_exit_code;
use crate::structs::block_context::NeighborData;
use crate::structs::idct::*;
use crate::structs::model::*;
use crate::structs::quantization_tables::*;
use crate::{ExitCode, Result};

/// Records which neighbors (left and above) are present. The methods consult these
/// flags before reading anything from the corresponding neighbor.
pub struct ProbabilityTables {
    // true if the left neighbor is present
    left_present: bool,
    // true if the above neighbor is present
    above_present: bool,
    // true if both of the above are true (computed once in `new`)
    all_present: bool,
}

/// neither the left nor the above neighbor is present
pub static NO_NEIGHBORS: ProbabilityTables = ProbabilityTables::new(false, false);
/// only the above neighbor is present
pub static TOP_ONLY: ProbabilityTables = ProbabilityTables::new(false, true);
/// only the left neighbor is present
pub static LEFT_ONLY: ProbabilityTables = ProbabilityTables::new(true, false);
/// both the left and the above neighbor are present
pub static ALL: ProbabilityTables = ProbabilityTables::new(true, true);

/// Values returned by `ProbabilityTables::adv_predict_dc_pix`.
pub struct PredictDCResult {
    // NOTE(review): presumably the DC value predicted from the neighboring pixels - confirm
    pub predicted_dc: i32,
    // NOTE(review): the two uncertainty measures are not described here - confirm
    // their meaning against adv_predict_dc_pix
    pub uncertainty: i16,
    pub uncertainty2: i16,
    // edge pixel predictions computed from the last two rows/columns of the IDCT output
    pub next_edge_pixels_h: i16x8,
    pub next_edge_pixels_v: i16x8,
}

impl ProbabilityTables {
    pub const fn new(in_left_present: bool, in_above_present: bool) -> ProbabilityTables {
        return ProbabilityTables {
            left_present: in_left_present,
            above_present: in_above_present,
            all_present: in_left_present && in_above_present,
        };
    }

    /// true if both the left and the above neighbor are present
    pub fn is_all_present(&self) -> bool {
        self.all_present
    }

    /// true if the left neighbor is present
    pub fn is_left_present(&self) -> bool {
        self.left_present
    }
    /// true if the above neighbor is present
    pub fn is_above_present(&self) -> bool {
        self.above_present
    }

    /// Applies the DC prediction to `saved_dc`, or removes it again.
    ///
    /// With `recover_original` set the prediction is added, otherwise it is subtracted.
    /// A result that falls outside of `-max..=max`, where `max` is
    /// `1 << (MAX_EXPONENT - 1)`, is shifted back by `2 * max + 1`.
    pub fn adv_predict_or_unpredict_dc(
        saved_dc: i16,
        recover_original: bool,
        predicted_val: i32,
    ) -> i32 {
        let max_value = 1 << (MAX_EXPONENT - 1);
        let min_value = -max_value;
        let adjustment_factor = (2 * max_value) + 1;

        let signed_prediction = if recover_original {
            predicted_val
        } else {
            -predicted_val
        };

        let mut adjusted = saved_dc as i32 + signed_prediction;

        // both checks are always evaluated, one after the other
        if adjusted < min_value {
            adjusted += adjustment_factor;
        }

        if adjusted > max_value {
            adjusted -= adjustment_factor;
        }

        adjusted
    }

    /// Maps a component index to a color index: component 0 maps to 0, every other
    /// component maps to 1.
    pub fn get_color_index(component: usize) -> usize {
        usize::from(component != 0)
    }

    /// Looks up the bin for a non-zero count in `NON_ZERO_TO_BIN_7X7`. Panics if
    /// `num_non_zeros` is outside of that table.
    #[inline(always)]
    pub fn num_non_zeros_to_bin_7x7(num_non_zeros: usize) -> usize {
        let bin = NON_ZERO_TO_BIN_7X7[num_non_zeros];
        usize::from(bin)
    }

    /// Calculates the context bin for the number of non-zeros from the counts stored
    /// for the above and left neighbors.
    ///
    /// With both neighbors present the context is the rounded average of the two
    /// counts halved again (`(above + left + 2) / 4`), with a single neighbor it is
    /// half of that neighbor's count rounded up, and with no neighbor it is 0. The
    /// context is then mapped through `NON_ZERO_TO_BIN`.
    ///
    /// `ALL_PRESENT` lets the caller assert at compile time that both neighbors are
    /// there, in which case the flags in `self` are not consulted.
    pub fn calc_num_non_zeros_7x7_context_bin<const ALL_PRESENT: bool>(
        &self,
        neighbor_data: &NeighborData,
    ) -> u8 {
        let use_above = ALL_PRESENT || self.above_present;
        let use_left = ALL_PRESENT || self.left_present;

        // a neighbor that isn't present is never read
        let above_count = if use_above {
            neighbor_data.neighbor_context_above.get_num_non_zeros()
        } else {
            0
        };

        let left_count = if use_left {
            neighbor_data.neighbor_context_left.get_num_non_zeros()
        } else {
            0
        };

        let context = match (use_above, use_left) {
            (true, true) => (above_count + left_count + 2) / 4,
            (true, false) => (above_count + 1) / 2,
            (false, true) => (left_count + 1) / 2,
            (false, false) => 0,
        };

        NON_ZERO_TO_BIN[usize::from(context)]
    }

    // calculates the average of the prior values from their corresponding value in the left, above and above/left block
    // the C++ version does one coefficient at a time, but if we do it all at the same time, the compiler vectorizes everything
    //
    // Only rows 1..8 of the result are filled in, row 0 is always left as zero. The same
    // is true for the whole result if ALL_PRESENT is false and neither the left nor the
    // above neighbor is present.
    #[inline(never)]
    pub fn calc_coefficient_context_7x7_aavg_block<const ALL_PRESENT: bool>(
        &self,
        left: &AlignedBlock,
        above: &AlignedBlock,
        above_left: &AlignedBlock,
    ) -> [u16; 64] {
        let mut best_prior = [u16x8::ZERO; 8];

        if ALL_PRESENT {
            for i in 1..8 {
                // approximate average of 3 without a divide with double the weight for left/top vs diagonal
                // (the weights are 13/32 for left and above each, and 6/32 for the diagonal)
                //
                // No need to go to 32 bits since max exponent is 11, ie 2047, so
                // (2047 + 2047) * 13 + 2047 * 6 = 65504 which still fits in 16 bits.
                // In addition, if we ever returned anything higher than 2047, it would
                // assert in the array lookup in the model.
                best_prior[i] =
                    ((left.as_i16x8(i).unsigned_abs() + above.as_i16x8(i).unsigned_abs()) * 13
                        + above_left.as_i16x8(i).unsigned_abs() * 6)
                        >> 5;
            }
        } else {
            // handle edge case :) where we are on the top or left edge
            // (left wins if both happen to be flagged as present)

            if self.left_present {
                for i in 1..8 {
                    best_prior[i] = left.as_i16x8(i).unsigned_abs();
                }
            } else if self.above_present {
                for i in 1..8 {
                    best_prior[i] = above.as_i16x8(i).unsigned_abs();
                }
            }
        }

        // reinterpret the 8 vectors of 8 lanes as a flat array of 64 values
        cast(best_prior)
    }

    // Predictor calculations in `compute_lak` are made using partial IDCT along only one dimension
    // on neighbor and current blocks row/column and finding predictor that makes current block edge
    // "almost-pixel" equal to that of neighbor block (see https://arxiv.org/abs/1704.06192, section A.2.2).
    // These 1D IDCT can be conveniently done separately for current block and neighbor one
    // storing components of predictor formula - dot products of dequantized DCT coefficients columns/rows
    // with `ICOS_BASED_8192_SCALED/_PM` (equivalent to former dot products of quantized DCT coefficients
    // with `icos_idct_edge_8192_dequantized_x/y`) - inside `NeighborSummary` of corresponding block.
    // Instead of non-continuous memory accesses to blocks we can use dequantized raster DCT coefficients
    // needed for DC prediction and apply horizontal SIMD instructions for direction along the raster order.

    // Produce current block predictors for edge DCT coefficients
    //
    // Starts from the partial predictors stored for the above block (horizontal) and
    // the left block (vertical) and subtracts the contribution of the current block.
    // Returns (horizontal predictors, vertical predictors).
    #[inline(always)]
    pub fn predict_current_edges(
        neighbors_data: &NeighborData,
        raster: &[i32x8; 8],
    ) -> (i32x8, i32x8) {
        // don't bother about DC in encoding - 0th component of ICOS_BASED_8192_SCALED is 0
        let weights: i32x8 = i32x8::from(ICOS_BASED_8192_SCALED);

        // load initial predictors data from neighborhood blocks
        let mut horizontal: [i32; 8] = neighbors_data
            .neighbor_context_above
            .get_horizontal_coef()
            .to_array();
        let mut vertical: i32x8 = neighbors_data.neighbor_context_left.get_vertical_coef();

        // index 0 is skipped, so horizontal[0] stays as loaded from the above block
        for col in 1..8 {
            let column = raster[col];

            // some extreme coefficients can cause overflows, but since this is just predictors, no need to panic
            let column_sum = (column * weights).reduce_add();
            horizontal[col] = horizontal[col].wrapping_sub(column_sum);
            vertical -= column * ICOS_BASED_8192_SCALED[col];
        }

        (i32x8::from(horizontal), vertical)
    }

    // Produce first part of edge DCT coefficients predictions for neighborhood blocks
    //
    // Returns (horizontal predictors, vertical predictors). Index 0 of the horizontal
    // predictors is always 0, whereas the vertical predictors include raster[0].
    #[inline(always)]
    pub fn predict_next_edges(raster: &[i32x8; 8]) -> (i32x8, i32x8) {
        let weights = i32x8::from(ICOS_BASED_8192_SCALED_PM);

        let mut horizontal = [0i32; 8];
        let mut vertical = ICOS_BASED_8192_SCALED_PM[0] * raster[0];

        for col in 1..8 {
            let column = raster[col];

            // produce predictions for edge DCT coefficients for the block below
            horizontal[col] = (weights * column).reduce_add();
            // and for the block to the right
            vertical += ICOS_BASED_8192_SCALED_PM[col] * column;
        }

        (i32x8::from(horizontal), vertical)
    }

    /// Calculates the context for an edge coefficient from the predictors in `pred`.
    ///
    /// Returns 0 if the neighbor required for the requested direction (above for
    /// `HORIZONTAL`, left otherwise) is not present. Otherwise the predictor selected
    /// by `coefficient_tr` is divided by the matching entry of the transposed
    /// quantization table scaled by 2^13. If that division cannot be carried out
    /// (for example because the table entry is zero) an error is returned.
    #[inline(always)]
    pub fn calc_coefficient_context8_lak<const ALL_PRESENT: bool, const HORIZONTAL: bool>(
        &self,
        qt: &QuantizationTables,
        coefficient_tr: usize,
        pred: &[i32; 8],
    ) -> Result<i32> {
        if !ALL_PRESENT {
            let neighbor_present = if HORIZONTAL {
                self.above_present
            } else {
                self.left_present
            };

            if !neighbor_present {
                return Ok(0);
            }
        }

        // horizontal predictors are selected by the upper bits of the coefficient index,
        // vertical ones by the index itself
        let pred_index = if HORIZONTAL {
            coefficient_tr >> 3
        } else {
            coefficient_tr
        };
        let best_prior: i32 = pred[pred_index];

        let divisor = (qt.get_quantization_table_transposed()[coefficient_tr] as i32) << 13;

        match best_prior.checked_div(divisor) {
            Some(context) => Ok(context),
            None => err_exit_code(
                ExitCode::UnsupportedJpegWithZeroIdct0,
                "integer overflow in coefficient context calculation",
            ),
        }
    }

    /// Predicts the DC coefficient of the current block from the edge pixels stored
    /// for the neighbouring blocks, and produces the edge pixel predictions that are
    /// returned for later blocks.
    ///
    /// * `raster_cols` - coefficients of the block, passed through `run_idct`; the DC
    ///   entry is expected to be 0 so the result is the pixel data without DC.
    /// * `q0` - divisor applied to the averaged prediction.
    ///   NOTE(review): `avgmed / q0` panics for `q0 == 0`; presumably callers reject
    ///   such tables earlier - confirm.
    /// * `neighbor_data` - source of the left/above edge pixels.
    /// * `enabled_features` - selects 16-bit or 32-bit arithmetic for the edge
    ///   extrapolation (`use_16bit_adv_predict`, `use_16bit_dc_estimate`).
    ///
    /// When neither the left nor the above neighbour is present (and `ALL_PRESENT`
    /// is false) the prediction and both uncertainties are 0.
    pub fn adv_predict_dc_pix<const ALL_PRESENT: bool>(
        &self,
        raster_cols: &[i32x8; 8],
        q0: i32,
        neighbor_data: &NeighborData,
        enabled_features: &enabled_features::EnabledFeatures,
    ) -> PredictDCResult {
        // here DC in raster_cols should be 0
        let pixels_sans_dc = run_idct(raster_cols);

        // helper functions to avoid code duplication that calculate prediction values
        //
        // Computes `a1 + (a1 - a2) / 2` per lane, with the division rounding towards 0.
        // The 16-bit variant works directly on the i16 lanes; the other variant widens
        // to i32 first and truncates the result back to i16.
        #[inline]
        fn calc_pred(a1: i16x8, a2: i16x8, is_16_bit: bool) -> i16x8 {
            if is_16_bit {
                let pixel_delta = a1 - a2;
                let half_delta = (pixel_delta - (pixel_delta >> 15)) >> 1; /* divide pixel_delta by 2 rounding towards 0 */

                a1 + half_delta
            } else {
                let a1 = i32x8::from_i16x8(a1);
                let a2 = i32x8::from_i16x8(a2);
                let pixel_delta = a1 - a2;
                let half_delta = (pixel_delta - (pixel_delta >> 31)) >> 1; /* divide pixel_delta by 2 rounding towards 0 */
                let result = a1 + half_delta;

                i16x8::from_i32x8_truncate(result)
            }
        }

        // extrapolation from vectors 0 and 1 (presumably the first two pixel rows)
        let a1 = pixels_sans_dc.as_i16x8(0);
        let a2 = pixels_sans_dc.as_i16x8(1);
        let v_pred = calc_pred(a1, a2, enabled_features.use_16bit_adv_predict);

        // same extrapolation using the strided accessors (presumably the first two columns)
        let a1 = pixels_sans_dc.from_stride(0, 8);
        let a2 = pixels_sans_dc.from_stride(1, 8);
        let h_pred = calc_pred(a1, a2, enabled_features.use_16bit_adv_predict);

        // extrapolation from vectors 7 and 6, returned as `next_edge_pixels_v`
        let a1 = pixels_sans_dc.as_i16x8(7);
        let a2 = pixels_sans_dc.as_i16x8(6);
        let next_edge_pixels_v = calc_pred(a1, a2, enabled_features.use_16bit_dc_estimate);

        // strided counterpart, returned as `next_edge_pixels_h`
        let a1 = pixels_sans_dc.from_stride(7, 8);
        let a2 = pixels_sans_dc.from_stride(6, 8);

        let next_edge_pixels_h = calc_pred(a1, a2, enabled_features.use_16bit_dc_estimate);

        let min_dc;
        let max_dc;
        let mut avg_horizontal: i32;
        let mut avg_vertical: i32;

        if ALL_PRESENT {
            // most common case where we have both left and above
            let horiz = neighbor_data.neighbor_context_left.get_horizontal_pix() - h_pred;
            let vert = neighbor_data.neighbor_context_above.get_vertical_pix() - v_pred;

            min_dc = horiz.min(vert).reduce_min();
            max_dc = horiz.max(vert).reduce_max();

            avg_horizontal = i32x8::from_i16x8(horiz).reduce_add();
            avg_vertical = i32x8::from_i16x8(vert).reduce_add();
        } else if self.left_present {
            // only the left neighbour exists: its sum stands in for both directions
            let horiz = neighbor_data.neighbor_context_left.get_horizontal_pix() - h_pred;
            min_dc = horiz.reduce_min();
            max_dc = horiz.reduce_max();

            avg_horizontal = i32x8::from_i16x8(horiz).reduce_add();
            avg_vertical = avg_horizontal;
        } else if self.above_present {
            // only the above neighbour exists: its sum stands in for both directions
            let vert = neighbor_data.neighbor_context_above.get_vertical_pix() - v_pred;
            min_dc = vert.reduce_min();
            max_dc = vert.reduce_max();

            avg_vertical = i32x8::from_i16x8(vert).reduce_add();
            avg_horizontal = avg_vertical;
        } else {
            // no neighbours at all: nothing to predict from
            return PredictDCResult {
                predicted_dc: 0,
                uncertainty: 0,
                uncertainty2: 0,
                next_edge_pixels_h,
                next_edge_pixels_v,
            };
        }

        // mean of the two lane sums; the spread between the extreme lane values
        // (scaled down by 8) is the first uncertainty
        let avgmed: i32 = (avg_vertical + avg_horizontal) >> 1;
        let uncertainty_val = ((i32::from(max_dc) - i32::from(min_dc)) >> 3) as i16;
        avg_horizontal -= avgmed;
        avg_vertical -= avgmed;

        // second uncertainty: whichever deviation from the mean has the smaller
        // magnitude (the vertical one on ties), scaled down by 8
        let mut far_afield_value = avg_vertical;
        if avg_horizontal.abs() < avg_vertical.abs() {
            far_afield_value = avg_horizontal;
        }

        let uncertainty2_val = (far_afield_value >> 3) as i16;

        return PredictDCResult {
            predicted_dc: (avgmed / q0 + 4) >> 3,
            uncertainty: uncertainty_val,
            uncertainty2: uncertainty2_val,
            next_edge_pixels_h,
            next_edge_pixels_v,
        };
    }
}


================================================
FILE: lib/src/structs/quantization_tables.rs
================================================
/*---------------------------------------------------------------------------------------------
 *  Copyright (c) Microsoft Corporation. All rights reserved.
 *  Licensed under the Apache License, Version 2.0. See LICENSE.txt in the project root for license information.
 *  This software incorporates material from third parties. See NOTICE.txt for details.
 *--------------------------------------------------------------------------------------------*/

use crate::Result;
use crate::consts::*;
use crate::helpers::*;
use crate::jpeg::jpeg_header::JpegHeader;

/// Quantization table of a single component, kept in the layouts the coder needs.
pub struct QuantizationTables {
    /// Quantization steps in raster (row-major) order: entry `coord` is taken from
    /// the source table at index `RASTER_TO_ZIGZAG[coord]`.
    quantization_table: [u16; 64],
    /// The same steps with rows and columns swapped (entry `column * 8 + row`).
    quantization_table_transposed: [u16; 64],
    // Values for discrimination between "regular" and "noise" part of
    // edge AC coefficients, used in `read/write_edge_coefficient`.
    // Calculated using approximate maximal magnitudes
    // of these coefficients `FREQ_MAX`
    min_noise_threshold: [u8; 14],
}

impl QuantizationTables {
    /// Builds the tables for `component` from the quantization table that the JPEG
    /// header assigns to it.
    pub fn new(jpeg_header: &JpegHeader, component: usize) -> Self {
        let table_index = usize::from(jpeg_header.cmp_info[component].q_table_index);
        Self::new_from_table(&jpeg_header.q_tables[table_index])
    }

    /// Builds the raster and transposed tables plus the noise thresholds from a
    /// table that is indexed through `RASTER_TO_ZIGZAG`.
    pub fn new_from_table(quantization_table: &[u16; 64]) -> Self {
        let mut raster = [0u16; 64];
        let mut transposed = [0u16; 64];

        for coord in 0..64 {
            let (pixel_row, pixel_column) = (coord / 8, coord % 8);
            let q = quantization_table[RASTER_TO_ZIGZAG[coord] as usize];

            raster[coord] = q;
            transposed[pixel_column * 8 + pixel_row] = q;
        }

        let mut min_noise_threshold = [0u8; 14];
        for (i, threshold) in min_noise_threshold.iter_mut().enumerate() {
            // the first 7 entries walk the first row (columns 1..=7),
            // the last 7 walk the first column (rows 1..=7)
            let coord = if i < 7 { i + 1 } else { (i - 6) * 8 };
            let q = raster[coord];
            if q >= 9 {
                // thresholds are only set for quantization steps below 9
                continue;
            }

            // ceil(FREQ_MAX / q), leaving the numerator untouched for a zero step
            let numerator = FREQ_MAX[i] + q - 1;
            let freq_max = if q != 0 { numerator / q } else { numerator };

            let max_len = u16_bit_length(freq_max);
            if max_len > RESIDUAL_NOISE_FLOOR as u8 {
                *threshold = max_len - RESIDUAL_NOISE_FLOOR as u8;
            }
        }

        QuantizationTables {
            quantization_table: raster,
            quantization_table_transposed: transposed,
            min_noise_threshold,
        }
    }

    /// constructs the quantization table based on the jpeg header
    ///
    /// One entry is produced per component (`jpeg_header.cmpc` entries in total).
    pub fn construct_quantization_tables(
        jpeg_header: &JpegHeader,
    ) -> Result<Vec<QuantizationTables>> {
        let quantization_tables = (0..jpeg_header.cmpc)
            .map(|component| QuantizationTables::new(jpeg_header, component))
            .collect();

        Ok(quantization_tables)
    }

    /// Quantization steps in raster order.
    pub fn get_quantization_table(&self) -> &[u16; 64] {
        &self.quantization_table
    }

    /// Quantization steps with rows and columns swapped.
    pub fn get_quantization_table_transposed(&self) -> &[u16; 64] {
        &self.quantization_table_transposed
    }

    /// Noise threshold for edge coefficient `coef` (index into the 14-entry table).
    pub fn get_min_noise_threshold(&self, coef: usize) -> u8 {
        self.min_noise_threshold[coef]
    }
}


================================================
FILE: lib/src/structs/simple_hash.rs
================================================
/*---------------------------------------------------------------------------------------------
 *  Copyright (c) Microsoft Corporation. All rights reserved.
 *  Licensed under the Apache License, Version 2.0. See LICENSE.txt in the project root for license information.
 *  This software incorporates material from third parties. See NOTICE.txt for details.
 *--------------------------------------------------------------------------------------------*/

#![allow(dead_code)]

use std::num::Wrapping;

/// used for debugging when there are divergences between encoder and decoder
///
/// The hash is a running 64-bit value: every call to [`SimpleHash::hash`] multiplies
/// the state by 13 and adds the new value, with wrapping arithmetic.
#[derive(Debug, Clone, Default)]
pub struct SimpleHash {
    hash: u64,
}

/// Conversion of a value into the 64-bit word that is mixed into a [`SimpleHash`].
pub trait SimpleHashProvider {
    /// Returns the value as a `u64`.
    fn get_u64(&self) -> u64;
}

impl SimpleHashProvider for i32 {
    fn get_u64(&self) -> u64 {
        // deliberate sign-extending cast: negative values map to the upper u64 range
        *self as u64
    }
}

impl SimpleHashProvider for u32 {
    fn get_u64(&self) -> u64 {
        u64::from(*self)
    }
}

impl SimpleHashProvider for u64 {
    fn get_u64(&self) -> u64 {
        *self
    }
}

impl SimpleHash {
    /// Creates a hash with an initial state of 0 (same as `SimpleHash::default()`).
    pub fn new() -> Self {
        Self::default()
    }

    /// Mixes `v` into the running hash: `state = state * 13 + v`, wrapping on overflow.
    pub fn hash<T: SimpleHashProvider>(&mut self, v: T) {
        self.hash = (Wrapping(self.hash) * Wrapping(13u64) + Wrapping(v.get_u64())).0;
    }

    /// Returns the low 32 bits of the running hash.
    pub fn get(&self) -> u32 {
        // truncation is intended: only the low half is reported
        self.hash as u32
    }
}


================================================
FILE: lib/src/structs/simple_threadpool.rs
================================================
/// A simple thread pool implementation that can be used to evaluate closures on separate threads.
///
/// The pool will keep a number of threads equal to the number of CPUs available on the system, and
/// will reuse threads that are idle.
///
/// If more tasks are submitted than there are threads, the pool will spawn new threads to handle
/// the extra tasks.
///
/// Why write yet another threadpool? There wasn't one that supported dynamically growing
/// the threadpool (rayon and tokio are all fixed), which is important since otherwise there is
/// unpredictable latency when the number of tasks submitted is greater than the number of threads.
///
/// No unsafe code is used.
use std::{
    sync::{
        Arc, LazyLock, Mutex,
        mpsc::{Sender, channel},
    },
    thread::{self, spawn},
};

/// A trait that defines the interface for a Lepton thread pool.
/// It has a simple fire-and-forget interface, which is sufficient for the current use cases,
/// but also requires the thread pool to be static, since we don't require the thread
/// to return within a specific lifetime.
pub trait LeptonThreadPool {
    /// Returns the maximum parallelism supported by the thread pool.
    fn max_parallelism(&self) -> usize;
    /// Runs a closure on a thread from the thread pool. The lifetime of that
    /// thread is not specified, so the closure must be `'static`. No result is
    /// returned to the caller.
    fn run(&self, f: Box<dyn FnOnce() + Send + 'static>);
}

/// Holds either a reference to a LeptonThreadPool or an owned Box<dyn LeptonThreadPool>.
///
/// This is useful for APIs that want to accept either a reference to a static or global thread pool
/// or an owned thread pool.
///
/// The holder itself implements `LeptonThreadPool` by forwarding every call to
/// the pool it wraps.
pub enum ThreadPoolHolder<'a> {
    /// Reference to a LeptonThreadPool that must outlive `'a`
    Dyn(&'a dyn LeptonThreadPool),
    /// Owned Box<dyn LeptonThreadPool>
    Owned(Box<dyn LeptonThreadPool>),
}

impl ThreadPoolHolder<'_> {
    /// Returns the wrapped pool, regardless of whether it is borrowed or owned.
    fn wrapped_pool(&self) -> &dyn LeptonThreadPool {
        match self {
            Self::Dyn(borrowed) => *borrowed,
            Self::Owned(boxed) => &**boxed,
        }
    }
}

/// Forwards every call to the wrapped pool.
impl LeptonThreadPool for ThreadPoolHolder<'_> {
    fn max_parallelism(&self) -> usize {
        self.wrapped_pool().max_parallelism()
    }

    fn run(&self, f: Box<dyn FnOnce() + Send + 'static>) {
        self.wrapped_pool().run(f)
    }
}

/// Priority levels for threads in the thread pool.
///
/// `SimpleThreadPool` applies the priority when it spawns a worker thread, and
/// only on Windows and Linux (the call is compiled out elsewhere).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
pub enum LeptonThreadPriority {
    /// Low priority thread (requested as `thread_priority::ThreadPriority::Min`)
    Low,
    /// Normal priority thread, we don't touch the priority of these threads.
    #[default]
    Normal,
    /// High priority thread (requested as `thread_priority::ThreadPriority::Max`)
    High,
}

/// A simple thread pool that spawns threads on demand and reuses them for executing closures.
/// There is no limit on the number of threads, but the number of idle threads is limited to the number of CPUs available.
#[derive(Default)]
pub struct SimpleThreadPool {
    /// Priority requested for every worker thread this pool spawns.
    priority: LeptonThreadPriority,
    /// One channel sender per worker that is currently waiting for work; sending a
    /// closure through a sender wakes that worker. Created lazily so the pool can
    /// be constructed in a `const` context.
    idle_threads: LazyLock<Arc<Mutex<Vec<Sender<Box<dyn FnOnce() + Send + 'static>>>>>>,
}

impl SimpleThreadPool {
    /// Creates a new thread pool with the specified priority.
    ///
    /// This is a `const fn` so a pool can live in a `static`; the idle-thread list
    /// is only allocated on first use.
    pub const fn new(priority: LeptonThreadPriority) -> Self {
        SimpleThreadPool {
            priority,
            idle_threads: LazyLock::new(|| Arc::new(Mutex::new(Vec::new()))),
        }
    }

    /// Returns the number of idle threads in the thread pool.
    ///
    /// # Panics
    /// Panics if the idle-thread mutex has been poisoned.
    #[allow(dead_code)]
    pub fn get_idle_threads(&self) -> usize {
        self.idle_threads.lock().unwrap().len()
    }

    /// Executes a closure on a thread from the thread pool. Does not block or return any result.
    ///
    /// An idle worker is reused when one is available; otherwise a new thread is
    /// spawned. After finishing its work a thread parks itself in the idle list,
    /// unless that list already holds as many threads as there are CPUs, in which
    /// case the thread exits.
    fn execute<F>(&self, f: F)
    where
        F: FnOnce() + Send + 'static,
    {
        // Pop in a statement of its own so the mutex guard is released before the
        // work is handed over or a thread is spawned.
        let idle_sender = self.idle_threads.lock().unwrap().pop();

        if let Some(sender) = idle_sender {
            // A worker only advertises its sender right before blocking on `recv`
            // and keeps its own copy of the sender alive, so the receiver exists.
            sender
                .send(Box::new(f))
                .expect("idle worker thread dropped its receiver");
            return;
        }

        // channel for receiving future work on this thread
        let (tx_schedule, rx_schedule) = channel();

        let priority = self.priority;
        let idle_threads = self.idle_threads.clone();

        spawn(move || {
            #[cfg(any(target_os = "windows", target_os = "linux"))]
            {
                let requested = match priority {
                    LeptonThreadPriority::Low => Some(thread_priority::ThreadPriority::Min),
                    LeptonThreadPriority::Normal => None,
                    LeptonThreadPriority::High => Some(thread_priority::ThreadPriority::Max),
                };

                if let Some(requested) = requested {
                    // Best effort: the priority is only a scheduling hint. Failing to
                    // set it (e.g. missing privileges) must not panic the worker,
                    // because that would silently drop the submitted closure.
                    let _ = thread_priority::set_current_thread_priority(requested);
                }
            }
            #[cfg(not(any(target_os = "windows", target_os = "linux")))]
            let _ = priority;

            f();

            loop {
                match idle_threads.lock() {
                    Ok(mut idle) => {
                        // stick back into list of idle threads unless it already
                        // holds one thread per cpu
                        if idle.len() >= *NUM_CPUS {
                            // just exits the thread
                            break;
                        }
                        idle.push(tx_schedule.clone());
                    }
                    // poisoned mutex: nothing sensible left to do, exit thread
                    Err(_) => break,
                }

                match rx_schedule.recv() {
                    Ok(next) => next(),
                    // channel broken, exit thread
                    Err(_) => break,
                }
            }
        });
    }
}

/// A default instance of the `SimpleThreadPool` that can be used for encoding and decoding operations.
pub static DEFAULT_THREAD_POOL: SimpleThreadPool =
    SimpleThreadPool::new(LeptonThreadPriority::Normal);

/// `LeptonThreadPool` implementation backed by the on-demand worker threads of
/// `SimpleThreadPool`.
impl LeptonThreadPool for SimpleThreadPool {
    /// Reports the value of `NUM_CPUS`.
    fn max_parallelism(&self) -> usize {
        *NUM_CPUS
    }
    /// Hands the closure to `execute`, which does not block and returns no result.
    fn run(&self, f: Box<dyn FnOnce() + Send + 'static>) {
        self.execute(f);
    }
}

/// Parallelism reported by the standard library, queried once on first use.
/// Falls back to 1 when the platform cannot report its parallelism instead of
/// panicking inside the lazy initializer.
static NUM_CPUS: LazyLock<usize> =
    LazyLock::new(|| thread::available_parallelism().map_or(1, |n| n.get()));

/// Submits 100 small tasks to the default pool and spins until all of them ran.
#[test]
fn test_threadpool() {
    use std::sync::Arc;
    use std::sync::atomic::{AtomicU32, Ordering};

    const TASK_COUNT: u32 = 100;

    let completed: Arc<AtomicU32> = Arc::new(AtomicU32::new(0));

    (0..TASK_COUNT).for_each(|_| {
        let counter = Arc::clone(&completed);
        DEFAULT_THREAD_POOL.execute(move || {
            counter.fetch_add(1, Ordering::AcqRel);
        });
    });

    // busy-wait (yielding the time slice) until every task has bumped the counter
    while completed.load(Ordering::Acquire) < TASK_COUNT {
        thread::yield_now();
    }

    println!("Idle threads: {}", DEFAULT_THREAD_POOL.get_idle_threads());
}

/// A thread pool that doesn't create any threads.
///
/// It reports a maximum parallelism of 1 and must never be asked to run a closure.
#[derive(Default)]
pub struct SingleThreadPool {}

impl LeptonThreadPool for SingleThreadPool {
    /// Always 1.
    fn max_parallelism(&self) -> usize {
        1
    }
    /// Not supported: always panics. Callers are expected to execute the work
    /// directly instead of submitting it.
    fn run(&self, _f: Box<dyn FnOnce() + Send + 'static>) {
        panic!("SingleThreadPool does not support run; execute directly instead");
    }
}


================================================
FILE: lib/src/structs/thread_handoff.rs
================================================
/*---------------------------------------------------------------------------------------------
 *  Copyright (c) Microsoft Corporation. All rights reserved.
 *  Licensed under the Apache License, Version 2.0. See LICENSE.txt in the project root for license information.
 *  This software incorporates material from third parties. See NOTICE.txt for details.
 *--------------------------------------------------------------------------------------------*/

use std::io::{Read, Result, Write};

use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt};

use crate::consts::COLOR_CHANNEL_NUM_BLOCK_TYPES;

/// Per-segment state stored in the file header so that each segment can be
/// processed separately.
#[derive(Debug, Clone, PartialEq)]
pub struct ThreadHandoff {
    /// First luma row of the segment (serialized as a 16-bit value).
    pub luma_y_start: u32,
    /// Set to the `luma_y_start` of the following segment after deserialization;
    /// not serialized, and left for the caller to fill in on the last segment.
    pub luma_y_end: u32,
    /// Offset of the segment in the file; not serialized.
    pub segment_offset_in_file: u32,
    /// Size of the segment (serialized as a 32-bit value).
    pub segment_size: u32,
    /// Serialized as a single byte.
    pub overhang_byte: u8,
    /// Serialized as a single byte.
    pub num_overhang_bits: u8,
    /// Only the first `COLOR_CHANNEL_NUM_BLOCK_TYPES` entries are read and written;
    /// the remaining slots are zero padding in the serialized form.
    pub last_dc: [i16; 4],
}

impl ThreadHandoff {
    /// Reads `num_threads` handoff records from `data`.
    ///
    /// `luma_y_end` of every record except the last is set to the `luma_y_start`
    /// of its successor; the last one is left at 0 for the caller to fill in.
    /// `segment_offset_in_file` is not part of the serialized form and stays 0.
    pub fn deserialize<R: Read>(num_threads: u8, data: &mut R) -> Result<Vec<ThreadHandoff>> {
        let mut handoffs: Vec<ThreadHandoff> = Vec::with_capacity(usize::from(num_threads));

        for _ in 0..num_threads {
            // fields are read in the order they appear in the stream
            let luma_y_start = u32::from(data.read_u16::<LittleEndian>()?);
            let segment_size = data.read_u32::<LittleEndian>()?;
            let overhang_byte = data.read_u8()?;
            let num_overhang_bits = data.read_u8()?;

            let mut last_dc = [0i16; 4];
            for channel in 0..COLOR_CHANNEL_NUM_BLOCK_TYPES {
                last_dc[channel] = data.read_i16::<LittleEndian>()?;
            }
            // skip the padding slots that bring the record up to 4 DC values
            for _ in COLOR_CHANNEL_NUM_BLOCK_TYPES..4 {
                data.read_u16::<LittleEndian>()?;
            }

            handoffs.push(ThreadHandoff {
                luma_y_start,
                luma_y_end: 0,             // filled in later
                segment_offset_in_file: 0, // not serialized
                segment_size,
                overhang_byte,
                num_overhang_bits,
                last_dc,
            });
        }

        for next in 1..handoffs.len() {
            let next_start = handoffs[next].luma_y_start;
            handoffs[next - 1].luma_y_end = next_start;
        }

        // last LumaYEnd is not serialized, filled in later
        Ok(handoffs)
    }

    pub fn serialize<W: Write>(data: &Vec<ThreadHandoff>, retval: &mut W) -> Result<()> {
        retval.write_u8(data.len() as u8)?;

        for th in data {
            retval.write_u16::<LittleEndian>(th.luma_y_start as u16)?;
            // SegmentOffsetInFile is not serialized to preserve compatibility with original Lepton format
            retval.write_i32::<LittleEndian>(th.segment_size as i32)?;
            retval.write_u8(th.overhang_byte)?;
            retval.write_u8(th.num_overhang_bits)?;

            for i in 0..COLOR_CHANNEL_NUM_BLOCK_TYPES {
                retval.write_i16::<LittleEndian>(th.last_dc[i])?;
            }
            for _i in COLOR_CHANNEL_NUM_BLOCK_TYPES..4 {
                retval.write_u16::<LittleEndian>(0)?;
            }
        }

        return Ok(());
    }

    // Size in bytes of the range that starts at the beginning of the "from" segment
    // and runs until the end of the "to" segment, i.e. the closed range [from, to].
    pub fn get_combine_thread_range_segment_size(
        from: &ThreadHandoff,
        to: &ThreadHandoff,
    ) -> usize {
        // distance between the two segment starts, then the length of the last segment
        let start_distance = to.segment_offset_in_file - from.segment_offset_in_file;
        let combined = start_distance + to.segment_size;

        combined as usize
    }

    /// Merges the closed range of segments [from, to] into a single handoff.
    ///
    /// Everything that describes the start of the range (offset, first luma row,
    /// overhang, last DC values) comes from `from`; the end row comes from `to` and
    /// the size covers both segments and whatever lies between them.
    pub fn combine_thread_ranges(from: &ThreadHandoff, to: &ThreadHandoff) -> ThreadHandoff {
        let combined_size = ThreadHandoff::get_combine_thread_range_segment_size(from, to) as u32;

        ThreadHandoff {
            luma_y_start: from.luma_y_start,
            luma_y_end: to.luma_y_end,
            segment_offset_in_file: from.segment_offset_in_file,
            segment_size: combined_size,
            overhang_byte: from.overhang_byte,
            num_overhang_bits: from.num_overhang_bits,
            last_dc: from.last_dc,
        }
    }
}


================================================
FILE: lib/src/structs/vpx_bool_reader.rs
================================================
/*---------------------------------------------------------------------------------------------
 *  Copyright (c) Microsoft Corporation. All rights reserved.
 *  Licensed under the Apache License, Version 2.0. See LICENSE.txt in the project root for license information.
 *  This software incorporates material from third parties. See NOTICE.txt for details.
 *--------------------------------------------------------------------------------------------*/

/*
 *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE banner below
 *  An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the VPX_AUTHORS file in this directory
 */
/*
Copyright (c) 2010, Google Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
Neither the name of Google nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS “AS IS” AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

use std::io::{Read, Result};

use crate::lepton_error;
use crate::lepton_error::{ExitCode, err_exit_code};
use crate::metrics::{Metrics, ModelComponent};
use crate::structs::branch::Branch;
use crate::structs::simple_hash::SimpleHash;

const BITS_IN_BYTE: u32 = 8;
const BITS_IN_VALUE: u32 = 64;
const BITS_IN_VALUE_MINUS_LAST_BYTE: u32 = BITS_IN_VALUE - BITS_IN_BYTE;
const VALUE_MASK: u64 = (1 << BITS_IN_VALUE_MINUS_LAST_BYTE) - 1;

/// Decoder side of the VP8-style boolean arithmetic coder, pulling bytes from
/// an upstream reader of type `R`.
pub struct VPXBoolReader<R> {
    /// Buffered stream bits (most significant first) followed by a single set
    /// guard bit that marks the end of the valid bits.
    value: u64,
    range: u64, // 128 << BITS_IN_VALUE_MINUS_LAST_BYTE <= range <= 255 << BITS_IN_VALUE_MINUS_LAST_BYTE
    /// Source of the compressed bytes.
    upstream_reader: R,
    /// Statistics that are handed out by `drain_stats`.
    model_statistics: Metrics,
    /// Running hash of the decoder state, only updated with the
    /// `detailed_tracing` feature.
    #[allow(dead_code)]
    pub hash: SimpleHash,
}

impl<R: Read> VPXBoolReader<R> {
    /// Creates a reader on top of `reader` and consumes the leading marker bit.
    ///
    /// # Errors
    /// Fails with `ExitCode::StreamInconsistent` when the marker bit decodes as
    /// true, and passes on any error raised while reading from `reader`.
    pub fn new(reader: R) -> lepton_error::Result<Self> {
        let mut bool_reader = VPXBoolReader {
            upstream_reader: reader,
            value: 1 << (BITS_IN_VALUE - 1), // guard bit
            range: 255 << BITS_IN_VALUE_MINUS_LAST_BYTE,
            model_statistics: Metrics::default(),
            hash: SimpleHash::new(),
        };

        // marker false bit
        let mut dummy_branch = Branch::new();
        if bool_reader.get_bit(&mut dummy_branch, ModelComponent::Dummy)? {
            return err_exit_code(ExitCode::StreamInconsistent, "StreamInconsistent");
        }

        Ok(bool_reader)
    }

    /// Hands out the statistics collected so far by calling `drain` on the
    /// internal `Metrics`.
    pub fn drain_stats(&mut self) -> Metrics {
        self.model_statistics.drain()
    }

    // Lepton uses VP8 adaptive arithmetic coding scheme, where bits are extracted from file stream
    // by "division" of current 8-bit stream `value` by adaptive 8-bit `split`. Adaptation is achieved by
    // combination of predicted probability to get false bit (`1 <= probability <= 255`, in 1/256 units),
    // and `range` that represents maximum possible value of yet-not-decoded stream part (so that
    // `range > value`, `128 <= range <= 256` in units of $2^{-n-8}$ for the `n` bits already consumed)
    // by forming predictor `split = 1 + (((range - 1) * probability) >> BITS_IN_BYTE)`,
    // `1 <= split <= range - 1`. Comparison of predictor with stream gives the next decoded bit:
    // true for `value >= split` and false otherwise - this is effectively division step.
    // After this we shrink `value` and `range` by `split` for true or shrink `range` to `split`
    // for false and update `probability`. Now `range` can get out of allowable range and we restore it
    // by shifting left both `range` and `value` with corresponding filling of `value` by further
    // stream bits (it corresponds to bring down new digit in division, and since `range > value` is invariant
    // of the operations, shifted out `value` bits are guaranteed to be 0). Repeat until stream ends.
    //
    // Reference: https://datatracker.ietf.org/doc/html/rfc6386#section-7.
    //
    // Here some improvements to the basic scheme are implemented. First, we store more stream bits
    // in `value` to reduce refill rate, so that 8 MSBs of `value` represent `value` of the scheme
    // (it was already implemented in DropBox version, however, with shorter 16-bit `value`).
    // Second, `range` and `split` are also stored in 8 MSBs of the same size variables (it is new
    // and it allows to reduce number of operations to compute `split` - previously `big_split` -
    // and to update `range` and `shift`). Third, we use local values for all stream state variables
    // to reduce number of memory load/store operations in decoding of many-bit values. Fourth,
    // we use in `value` a set bit after the stream bits as a guard - completely getting rid
    // of bit counter and not changing comparison result `value >= split`.
    /// Decodes one bit using the probability stored in `branch`.
    ///
    /// The coder state is passed in and out through `tmp_value` / `tmp_range` so that
    /// callers can keep it in locals across several bits; the caller is responsible
    /// for refilling `tmp_value` beforehand and for writing the state back to `self`.
    /// `branch` is updated with the decoded bit. `_cmp` is only used when the
    /// `compression_stats` feature is enabled.
    #[inline(always)]
    pub fn get(
        &mut self,
        branch: &mut Branch,
        tmp_value: &mut u64,
        tmp_range: &mut u64,
        _cmp: ModelComponent,
    ) -> bool {
        let probability = branch.get_probability() as u64;

        let split = mul_prob(*tmp_range, probability);

        // So optimizer understands that 0 should never happen and uses a cold jump
        // if we don't have LZCNT on x86 CPUs (older BSR instruction requires check for zero).
        // This is better since the branch prediction figures quickly this never happens and can run
        // the code sequentially.
        #[cfg(all(
            not(target_feature = "lzcnt"),
            any(target_arch = "x86", target_arch = "x86_64")
        ))]
        assert!(*tmp_range - split > 0);

        // true when the stream value lies at or above the split point
        let bit = *tmp_value >= split;

        branch.record_and_update_bit(bit);

        if bit {
            // keep the upper part of the interval
            *tmp_range -= split;
            *tmp_value -= split;
        } else {
            // keep the lower part of the interval
            *tmp_range = split;
        }

        // renormalize: shift until the top bit of the range is set again
        let shift = (*tmp_range).leading_zeros();

        *tmp_value <<= shift;
        *tmp_range <<= shift;

        #[cfg(feature = "compression_stats")]
        {
            self.model_statistics
                .record_compression_stats(_cmp, 1, i64::from(shift));
        }

        #[cfg(feature = "detailed_tracing")]
        {
            self.hash.hash(branch.get_u64());
            self.hash.hash(*tmp_value);
            self.hash.hash(*tmp_range);

            let hash = self.hash.get();
            //if hash == 0x88f9c945
            {
                print!("({0}:{1:x})", bit as u8, hash);
                if hash % 8 == 0 {
                    println!();
                }
            }
        }

        bit
    }

    /// Decodes `log2(A)` bits by walking a binary tree of branches and returns the
    /// resulting value in `0..A`.
    ///
    /// The walk starts at node 1; after every bit the node index is doubled and the
    /// decoded bit appended, so the final index is `A + value`.
    #[inline(always)]
    pub fn get_grid<const A: usize>(
        &mut self,
        branches: &mut [Branch; A],
        _cmp: ModelComponent,
    ) -> Result<usize> {
        // check if A is a power of 2
        debug_assert!((A & (A - 1)) == 0);

        // We can read only each 7-th iteration: minimum 56 bits are in `value` after `vpx_reader_fill`,
        // and one `get` needs 8 bits but consumes at most 7 bits (with `range` coming from >127 to 1).
        // As Lepton uses only 3 and 6 iterations, we can read only once.
        debug_assert!(A <= 128);

        let mut range = self.range;
        let mut value = Self::vpx_reader_fill(self.value, &mut self.upstream_reader)?;

        let mut node: usize = 1;
        for _ in 0..A.ilog2() {
            let bit = self.get(&mut branches[node], &mut value, &mut range, _cmp);
            node = (node << 1) | usize::from(bit);
        }

        self.value = value;
        self.range = range;

        // remove set leading bit
        Ok(node ^ A)
    }

    /// Decodes a unary-coded value with at most `A` bits.
    ///
    /// Bits are decoded with `branches[0]`, `branches[1]`, ... until the first false
    /// bit; the index of that bit is returned. If all `A` bits are true, `A` is
    /// returned. The coder state is kept in locals and written back to `self` on
    /// both exits. The bit decoding is the same sequence of steps as in `get`,
    /// written out separately for the true and the false case.
    #[inline(always)]
    pub fn get_unary_encoded<const A: usize>(
        &mut self,
        branches: &mut [Branch; A],
        _cmp: ModelComponent,
    ) -> Result<usize> {
        let mut tmp_value = self.value;
        let mut tmp_range = self.range;

        for value in 0..A {
            let split = mul_prob(tmp_range, branches[value].get_probability() as u64);

            // We know that after this we have min 56 stream bits in `tmp_value`,
            // and can have at least 7 iterations, so we can decode 7 bits at once.
            // Each iteration needs at least 8 bits of stream in `tmp_value` and
            // consumes max 7 of them.
            debug_assert!(A <= 14);
            if value == 0 || value == 7 {
                tmp_value = Self::vpx_reader_fill(tmp_value, &mut self.upstream_reader)?;
            }

            if tmp_value >= split {
                // true bit: keep the upper part of the interval and continue
                branches[value].record_and_update_bit(true);

                tmp_range -= split;
                tmp_value -= split;

                let shift = tmp_range.leading_zeros();

                tmp_value <<= shift;
                tmp_range <<= shift;

                #[cfg(feature = "compression_stats")]
                {
                    self.model_statistics
                        .record_compression_stats(_cmp, 1, i64::from(shift));
                }

                #[cfg(feature = "detailed_tracing")]
                {
                    self.hash.hash(branches[value].get_u64());
                    self.hash.hash(tmp_value);
                    self.hash.hash(tmp_range);

                    let hash = self.hash.get();
                    //if hash == 0x88f9c945
                    {
                        print!("({0}:{1:x})", true as u8, hash);
                        if hash % 8 == 0 {
                            println!();
                        }
                    }
                }
            } else {
                // false bit: keep the lower part of the interval and stop
                branches[value].record_and_update_bit(false);

                tmp_range = split;

                let shift = tmp_range.leading_zeros();

                tmp_value <<= shift;
                tmp_range <<= shift;

                #[cfg(feature = "compression_stats")]
                {
                    self.model_statistics
                        .record_compression_stats(_cmp, 1, i64::from(shift));
                }

                #[cfg(feature = "detailed_tracing")]
                {
                    self.hash.hash(branches[value].get_u64());
                    self.hash.hash(tmp_value);
                    self.hash.hash(tmp_range);

                    let hash = self.hash.get();
                    //if hash == 0x88f9c945
                    {
                        print!("({0}:{1:x})", false as u8, hash);
                        if hash % 8 == 0 {
                            println!();
                        }
                    }
                }

                self.value = tmp_value;
                self.range = tmp_range;

                return Ok(value);
            }
        }

        // all `A` bits were true
        self.value = tmp_value;
        self.range = tmp_range;

        Ok(A)
    }

    /// Decodes an `n`-bit value, most significant bit first, where bit `i` is
    /// decoded with `branches[i]`.
    ///
    /// # Panics
    /// Panics if `n` exceeds the number of branches.
    #[inline(always)]
    pub fn get_n_bits<const A: usize>(
        &mut self,
        n: usize,
        branches: &mut [Branch; A],
        _cmp: ModelComponent,
    ) -> Result<usize> {
        assert!(n <= branches.len());

        let mut value = self.value;
        let mut range = self.range;

        let mut coef: usize = 0;
        for bit_index in (0..n).rev() {
            // Here the fastest way is to use condition of `get_bit`, presumably as
            // this loop cannot be unrolled due to variable iterations number.
            // Moreover, this condition holds very rarely as `value` is usually already filled
            // by previous `get_bit` sign reading.
            if value & VALUE_MASK == 0 {
                value = Self::vpx_reader_fill(value, &mut self.upstream_reader)?;
            }

            let bit = self.get(&mut branches[bit_index], &mut value, &mut range, _cmp);
            coef |= usize::from(bit) << bit_index;
        }

        self.value = value;
        self.range = range;

        Ok(coef)
    }

    /// Decodes a single bit with `branch`, refilling the bit buffer first when needed.
    #[inline(always)]
    pub fn get_bit(&mut self, branch: &mut Branch, _cmp: ModelComponent) -> Result<bool> {
        let mut range = self.range;

        // We ensure that the guard bit never comes into the first byte,
        // thus having in `value` at least 8 stream bits.
        let mut value = if self.value & VALUE_MASK == 0 {
            Self::vpx_reader_fill(self.value, &mut self.upstream_reader)?
        } else {
            self.value
        };

        let bit = self.get(branch, &mut value, &mut range, _cmp);

        self.value = value;
        self.range = range;

        Ok(bit)
    }

    // Fill `tmp_value` maximally still preserving space for the guard bit,
    // after this returned value has `56 | (63 - shift)` stream bits
    //
    // Nothing is read when the low byte of `tmp_value` is non-zero (the guard bit
    // sits in the low byte, so there is no room for another whole byte). At end of
    // input the remaining bits are left as zeros. A read that fails with
    // `ErrorKind::Interrupted` is retried, as the `Read` contract asks for; every
    // other error is returned to the caller.
    #[inline(always)]
    fn vpx_reader_fill(mut tmp_value: u64, upstream_reader: &mut R) -> Result<u64> {
        // This `if` does not change performance but drops down instructions count by 3 %
        if tmp_value & 0xFF == 0 {
            // position of the guard bit = number of free bits below the stream bits
            let mut shift: i32 = tmp_value.trailing_zeros() as i32;
            // Unset the last guard bit and set a new one
            tmp_value &= tmp_value - 1;
            tmp_value |= 1 << (shift & 7);

            // BufReader is already pretty efficient handling small reads, so optimization doesn't help that much
            let mut v = [0u8; 1];
            // bit position of the lowest bit of the first byte to insert
            shift -= 7;

            while shift > 0 {
                let bytes_read = match upstream_reader.read(&mut v) {
                    Ok(n) => n,
                    // non-fatal by contract: retry the same position
                    Err(e) if e.kind() == std::io::ErrorKind::Interrupted => continue,
                    Err(e) => return Err(e),
                };
                if bytes_read == 0 {
                    // end of input
                    break;
                }

                tmp_value |= (v[0] as u64) << shift;
                shift -= 8;
            }
        }

        Ok(tmp_value)
    }
}

/// Scales `tmp_range` by `probability` in the fixed-point layout used by the reader:
/// the offset `1 << BITS_IN_VALUE_MINUS_LAST_BYTE` is removed, the remainder is divided
/// by 256 and multiplied by `probability`, only the byte starting at bit
/// `BITS_IN_VALUE_MINUS_LAST_BYTE` is kept, and the offset is added back.
fn mul_prob(tmp_range: u64, probability: u64) -> u64 {
    let unit = 1u64 << BITS_IN_VALUE_MINUS_LAST_BYTE;
    let byte_mask = 0xFFu64 << BITS_IN_VALUE_MINUS_LAST_BYTE;

    let scaled = ((tmp_range - unit) >> 8) * probability;

    (scaled & byte_mask) + unit
}


================================================
FILE: lib/src/structs/vpx_bool_writer.rs
================================================
/*---------------------------------------------------------------------------------------------
 *  Copyright (c) Microsoft Corporation. All rights reserved.
 *  Licensed under the Apache License, Version 2.0. See LICENSE.txt in the project root for license information.
 *  This software incorporates material from third parties. See NOTICE.txt for details.
 *--------------------------------------------------------------------------------------------*/

/*
 *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE banner below
 *  An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the VPX_AUTHORS file in this directory
 */
/*
Copyright (c) 2010, Google Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
Neither the name of Google nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS “AS IS” AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

use std::io::{Result, Write};

use crate::helpers::needs_to_grow;
use crate::metrics::{Metrics, ModelComponent};
use crate::structs::branch::Branch;
use crate::structs::simple_hash::SimpleHash;

/// Boolean arithmetic encoder that buffers its output in memory and hands it to a
/// wrapped writer `W` from `finish` and `flush_non_final_data`.
pub struct VPXBoolWriter<W> {
    /// Pending low end of the coding interval. It also carries a marker ("divider") bit,
    /// initialised at `1 << 9`, whose position tracks how many stream bits are pending.
    low_value: u64,
    /// Current width of the coding interval; starts at 255 and is renormalised by `put`.
    range: u32,
    /// Destination that receives the encoded bytes.
    writer: W,
    /// Encoded bytes that have not been written out yet; `carry` may still modify them.
    buffer: Vec<u8>,
    /// Statistics returned by `drain_stats`; only updated by `put` when the
    /// `compression_stats` feature is enabled.
    model_statistics: Metrics,
    /// Running hash fed by `put` only when the `detailed_tracing` feature is enabled.
    #[allow(dead_code)]
    pub hash: SimpleHash,
}

impl<W: Write> VPXBoolWriter<W> {
    pub fn new(writer: W) -> Result<Self> {
        let mut retval = VPXBoolWriter {
            low_value: 1 << 9, // this divider bit keeps track of stream bits number
            range: 255,
            buffer: Vec::new(),
            writer: writer,
            model_statistics: Metrics::default(),
            hash: SimpleHash::new(),
        };

        let mut dummy_branch = Branch::new();
        // initial false bit is put to not get carry out of stream bits
        retval.put_bit(false, &mut dummy_branch, ModelComponent::Dummy)?;

        Ok(retval)
    }

    /// Returns the statistics accumulated in `model_statistics` by calling `Metrics::drain`
    /// (presumably this also resets the internal counters — confirm against `Metrics`).
    pub fn drain_stats(&mut self) -> Metrics {
        self.model_statistics.drain()
    }

    /// Encodes one `bit` with the probability held by `branch` and returns the updated
    /// `(value, range)` pair.
    ///
    /// The coder state is passed in and out by value (`tmp_value`, `tmp_range`) rather than
    /// read from `self`, so callers that encode several bits in a row can keep it in locals
    /// and store it back to `self.low_value` / `self.range` once at the end.
    ///
    /// Steps:
    /// 1. compute `split = 1 + (((tmp_range - 1) * probability) >> 8)`;
    /// 2. update `branch` with the observed bit;
    /// 3. a `true` bit adds `split` to the value and removes it from the range, a `false`
    ///    bit shrinks the range to `split`;
    /// 4. renormalise by shifting both left until the top bit of the range's low byte is set;
    /// 5. once the value reaches bit 57 or above, move six bytes into `self.buffer`
    ///    (propagating a carry into already buffered bytes if needed).
    #[inline(always)]
    pub fn put(
        &mut self,
        bit: bool,
        branch: &mut Branch,
        mut tmp_value: u64,
        mut tmp_range: u32,
        _cmp: ModelComponent,
    ) -> (u64, u32) {
        #[cfg(feature = "detailed_tracing")]
        {
            // used to detect divergences between the C++ and rust versions
            self.hash.hash(branch.get_u64());
            self.hash.hash(tmp_value);
            self.hash.hash(tmp_range);

            let hashed_value = self.hash.get();
            //if hashedValue == 0xe35c28fd
            {
                print!("({0}:{1:x})", bit as u8, hashed_value);
                if hashed_value % 8 == 0 {
                    println!();
                }
            }
        }

        let probability = branch.get_probability() as u32;

        // portion of the range assigned to a `false` bit
        let split = 1 + (((tmp_range - 1) * probability) >> 8);

        branch.record_and_update_bit(bit);

        if bit {
            tmp_value += split as u64;
            tmp_range -= split;
        } else {
            tmp_range = split;
        }

        // number of positions needed to bring the top bit of the low byte of the range back to 1
        let shift = (tmp_range as u8).leading_zeros();

        #[cfg(feature = "compression_stats")]
        {
            self.model_statistics
                .record_compression_stats(_cmp, 1, i64::from(shift));
        }

        tmp_range <<= shift;
        tmp_value <<= shift;

        // check whether we cannot put next bit into stream
        if tmp_value & (u64::MAX << 57) != 0 {
            // calculate the number of odd bits left over after we remove:
            // - 48 bits (6 bytes) flushed to buffer
            // - 8 bits we need to keep for coding accuracy (since probability resolution is 8 bits)
            // - 1 bit for marker
            // - 1 bit for overflow
            //
            // leftover_bits will always be <= 8
            let leftover_bits = tmp_value.leading_zeros() + 2;

            // shift align so that the top 6 bytes are ones we want to write, if there
            // was an overflow it gets rotated down to the bottom bit
            let v_aligned = tmp_value.rotate_left(leftover_bits);

            if (v_aligned & 1) != 0 {
                self.carry();
            }

            // Append the top six bytes of the u64 into buffer in big endian so that the top byte goes first.
            if needs_to_grow(&self.buffer, 8) {
                // avoid inlining slow path to allocate more memory that happens almost never
                put_6bytes(&mut self.buffer, v_aligned);
            } else {
                // Faster to add all 8 and then shrink the buffer than add 6 that creates a temporary buffer.
                let b = v_aligned.to_be_bytes();
                self.buffer.extend_from_slice(&b);
                self.buffer.truncate(self.buffer.len() - 2);
            }

            // mask the remaining bits (between 8 and 16) and put them back to where they were
            // adding the marker bit to the top
            tmp_value = ((v_aligned & 0xffff) | 0x20000/*marker bit*/) >> leftover_bits;
        }

        (tmp_value, tmp_range)
    }

    /// Propagates an arithmetic carry into the bytes already stored in `buffer`: trailing
    /// `0xFF` bytes roll over to `0` and the first non-`0xFF` byte before them is incremented.
    ///
    /// Safe because the `false` bit put at the very beginning of the stream ensures that a
    /// carry cannot get out of the first stream byte, so this is never invoked on an empty
    /// `buffer`; later on, `flush_non_final_data` keeps the carry-terminating byte sequence
    /// (one non-255 byte before any number of 255 bytes) inside `buffer`.
    ///
    /// Cold and never inlined to keep it out of the inner loop, since carries are pretty rare.
    #[cold]
    #[inline(never)]
    fn carry(&mut self) {
        let mut pos = self.buffer.len() - 1;

        loop {
            if self.buffer[pos] != 0xFF {
                break;
            }

            // 0xFF + carry rolls over to zero and the carry moves one byte up
            self.buffer[pos] = 0;

            assert!(pos > 0);
            pos -= 1;
        }

        self.buffer[pos] += 1;
    }

    /// Encodes the low `log2(A)` bits of `v`, most significant bit first.
    ///
    /// `branches` is used as an implicit binary tree: encoding starts at index 1 and
    /// every emitted bit moves to index `2 * current + bit`, so each bit is coded with a
    /// branch selected by the bits that preceded it.
    ///
    /// Panics if `A` is not a power of two.
    #[inline(always)]
    pub fn put_grid<const A: usize>(
        &mut self,
        v: u8,
        branches: &mut [Branch; A],
        cmp: ModelComponent,
    ) -> Result<()> {
        // A has to be a power of 2
        assert!((A & (A - 1)) == 0);

        let (mut low, mut range) = (self.low_value, self.range);

        let top_bit = A.ilog2() - 1;
        // tree position for the bits emitted so far; 1 is the root
        let mut node = 1;

        for bit_pos in (0..=top_bit).rev() {
            let bit_set = (v & (1 << bit_pos)) != 0;

            (low, range) = self.put(bit_set, &mut branches[node], low, range, cmp);

            node = (node << 1) | bit_set as usize;
        }

        self.low_value = low;
        self.range = range;

        Ok(())
    }

    /// Encodes the lowest `num_bits` bits of `bits`, most significant of them first.
    ///
    /// Bit `i` is coded with `branches[i]`, so `num_bits` must not exceed `A`
    /// (indexing panics otherwise). `num_bits == 0` encodes nothing.
    ///
    /// The loop iterates over a reversed `usize` range instead of a signed counter
    /// derived from `num_bits - 1`: the subtraction underflowed for `num_bits == 0`,
    /// which panicked in builds with overflow checks while being a no-op in release
    /// builds.
    #[inline(always)]
    pub fn put_n_bits<const A: usize>(
        &mut self,
        bits: usize,
        num_bits: usize,
        branches: &mut [Branch; A],
        cmp: ModelComponent,
    ) -> Result<()> {
        // keep the coder state in locals while several bits are encoded in a row
        let mut tmp_value = self.low_value;
        let mut tmp_range = self.range;

        for i in (0..num_bits).rev() {
            (tmp_value, tmp_range) = self.put(
                (bits & (1 << i)) != 0,
                &mut branches[i],
                tmp_value,
                tmp_range,
                cmp,
            );
        }

        self.low_value = tmp_value;
        self.range = tmp_range;

        Ok(())
    }

    /// Encodes `v` in unary: one `true` bit for every position below `v`, followed by a
    /// terminating `false` bit at position `v`. When `v == A` all `A` branches receive a
    /// `true` bit and no terminator is written.
    ///
    /// Position `i` is coded with `branches[i]`. Panics if `v > A`.
    #[inline(always)]
    pub fn put_unary_encoded<const A: usize>(
        &mut self,
        v: usize,
        branches: &mut [Branch; A],
        cmp: ModelComponent,
    ) -> Result<()> {
        assert!(v <= A);

        let (mut low, mut range) = (self.low_value, self.range);

        for (position, branch) in branches.iter_mut().enumerate() {
            let keep_going = position != v;

            (low, range) = self.put(keep_going, branch, low, range, cmp);

            if !keep_going {
                break;
            }
        }

        self.low_value = low;
        self.range = range;

        Ok(())
    }

    /// Encodes a single bit `value` with the probability held by `branch` and stores the
    /// updated coder state back into `self`.
    #[inline(always)]
    pub fn put_bit(
        &mut self,
        value: bool,
        branch: &mut Branch,
        _cmp: ModelComponent,
    ) -> Result<()> {
        let (low, range) = (self.low_value, self.range);

        let (low, range) = self.put(value, branch, low, range, _cmp);

        self.low_value = low;
        self.range = range;

        Ok(())
    }

    // Appends the stream bits still pending in `low_value` to `buffer` and writes the whole
    // buffer to the underlying writer.
    //
    // Only the bytes of the stream that are necessary for decoding are written - as
    // opposed to the initial Lepton implementation, which writes down the whole buffer.
    pub fn finish(&mut self) -> Result<()> {
        let pending = self.low_value;

        // everything below the overflow bit and the marker bit is stream data
        let stream_bits = 64 - pending.leading_zeros() - 2;
        // 55 >= stream_bits >= 8

        // align so that the overflow bit ends up in bit 63
        let aligned = pending << (63 - stream_bits);
        if aligned >> 63 != 0 {
            self.carry();
        }

        let byte_count = (stream_bits + 7) >> 3;
        let mut shift = 63;
        for _ in 0..byte_count {
            shift -= 8;
            self.buffer.push((aligned >> shift) as u8);
        }
        // check that no stream bits remain in the buffer
        debug_assert!(!(u64::MAX << shift) & aligned == 0);

        self.writer.write_all(&self.buffer[..])
    }

    /// Writes out the part of `buffer` that can no longer change and keeps the rest.
    ///
    /// A later carry may still modify the last non-`0xFF` byte and any `0xFF` bytes that
    /// follow it, so that tail is carried over to the next buffer. At least one byte
    /// remains in `buffer` if it is non-empty; buffers of length 0 or 1 are left untouched.
    pub fn flush_non_final_data(&mut self) -> Result<()> {
        if self.buffer.len() <= 1 {
            return Ok(());
        }

        // walk back over the trailing run of 0xFF bytes to the byte that would absorb a carry
        let mut keep_from = self.buffer.len() - 1;
        while self.buffer[keep_from] == 0xFF {
            assert!(keep_from > 0);
            keep_from -= 1;
        }

        self.writer.write_all(&self.buffer[..keep_from])?;
        self.buffer.drain(..keep_from);

        Ok(())
    }
}

/// Appends the six most significant bytes of `v` to `buffer` in big-endian order.
///
/// Kept cold and out of line: it is the slow path used when the buffer has to grow.
#[cold]
#[inline(never)]
fn put_6bytes(buffer: &mut Vec<u8>, v: u64) {
    let big_endian = v.to_be_bytes();
    buffer.extend_from_slice(&big_endian[..6]);
}

#[cfg(test)]
use crate::structs::vpx_bool_reader::VPXBoolReader;

/// Writes 1024 eight-bit values with `put_n_bits` and checks that `get_n_bits`
/// reads back exactly the same sequence from the produced stream.
#[test]
fn test_roundtrip_vpxboolwriter_n_bits() {
    const MAX_N: usize = 8;
    const ROUNDS: usize = 1024;

    #[derive(Default)]
    struct BranchData {
        branches: [Branch; MAX_N],
    }

    let mut encoded = Vec::new();
    let mut writer = VPXBoolWriter::new(&mut encoded).unwrap();

    let mut model = BranchData::default();
    for value in (0..ROUNDS).map(|i| i % 256) {
        writer
            .put_n_bits(value, MAX_N, &mut model.branches, ModelComponent::Dummy)
            .unwrap();
    }

    writer.finish().unwrap();

    // the decoder has to start from the same (default) model state as the encoder
    let mut model = BranchData::default();
    let mut reader = VPXBoolReader::new(&encoded[..]).unwrap();

    for expected in (0..ROUNDS).map(|i| i % 256) {
        let decoded = reader
            .get_n_bits(MAX_N, &mut model.branches, ModelComponent::Dummy)
            .unwrap();
        assert_eq!(decoded, expected);
    }
}

/// Writes 1024 unary-encoded values in `0..=MAX_UNARY` and checks that
/// `get_unary_encoded` reads back exactly the same sequence.
#[test]
fn test_roundtrip_vpxboolwriter_unary() {
    const MAX_UNARY: usize = 11; // the size used in Lepton
    const ROUNDS: usize = 1024;

    #[derive(Default)]
    struct BranchData {
        branches: [Branch; MAX_UNARY],
    }

    let mut encoded = Vec::new();
    let mut writer = VPXBoolWriter::new(&mut encoded).unwrap();

    let mut model = BranchData::default();
    for value in (0..ROUNDS).map(|i| i % (MAX_UNARY + 1)) {
        writer
            .put_unary_encoded(value, &mut model.branches, ModelComponent::Dummy)
            .unwrap();
    }

    writer.finish().unwrap();

    // the decoder has to start from the same (default) model state as the encoder
    let mut model = BranchData::default();
    let mut reader = VPXBoolReader::new(&encoded[..]).unwrap();

    for expected in (0..ROUNDS).map(|i| i % (MAX_UNARY + 1)) {
        let decoded = reader
            .get_unary_encoded(&mut model.branches, ModelComponent::Dummy)
            .unwrap();
        assert_eq!(decoded, expected);
    }
}

/// Writes 1024 three-bit values with `put_grid` and checks that `get_grid`
/// reads back exactly the same sequence.
#[test]
fn test_roundtrip_vpxboolwriter_grid() {
    const ROUNDS: usize = 1024;

    #[derive(Default)]
    struct BranchData {
        branches: [Branch; 8],
    }

    let mut encoded = Vec::new();
    let mut writer = VPXBoolWriter::new(&mut encoded).unwrap();

    let mut model = BranchData::default();
    for value in (0..ROUNDS).map(|i| (i % 8) as u8) {
        writer
            .put_grid(value, &mut model.branches, ModelComponent::Dummy)
            .unwrap();
    }

    writer.finish().unwrap();

    // the decoder has to start from the same (default) model state as the encoder
    let mut model = BranchData::default();
    let mut reader = VPXBoolReader::new(&encoded[..]).unwrap();

    for expected in (0..ROUNDS).map(|i| i % 8) {
        let decoded = reader
            .get_grid(&mut model.branches, ModelComponent::Dummy)
            .unwrap();
        assert_eq!(decoded, expected);
    }
}

/// Writes 1024 single bits (every tenth one set) with `put_bit` and checks that
/// `get_bit` reads back exactly the same sequence.
#[test]
fn test_roundtrip_vpxboolwriter_single_bit() {
    const ROUNDS: usize = 1024;

    let mut encoded = Vec::new();
    let mut writer = VPXBoolWriter::new(&mut encoded).unwrap();

    let mut model = Branch::default();
    for bit in (0..ROUNDS).map(|i| i % 10 == 0) {
        writer
            .put_bit(bit, &mut model, ModelComponent::Dummy)
            .unwrap();
    }

    writer.finish().unwrap();

    // the decoder has to start from the same (default) model state as the encoder
    let mut model = Branch::default();
    let mut reader = VPXBoolReader::new(&encoded[..]).unwrap();

    for expected in (0..ROUNDS).map(|i| i % 10 == 0) {
        let decoded = reader.get_bit(&mut model, ModelComponent::Dummy).unwrap();
        assert_eq!(decoded, expected);
    }
}


================================================
FILE: package/Lepton.Jpeg.Rust.nuspec
================================================
﻿<?xml version="1.0" encoding="utf-8"?>
<package xmlns="http://schemas.microsoft.com/packaging/2011/08/nuspec.xsd">
  <metadata>
    <id>Lepton.Jpeg.Rust</id>
    <version>0.5.5.8</version>
    <title>Lepton JPEG Compression Rust version binaries and libraries</title>
    <authors>kristofr</authors>
    <owners>kristofr</owners>
    <requireLicenseAcceptance>false</requireLicenseAcceptance>
    <description>Lepton Rust binaries and libraries</description>
    <tags>lepton</tags>
  </metadata>
  <files>
    <file src="..\target\release\lepton_jpeg_util.exe" target="exe\release\x64" />
    <file src="..\target\release\lepton_jpeg_util.pdb" target="exe\release\x64" />
    <file src="..\target\release\lepton_jpeg_util_avx2.exe" target="exe\release\x64" />
    <file src="..\target\release\lepton_jpeg_util_avx2.pdb" target="exe\release\x64" />
    <file src="..\target\release\lepton_jpeg.dll" target="lib\release\x64" />
    <file src="..\target\release\lepton_jpeg.pdb" target="lib\release\x64" />
    <file src="..\target\release\lepton_jpeg_avx2.dll" target="lib\release\x64" />
    <file src="..\target\release\lepton_jpeg_avx2.pdb" target="lib\release\x64" />
  </files>
</package>

================================================
FILE: python/Cargo.toml
================================================
[package]
name = "lepton_jpeg_python"
version.workspace = true
edition = "2024"

[lib]
crate-type = ["cdylib"]

[dependencies]
pyo3 = { version = "0.27", features = ["extension-module"] }
lepton_jpeg = { path = "../lib" }
rayon = "1"


================================================
FILE: python/README.md
================================================
# Lepton JPEG Compression 

This is a port of the C++ Lepton JPEG compression tool that was released by Dropbox [dropbox/lepton](https://github.com/dropbox/lepton). We ported the library to Rust because of the work that an exhaustive security review of the C++ code would have required, and because Dropbox has deprecated the original codebase. The Rust port has essentially the same performance characteristics, with the added advantage of all the safety features that Rust has to offer.

With precise bit-by-bit recovery of the original JPEG, the Lepton compression library is designed for lossless compression of baseline and progressive JPEGs up to 22%. JPEG storage in a cloud storage system is the main application case. Even metadata headers and invalid content are kept in good condition.


## How to Use This Library

The library exposes two functions, `compress_bytes` and `decompress_bytes`, which can be invoked as follows:

``` python

    with open("my image", "rb") as f:
        jpg_data = f.read()

    config = {"max_jpeg_width": 4096 }
    compressed = lepton_jpeg_python.compress_bytes(jpg_data, config)
    decompressed = lepton_jpeg_python.decompress_bytes(compressed)

    assert jpg_data == decompressed
```    

The following config options are supported:
- max_jpeg_width: reject compressing images wider than this
- max_jpeg_height: reject compressing images taller than this
- progressive: false to forbid compressing progressive JPEGs
- reject_dqts_with_zeros: true if we should reject JPEGs with 0 in their quantization table
- max_partitions: maximum number of partitions to split JPEG into in order to allow for parallel compression/decompression
- max_jpeg_file_size: reject JPEGs larger than this

## Contributing

There are many ways in which you can participate in this project, for example:

* [Submit bugs and feature requests](https://github.com/microsoft/lepton_jpeg_rust/issues), and help us verify as they are checked in
* Review [source code changes](https://github.com/microsoft/lepton_jpeg_rust/pulls) or submit your own features as pull requests.
* The library uses only **stable features**, so if you want to take advantage of SIMD features such as AVX2, use the Wide crate (see the idct.rs as an example) rather than intrinsics. 

## Code of Conduct

This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments.

## License

Copyright (c) Microsoft Corporation. All rights reserved.

Licensed under the [Apache 2.0](LICENSE.txt) license.


================================================
FILE: python/pyproject.toml
================================================
[build-system]
requires = ["maturin>=1.0,<2.0"]
build-backend = "maturin"

[project]
name = "lepton_jpeg_python"
version = "0.5.8"
description = "Rust port of the Lepton JPEG compression library"
authors = [{ name = "Kristof Roomp  ", email = "kristofr@gmail.com" }]
readme = "README.md"
license = "Apache-2.0"
requires-python = ">=3.8"
classifiers = [
    "Programming Language :: Python :: 3",
    "Programming Language :: Rust",
    "Operating System :: OS Independent",
]

[project.urls]
Homepage = "https://github.com/microsoft/lepton_jpeg_rust"


================================================
FILE: python/src/lib.rs
================================================
use lepton_jpeg::{DEFAULT_THREAD_POOL, LeptonThreadPool, SingleThreadPool};
use pyo3::prelude::*;
use pyo3::types::{PyBytes, PyDict};
use std::io::Cursor;
use std::sync::LazyLock;

/// Which thread pool `compress_bytes` / `decompress_bytes` hand to the lepton library.
/// Selected through the `"threads"` configuration key; `parse_config` defaults to `PerCpu`.
enum ThreadOptions {
    /// `"single"`: a freshly created `SingleThreadPool`.
    SingleThread,
    /// `"per_cpu"`: the shared rayon-backed `RAYON_THREAD_POOL`.
    PerCpu,
    /// `"no_limit"`: lepton_jpeg's `DEFAULT_THREAD_POOL`.
    NoLimit,
}

/// Adapter that lets a rayon thread pool be used through the `LeptonThreadPool` trait.
struct RayonThreadPool {
    /// The wrapped rayon pool; `LazyLock` defers its construction until first use.
    pool: LazyLock<rayon::ThreadPool>,
}

impl LeptonThreadPool for RayonThreadPool {
    /// Schedules `f` on the wrapped rayon pool without waiting for it to finish.
    fn run(&self, f: Box<dyn FnOnce() + Send + 'static>) {
        self.pool.spawn(f);
    }

    /// Reports the parallelism available to the process.
    ///
    /// `std::thread::available_parallelism` can fail (for example on platforms where the
    /// value cannot be queried); instead of panicking in that case, fall back to `1`.
    fn max_parallelism(&self) -> usize {
        std::thread::available_parallelism().map_or(1, |n| n.get())
    }
}

/// Process-wide rayon-backed pool used for the `ThreadOptions::PerCpu` setting.
/// The rayon pool is built with the default `ThreadPoolBuilder` settings the first time it
/// is dereferenced; a build failure panics because of the `unwrap`.
static RAYON_THREAD_POOL: RayonThreadPool = RayonThreadPool {
    pool: LazyLock::new(|| rayon::ThreadPoolBuilder::new().build().unwrap()),
};

/// Translates the optional Python configuration dictionary into lepton features plus a
/// thread-pool selection.
///
/// Starts from `EnabledFeatures::compat_lepton_vector_write()` and `ThreadOptions::PerCpu`
/// and overrides them with every key found in `config`.
///
/// # Errors
/// - a `PyValueError` for an unknown key or an unknown `"threads"` value;
/// - whatever error `extract` raises when a key or value has the wrong Python type.
fn parse_config(
    config: Option<&Bound<'_, PyDict>>,
) -> PyResult<(lepton_jpeg::EnabledFeatures, ThreadOptions)> {
    let mut features = lepton_jpeg::EnabledFeatures::compat_lepton_vector_write();
    let mut threads = ThreadOptions::PerCpu;

    // no dictionary supplied: the defaults apply unchanged
    let Some(cfg) = config else {
        return Ok((features, threads));
    };

    for (key, value) in cfg.iter() {
        let key_str: &str = key.extract()?;

        match key_str {
            "max_jpeg_width" => features.max_jpeg_width = value.extract::<u32>()?,
            "max_jpeg_height" => features.max_jpeg_height = value.extract::<u32>()?,
            "progressive" => features.progressive = value.extract::<bool>()?,
            "reject_dqts_with_zeros" => {
                features.reject_dqts_with_zeros = value.extract::<bool>()?
            }
            "max_partitions" => features.max_partitions = value.extract::<u32>()?,
            "max_jpeg_file_size" => features.max_jpeg_file_size = value.extract::<u32>()?,
            "threads" => {
                threads = match value.extract::<&str>()? {
                    "single" => ThreadOptions::SingleThread,
                    "per_cpu" => ThreadOptions::PerCpu,
                    "no_limit" => ThreadOptions::NoLimit,
                    other => {
                        return Err(pyo3::exceptions::PyValueError::new_err(format!(
                            "Invalid threads option: {}",
                            other
                        )));
                    }
                }
            }
            unknown => {
                return Err(pyo3::exceptions::PyValueError::new_err(format!(
                    "Unknown configuration key: {}",
                    unknown
                )));
            }
        }
    }

    Ok((features, threads))
}

// Python entry point: compresses the JPEG bytes in `data` to Lepton format and returns
// the result as a Python `bytes` object.
//
// `config` is the optional dictionary understood by `parse_config`. A failure inside
// the encoder is reported as a `RuntimeError` whose message starts with
// "Compression failed: ".
//
// (Plain comments on purpose: doc comments here would become the Python docstring.)
#[pyfunction]
#[pyo3(signature = (data, config=None))]
pub fn compress_bytes(
    py: Python,
    data: &[u8],
    config: Option<&Bound<'_, PyDict>>,
) -> PyResult<Py<PyAny>> {
    let (features, threads) = parse_config(config)?;

    // only borrowed when single-threaded operation was requested
    let single = SingleThreadPool::default();

    let mut lepton_bytes = Vec::new();

    lepton_jpeg::encode_lepton(
        &mut Cursor::new(data),
        &mut Cursor::new(&mut lepton_bytes),
        &features,
        match threads {
            ThreadOptions::SingleThread => &single,
            ThreadOptions::PerCpu => &RAYON_THREAD_POOL,
            ThreadOptions::NoLimit => &DEFAULT_THREAD_POOL,
        },
    )
    .map_err(|err| {
        pyo3::exceptions::PyRuntimeError::new_err(format!("Compression failed: {}", err))
    })?;

    Ok(PyBytes::new(py, &lepton_bytes).into())
}

// Python entry point: decodes the Lepton bytes in `data` back into the JPEG and returns
// the result as a Python `bytes` object.
//
// `config` is the optional dictionary understood by `parse_config`. A failure inside
// the decoder is reported as a `RuntimeError` whose message starts with
// "Decompression failed: ".
//
// (Plain comments on purpose: doc comments here would become the Python docstring.)
#[pyfunction]
#[pyo3(signature = (data, config=None))]
pub fn decompress_bytes(
    py: Python,
    data: &[u8],
    config: Option<&Bound<'_, PyDict>>,
) -> PyResult<Py<PyAny>> {
    let (features, threads) = parse_config(config)?;

    // only borrowed when single-threaded operation was requested
    let single = SingleThreadPool::default();

    let mut jpeg_bytes = Vec::new();

    lepton_jpeg::decode_lepton(
        &mut Cursor::new(data),
        &mut Cursor::new(&mut jpeg_bytes),
        &features,
        match threads {
            ThreadOptions::SingleThread => &single,
            ThreadOptions::PerCpu => &RAYON_THREAD_POOL,
            ThreadOptions::NoLimit => &DEFAULT_THREAD_POOL,
        },
    )
    .map_err(|err| {
        pyo3::exceptions::PyRuntimeError::new_err(format!("Decompression failed: {}", err))
    })?;

    Ok(PyBytes::new(py, &jpeg_bytes).into())
}

// Module initializer for the `lepton_jpeg_python` extension module: registers
// `compress_bytes` and `decompress_bytes` as module-level functions.
// (Plain comments on purpose: doc comments here would become the module docstring.)
#[pymodule]
fn lepton_jpeg_python(m: &Bound<'_, PyModule>) -> PyResult<()> {
    m.add_function(wrap_pyfunction!(compress_bytes, m)?)?;
    m.add_function(wrap_pyfunction!(decompress_bytes, m)?)?;
    Ok(())
}


================================================
FILE: python/tests/test_compress.py
================================================
import lepton_jpeg_python

def test_compress_decompress():
    """Round-trip a sample JPEG through compress_bytes / decompress_bytes.

    The decompressed bytes must be identical to the original file.
    """
    from pathlib import Path

    # Locate slrcity.jpg in the repository's images directory relative to this file
    # (python/tests/ -> repository root), so the test works from any working directory.
    image_path = Path(__file__).resolve().parents[2] / "images" / "slrcity.jpg"
    jpg_data = image_path.read_bytes()

    config = {
        "max_jpeg_width": 8196,
        "max_jpeg_height": 8196,
        "progressive": False,
        "reject_dqts_with_zeros": True,
        "max_partitions": 8,
        "max_jpeg_file_size": 128 * 1024 * 1024 }

    compressed = lepton_jpeg_python.compress_bytes(jpg_data, config)
    decompressed = lepton_jpeg_python.decompress_bytes(compressed, config)

    assert jpg_data == decompressed
    print("Compression and decompression successful!")

================================================
FILE: rustfmt.toml
================================================


================================================
FILE: tests/end_to_end.rs
================================================
/*---------------------------------------------------------------------------------------------
 *  Copyright (c) Microsoft Corporation. All rights reserved.
 *  Licensed under the Apache License, Version 2.0. See LICENSE.txt in the project root for license information.
 *  This software incorporates material from third parties. See NOTICE.txt for details.
 *--------------------------------------------------------------------------------------------*/

use core::result::Result;
use std::fs::read_dir;
use std::io::Cursor;
use std::path::Path;

use lepton_jpeg::{
    DEFAULT_THREAD_POOL, EnabledFeatures, decode_lepton, encode_lepton, encode_lepton_verify,
};
use lepton_jpeg::{ExitCode, LeptonError};
use rstest::rstest;

/// Asserts that two slices are equal and, if they are not, panics with a message naming
/// the offset of the first differing element. Handy for debugging large buffers.
///
/// When the lengths differ, the common prefix is still compared first so that an
/// element mismatch is reported in preference to the bare length mismatch.
#[track_caller]
pub fn assert_eq_array<T: PartialEq + std::fmt::Debug>(a: &[T], b: &[T]) {
    let (len_a, len_b) = (a.len(), b.len());

    if len_a == len_b {
        for (offset, (left, right)) in a.iter().zip(b).enumerate() {
            assert_eq!(
                left, right,
                "length identical {}, but first mismatch at offset {}",
                len_a, offset
            );
        }
        return;
    }

    // `zip` stops at the shorter slice, i.e. it walks exactly the common prefix
    for (offset, (left, right)) in a.iter().zip(b).enumerate() {
        assert_eq!(
            left, right,
            "length mismatch {},{} and first mismatch at offset {}",
            len_a, len_b, offset
        );
    }

    panic!(
        "length mismatch {} and {}, but common prefix identical",
        len_a, len_b
    );
}

/// Loads `<WORKSPACE_ROOT>/images/<filename><ext>` into memory for tests and benchmarks.
///
/// `WORKSPACE_ROOT` is resolved at compile time through `env!`. Panics if the file
/// cannot be opened or read.
pub fn read_file(filename: &str, ext: &str) -> Vec<u8> {
    use std::io::Read;

    let path = std::path::Path::new(env!("WORKSPACE_ROOT"))
        .join("images")
        .join(format!("{filename}{ext}"));

    let mut bytes = Vec::new();
    std::fs::File::open(path)
        .unwrap()
        .read_to_end(&mut bytes)
        .unwrap();

    bytes
}

/// Verifies that the decoder accepts the existing `.lep` files in the images directory
/// and reproduces exactly the `.jpg` stored next to each of them. Used to detect
/// unexpected divergences in the coding format.
///
/// Runs once per listed file name, decoding with `EnabledFeatures::compat_lepton_vector_read()`.
#[rstest]
fn verify_decode(
    #[values(
        "android",
        "androidcrop",
        "androidcropoptions",
        "androidprogressive",
        "androidprogressive_garbage",
        "androidtrail",
        "colorswap",
        "cathedral_db_non_int",
        "cathedral_db_non_int_rustold",
        "gray2sf",
        "grayscale",
        "hq",
        "half_scan",
        "half_scan_rust55",
        "iphone",
        "iphonecity",
        "iphonecity_with_16KGarbage",
        "iphonecity_with_1MGarbage",
        "iphonecrop",
        "iphonecrop2",
        "iphoneprogressive",
        "iphoneprogressive2",
        "progressive_late_dht", // image has huffman tables that come very late, which causes a verification failure
        "out_of_order_dqt",     // image with a quantization table (DQT) that comes after the image definition (SOF)
        "narrowrst",
        "nofsync",
        "slrcity",
        "slrhills",
        "slrindoor",
        "tiny",
        "trailingrst",
        "trailingrst2",
        "trunc",
        "truncbad",          // the lepton format is truncated and invalid
        "eof_and_trailingrst",    // the lepton format has a wrongly set unexpected eof and trailing rst
        "eof_and_trailinghdrdata" // the lepton format has a wrongly set unexpected eof and trailing header data
    )]
    file: &str,
) {
    use lepton_jpeg::DEFAULT_THREAD_POOL;

    println!("decoding {0:?}", file);

    // the stored Lepton file and the JPEG it is expected to decode to
    let input = read_file(file, ".lep");
    let expected = read_file(file, ".jpg");

    let mut output = Vec::new();

    decode_lepton(
        &mut Cursor::new(input),
        &mut output,
        &EnabledFeatures::compat_lepton_vector_read(),
        &DEFAULT_THREAD_POOL,
    )
    .unwrap();

    assert_eq_array(&output, &expected);
}

/// Decodes `mathoverflow_scalar.lep` with `EnabledFeatures::compat_lepton_scalar_read()`
/// and checks that the output is byte-identical to `mathoverflow_scalar.jpg`.
#[test]
fn verify_decode_scalar_overflow() {
    const FILE: &str = "mathoverflow_scalar";

    println!("decoding {0:?}", FILE);

    let lepton = read_file(FILE, ".lep");
    let expected = read_file(FILE, ".jpg");

    let mut decoded = Vec::new();

    decode_lepton(
        &mut Cursor::new(lepton),
        &mut decoded,
        &EnabledFeatures::compat_lepton_scalar_read(),
        &DEFAULT_THREAD_POOL,
    )
    .unwrap();

    assert_eq_array(&decoded, &expected);
}

/// Encodes each listed JPEG to Lepton and decodes it back again, asserting that the result
/// is byte-identical to the input. This mostly exercises the encoder. The output can't be
/// compared against the stored LEP file since there's no guarantee they are binary
/// identical (especially the zlib encoded part).
///
/// The commented-out names are the files that `verify_fail_encode` expects to be rejected.
#[rstest]
fn verify_encode(
    #[values(
            "android",
            "androidcrop",
            "androidcropoptions",
            "androidprogressive",
            "androidprogressive_garbage",
            "androidtrail",
            "colorswap",
            "gray2sf",
            "grayscale",
            "hq",
            //"half_scan",
            "iphone",
            "iphonecity",
            "iphonecity_with_16KGarbage",
            "iphonecity_with_1MGarbage",
            "iphonecrop",
            "iphonecrop2",
            "iphoneprogressive",
            "iphoneprogressive2",
            "progressive_late_dht", // image has huffman tables that come very late, which caused a verification failure
            "out_of_order_dqt",
            //"narrowrst",
            //"nofsync",
            "slrcity",
            "slrhills",
            "slrindoor",
            "tiny",
            "trailingrst",
            "trailingrst2",
            "trunc",
        )]
    file: &str,
) {
    let input = read_file(file, ".jpg");

    let mut lepton = Vec::new();
    let mut output = Vec::new();

    // JPEG -> Lepton
    encode_lepton(
        &mut Cursor::new(&input),
        &mut Cursor::new(&mut lepton),
        &EnabledFeatures::compat_lepton_vector_write(),
        &DEFAULT_THREAD_POOL,
    )
    .unwrap();

    // Lepton -> JPEG
    decode_lepton(
        &mut Cursor::new(lepton),
        &mut output,
        &EnabledFeatures::compat_lepton_vector_read(),
        &DEFAULT_THREAD_POOL,
    )
    .unwrap();

    assert_eq_array(&input, &output);
}

/// Each of these JPEGs must be rejected by `encode_lepton_verify`, either because it
/// uses an unsupported feature or because it cannot be round-tripped.
#[rstest]
fn verify_fail_encode(#[values("half_scan", "narrowrst", "nofsync")] file: &str) {
    let jpeg = read_file(file, ".jpg");
    let features = EnabledFeatures::compat_lepton_vector_write();

    let outcome = encode_lepton_verify(&jpeg, &features, &DEFAULT_THREAD_POOL);

    assert!(outcome.is_err(), "encoding was expected to fail");
}

/// Decodes two Lepton encodings of the `mathoverflow` image and requires both to
/// reproduce `mathoverflow.jpg` exactly.
#[test]
fn verify_16bitmath() {
    // decodes `<name>.lep` with the given features and compares the output with the reference JPEG
    let decode_and_compare = |name: &str, features: EnabledFeatures| {
        let lepton_bytes = read_file(name, ".lep");
        let expected_jpeg = read_file("mathoverflow", ".jpg");

        let mut decoded = Vec::new();
        decode_lepton(
            &mut Cursor::new(lepton_bytes),
            &mut decoded,
            &features,
            &DEFAULT_THREAD_POOL,
        )
        .unwrap();

        assert_eq_array(&decoded, &expected_jpeg);
    };

    // verifies that we can decode 16 bit encoded images from the C++ version
    decode_and_compare(
        "mathoverflow_16",
        EnabledFeatures::compat_lepton_vector_read(),
    );

    // verify that we can decode the one generated by the Rust version
    let mut rust_features = EnabledFeatures::compat_lepton_vector_read();
    rust_features.use_16bit_dc_estimate = false;
    decode_and_compare("mathoverflow_32", rust_features);
}

/// Compresses the JPEG through `encode_lepton_verify` and requires the call to succeed.
#[rstest]
fn verify_encode_verify(#[values("slrcity")] file: &str) {
    let jpeg = read_file(file, ".jpg");
    let features = EnabledFeatures::compat_lepton_vector_write();

    encode_lepton_verify(&jpeg[..], &features, &DEFAULT_THREAD_POOL).unwrap();
}

/// Test helper: panics unless `result` is an error whose exit code equals `expected_error`.
fn assert_exception<T>(expected_error: ExitCode, result: Result<T, LeptonError>) {
    if let Err(e) = result {
        assert_eq!(expected_error, e.exit_code(), "unexpected error {0:?}", e);
    } else {
        panic!("failure was expected");
    }
}

/// `encode_lepton_verify` must report `VerificationContentMismatch` for this input.
#[rstest]
fn verify_encode_verify_fail(#[values("mismatch_encode")] file: &str) {
    let jpeg = read_file(file, ".jpg");

    let outcome = encode_lepton_verify(
        &jpeg[..],
        &EnabledFeatures::compat_lepton_vector_write(),
        &DEFAULT_THREAD_POOL,
    );

    assert_exception(ExitCode::VerificationContentMismatch, outcome);
}

/// ensures we error out if we have the progressive flag disabled
#[rstest]
fn verify_encode_progressive_false(
    #[values("androidprogressive", "iphoneprogressive", "iphoneprogressive2")] file: &str,
) {
    let input = read_file(file, ".jpg");
    let mut lepton = Vec::new();
    assert_exception(
        ExitCode::ProgressiveUnsupported,
        encode_lepton(
            &mut Cursor::new(&input),
            &mut Cursor::new(&mut lepton),
            &EnabledFeatures {
                progressive: false,
                ..EnabledFeatures::compat_lepton_vector_write()
            },
            &DEFAULT_THREAD_POOL,
        ),
    );
}

/// Progressive JPEGs whose zero runs were not encoded optimally cannot be recreated,
/// since the encoder always produces the longest zero runs legally allowed by the
/// available huffman codes. Such a file must be rejected with `UnsupportedJpeg`.
#[test]
fn verify_nonoptimal() {
    let jpeg = read_file("nonoptimalprogressive", ".jpg");

    let mut lepton = Vec::new();
    let outcome = encode_lepton(
        &mut Cursor::new(&jpeg),
        &mut Cursor::new(&mut lepton),
        &EnabledFeatures::compat_lepton_vector_write(),
        &DEFAULT_THREAD_POOL,
    );

    assert_exception(ExitCode::UnsupportedJpeg, outcome);
}

/// Images with zeros in their DQT tables may lead to divide-by-zero and are therefore
/// not supported: encoding must fail with `UnsupportedJpegWithZeroIdct0`.
#[test]
fn verify_encode_image_with_zeros_in_dqt_tables() {
    let jpeg = read_file("zeros_in_dqt_tables", ".jpg");

    let mut lepton = Vec::new();
    let outcome = encode_lepton(
        &mut Cursor::new(&jpeg),
        &mut Cursor::new(&mut lepton),
        &EnabledFeatures::compat_lepton_vector_write(),
        &DEFAULT_THREAD_POOL,
    );

    assert_exception(ExitCode::UnsupportedJpegWithZeroIdct0, outcome);
}

/// Replays every earlier fuzzing failure so that it stays fixed. The reproduction files
/// have to be checked into the repository as `crash-xxxx` files under the
/// fuzz/artifacts/fuzz_target_1 directory.
#[test]
fn test_previous_fuzz_failures() {
    /// mirrors what we do for fuzz testing so that we can reproduce failures found by the fuzzer
    /// and ensure that they remain fixed
    fn test_fuzz_failure(data: &[u8]) {
        // the feature toggles are derived from the input length
        let use_16bit = data.len() % 2 != 0;
        let accept_invalid_dht = (data.len() / 2) % 2 != 0;

        // keep the jpeg dimensions small otherwise the fuzzer gets really slow
        let features = EnabledFeatures {
            progressive: true,
            reject_dqts_with_zeros: true,
            max_jpeg_height: 1024,
            max_jpeg_width: 1024,
            use_16bit_dc_estimate: use_16bit,
            use_16bit_adv_predict: use_16bit,
            accept_invalid_dht,
            ..EnabledFeatures::compat_lepton_vector_write()
        };

        let mut compressed = Vec::new();
        let encoded = encode_lepton(
            &mut Cursor::new(&data),
            &mut Cursor::new(&mut compressed),
            &features,
            &DEFAULT_THREAD_POOL,
        );

        // only a successful encode is decoded again; the decode result itself is ignored,
        // the point is that nothing panics
        if encoded.is_ok() {
            let mut restored = Vec::new();
            let _ = decode_lepton(
                &mut Cursor::new(&compressed),
                &mut restored,
                &features,
                &DEFAULT_THREAD_POOL,
            );
        }
    }

    let artifacts_dir = Path::new(env!("WORKSPACE_ROOT"))
        .join("fuzz")
        .join("artifacts")
        .join("fuzz_target_1");

    for entry in read_dir(artifacts_dir).unwrap() {
        let path = entry.unwrap().path();

        // only files named crash-* are reproductions
        let is_crash_file = path
            .file_name()
            .unwrap()
            .to_str()
            .unwrap()
            .starts_with("crash-");
        if !is_crash_file {
            continue;
        }

        println!(
            "testing fuzz failure reproduction for file {}",
            path.display()
        );

        let data = std::fs::read(path).unwrap();
        test_fuzz_failure(&data);
    }
}


================================================
FILE: tests/verifycompression.cmd
================================================
@echo off
rem Compresses the single file given as %1 with the release build of lepton_jpeg_util,
rem discarding the compressed output (nul). If the tool exits with a non-zero code, the
rem file name, the exit code and the captured tool output are appended to failedlog.txt.
rem stderr is captured together with stdout (2>&1) because the utility prints its
rem error message to stderr.
..\target\release\lepton_jpeg_util.exe %1 nul > error.txt 2>&1
if errorlevel 1 (
echo %1 failed %errorlevel%
echo ------------ >> failedlog.txt
echo %1 failed %errorlevel% >> failedlog.txt
type error.txt >> failedlog.txt
)

================================================
FILE: tests/verifydir.cmd
================================================
@echo off
rem Walks the directory tree rooted at %1 and calls verifycompression.cmd once for
rem every *.jpg file found in it.
setlocal enabledelayedexpansion
for  /r %1 %%f in (*.jpg)  do call verifycompression.cmd ^"%%f"^

================================================
FILE: util/Cargo.toml
================================================
[package]
name = "lepton_jpeg_util"
version.workspace = true
edition = "2024"
authors = ["Kristof Roomp <kristofr@microsoft.com>"]

[features]
default = []

[dependencies]
lepton_jpeg = { path = "../lib" }
pico-args = "0.5"
log = "0.4"
simple_logger ="5.0"
rayon = "1" 
uuid = { version = "1.19", features = ["v4"] }
winpipe = "0.1"
msvc_spectre_libs = "0.1.3"

[[bin]]
name="lepton_jpeg_util"


================================================
FILE: util/src/main.rs
================================================
/*---------------------------------------------------------------------------------------------
 *  Copyright (c) Microsoft Corporation. All rights reserved.
 *  Licensed under the Apache License, Version 2.0. See LICENSE.txt in the project root for license information.
 *  This software incorporates material from third parties. See NOTICE.txt for details.
 *--------------------------------------------------------------------------------------------*/

use std::borrow::Cow;
use std::ffi::OsStr;
use std::fs::{File, OpenOptions};
use std::io::{Cursor, IsTerminal, Read, Seek, Write, stdin, stdout};
use std::path::{Path, PathBuf};
use std::process::{Command, Stdio};
use std::time::{Duration, Instant};

use lepton_jpeg::{
    CpuTimeMeasure, DEFAULT_THREAD_POOL, EnabledFeatures, ExitCode, LeptonError, LeptonThreadPool,
    LeptonThreadPriority, Metrics, SimpleThreadPool, SingleThreadPool, StreamPosition,
    decode_lepton, dump_jpeg, encode_lepton, encode_lepton_verify,
};
use log::{error, info};
use simple_logger::SimpleLogger;

use crate::verifydir::corrupt_data_if_enabled;

mod verifydir;

/// Thread pool created with `LeptonThreadPriority::Low`; selected when `--lowpriority` is passed.
static LOW_PRIORITY_THREAD_POOL: SimpleThreadPool =
    SimpleThreadPool::new(LeptonThreadPriority::Low);
/// Thread pool created with `LeptonThreadPriority::High`; selected when `--highpriority` is passed.
static HIGH_PRIORITY_THREAD_POOL: SimpleThreadPool =
    SimpleThreadPool::new(LeptonThreadPriority::High);

/// Kind of input file, as detected by `id_file_type` from the first two bytes of the data.
#[derive(Copy, Clone, Debug)]
enum FileType {
    /// input starts with the bytes 0xff 0xd8; it gets encoded to Lepton
    Jpeg,
    /// input starts with the bytes 0xcf 0x84; it gets decoded
    Lepton,
}

/// Parses a command line value as an `i32`; any parse failure is reported as `"not a number"`.
fn parse_i32(s: &str) -> Result<i32, &'static str> {
    match s.parse::<i32>() {
        Ok(value) => Ok(value),
        Err(_) => Err("not a number"),
    }
}

/// Parses a command line value as a `u32`; any parse failure is reported as `"not a number"`.
fn parse_u32(s: &str) -> Result<u32, &'static str> {
    s.parse::<u32>().or(Err("not a number"))
}

/// Parses a command line value as a `u64`; any parse failure is reported as `"not a number"`.
fn parse_u64(s: &str) -> Result<u64, &'static str> {
    s.parse::<u64>().ok().ok_or("not a number")
}

/// Converts a raw command line value into an owned path. This conversion never fails;
/// the `Result` only exists to match the parser callback signature.
fn parse_path(s: &OsStr) -> Result<PathBuf, &'static str> {
    let path = Path::new(s).to_path_buf();
    Ok(path)
}

/// Looks up the key/value option `name` in `pargs` and, when it is present, parses its
/// value with `parse` and stores the result in `value`. When the option is absent,
/// `value` is left untouched. Errors from the argument parser are returned to the caller.
fn override_if<T>(
    pargs: &mut pico_args::Arguments,
    name: &'static str,
    parse: fn(&str) -> Result<T, &'static str>,
    value: &mut T,
) -> Result<(), pico_args::Error> {
    pargs.opt_value_from_fn(name, parse).map(|parsed| {
        if let Some(v) = parsed {
            *value = v;
        }
    })
}

/// Error type of the command line utility: a thin wrapper around `LeptonError` that
/// the utility can implement its own `From` conversions for.
struct UtilError(LeptonError);

impl UtilError {
    /// message of the wrapped error
    fn message(&self) -> &str {
        self.0.message()
    }

    /// exit code of the wrapped error
    fn exit_code(&self) -> ExitCode {
        self.0.exit_code()
    }
}

/// Command line parsing failures are reported as `ExitCode::SyntaxError`, using the
/// parser's message as the error text.
impl From<pico_args::Error> for UtilError {
    #[track_caller]
    fn from(e: pico_args::Error) -> Self {
        let mut e = LeptonError::new(ExitCode::SyntaxError, e.to_string());
        e.add_context();
        UtilError(e)
    }
}

/// Library errors are wrapped unchanged.
impl From<LeptonError> for UtilError {
    #[track_caller]
    fn from(e: LeptonError) -> Self {
        UtilError(e)
    }
}

/// I/O errors are first converted into a `LeptonError` and then wrapped.
impl From<std::io::Error> for UtilError {
    #[track_caller]
    fn from(e: std::io::Error) -> Self {
        UtilError(e.into())
    }
}

/// Writer adapter that counts the bytes accepted by the inner writer so that it can
/// report a stream position.
struct RecordStreamPosition<W: Write> {
    /// destination that all data is forwarded to
    writer: W,
    /// number of bytes the inner writer has accepted so far
    position: u64,
}

impl<W: Write> Write for RecordStreamPosition<W> {
    /// Forwards the write and advances the position by the number of bytes actually written.
    fn write(&mut self, buf: &[u8]) -> std::io::Result<usize> {
        let position = &mut self.position;
        self.writer.write(buf).map(|written| {
            *position += written as u64;
            written
        })
    }

    fn flush(&mut self) -> std::io::Result<()> {
        self.writer.flush()
    }
}

impl<W: Write> StreamPosition for RecordStreamPosition<W> {
    /// Returns the number of bytes written through this adapter.
    fn position(&mut self) -> u64 {
        self.position
    }
}

// wrap main so that errors get printed nicely without a panic
fn main_with_result() -> Result<(), UtilError> {
    let mut pargs = pico_args::Arguments::from_env();

    let mut enabled_features = EnabledFeatures::compat_lepton_vector_read();
    let mut filter_level = log::LevelFilter::Info;

    if pargs.contains(["-h", "--help"]) {
        println!(
"lepton_jpeg_util - a fast JPEG compressor

Usage: lepton_jpeg_util [options] inputfile [outputfile]

Options:
    --iter <n>              number of iterations to run
    --dump                  dump the JPEG file
    --all                   dump includes the scan lines
    --cppverify <exe path>  verify the output with the C++ decoder
    --overwrite             overwrite the output file
    --corrupt <seed>        randomly corrupt the input file (for testing)
    --quiet                 suppress all output
    --noverify              do not verify the output
    --max-width <n>         maximum width of the JPEG file
    --max-height <n>        maximum height of the JPEG file
    --max-jpeg-file-size <n> maximum size of the JPEG file
    --threads <n>           maximum number of threads to use
    --rejectprogressive     reject progressive JPEG files
    --rejectdqtswithzeros   reject DQT tables with zeros
    --rejectinvalidhuffman  reject invalid Huffman tables
    --use32bitdc            use 32 bit DC estimate
    --use32bitadv           use 32 bit advanced prediction
    --useleptonscalar       use the scalar version of the encoder
    --highpriority          run on p-cores
    --lowpriority           run on e-cores
    --version               print the version
    --help                  print this help message
    --verifydir             Recursively verify all files in a directory can be compressed and decompressed
");
        return Ok(());
    }

    let cppverify: Option<PathBuf> = pargs.opt_value_from_os_str("--cppverify", parse_path)?;
    let verify_dir = pargs.opt_value_from_os_str("--verifydir", parse_path)?;

    let iterations = pargs.opt_value_from_fn("--iter", parse_i32)?.unwrap_or(1);
    let dump = pargs.contains("--dump");
    let dumpall = pargs.contains("--dumpall");
    let verify = !pargs.contains("--noverify");
    let overwrite = pargs.contains("--overwrite");
    let mut corrupt = pargs.opt_value_from_fn("--corrupt", parse_u64)?;

    if pargs.contains("--quiet") {
        filter_level = log::LevelFilter::Warn;
    }

    override_if(
        &mut pargs,
        "--max-width",
        parse_u32,
        &mut enabled_features.max_jpeg_width,
    )?;

    override_if(
        &mut pargs,
        "--max-height",
        parse_u32,
        &mut enabled_features.max_jpeg_height,
    )?;

    override_if(
        &mut pargs,
        "--threads",
        parse_u32,
        &mut enabled_features.max_partitions,
    )?;

    override_if(
        &mut pargs,
        "--rejectprogressive",
        |_| Ok(false),
        &mut enabled_features.progressive,
    )?;

    override_if(
        &mut pargs,
        "--rejectdqtswithzeros",
        |_| Ok(true),
        &mut enabled_features.reject_dqts_with_zeros,
    )?;

    override_if(
        &mut pargs,
        "--rejectinvalidhuffman",
        |_| Ok(false),
        &mut enabled_features.accept_invalid_dht,
    )?;

    override_if(
        &mut pargs,
        "--max-jpeg-file-size",
        parse_u32,
        &mut enabled_features.max_jpeg_file_size,
    )?;

    if pargs.contains("--version") {
        println!(
            "compiled library Lepton version {}",
            lepton_jpeg::get_version_string()
        );
    }

    if pargs.contains("--use32bitdc") {
        enabled_features.use_16bit_dc_estimate = false;
    }
    if pargs.contains("--use32bitadv") {
        enabled_features.use_16bit_adv_predict = false;
    }
    if pargs.contains("--useleptonscalar") {
        // use both these options if you are trying to read a file that was encoded with the scalar version of the C++ encoder
        // sadly one old version of the Rust encoder used use_16bit_dc_estimate=false, use_16bit_adv_predict=true
        // the latest version of the encoder put these options in the header so we ignore this if the file specifies it
        enabled_features.use_16bit_adv_predict = false;
        enabled_features.use_16bit_dc_estimate = false;
    }

    let singlethreaded = pargs.contains("--singlethreaded");
    let highpriority = pargs.contains("--highpriority");
    let lowpriority = pargs.contains("--lowpriority");

    let thread_pool: &dyn LeptonThreadPool = if highpriority {
        // used to force to run on p-cores, make sure this and
        // any threadpool threads are set to the highest priority
        &HIGH_PRIORITY_THREAD_POOL
    } else if lowpriority {
        // used to force to run on e-cores, make sure this and
        // any threadpool threads are set to the lowest priority
        &LOW_PRIORITY_THREAD_POOL
    } else {
        if singlethreaded {
            &SingleThreadPool {}
        } else {
            &DEFAULT_THREAD_POOL
        }
    };

    let filenames = pargs.finish();

    for i in filenames.iter() {
        // no other options should be specified only the free standing filenames
        if i.to_string_lossy().starts_with("-") {
            return Err(
                LeptonError::new(ExitCode::SyntaxError, format!("unknown option {:?}", i)).into(),
            );
        }
    }

    // only output the log if we are connected to a console (otherwise if there is redirection we would corrupt the file)
    if stdout().is_terminal() {
        SimpleLogger::new().with_level(filter_level).init().unwrap();
    }

    // if we are verifying a directory, then we need to recursively verify all files in the directory
    if let Some(verify_dir) = verify_dir {
        verifydir::verify_dir(
            verify_dir.as_path(),
            cppverify.as_ref().unwrap(),
            &mut corrupt,
        )?;
        return Ok(());
    }

    if dump {
        let mut file_in = File::open(filenames[0].as_os_str()).unwrap();

        let mut contents = Vec::new();
        file_in.read_to_end(&mut contents).unwrap();
        dump_jpeg(&contents, dumpall, &enabled_features).unwrap();
        return Ok(());
    }

    let mut input_data = Vec::new();
    if filenames.len() != 2 {
        if stdout().is_terminal() || stdin().is_terminal() {
            return Err(LeptonError::new(
                ExitCode::SyntaxError,
                "source and destination filename are needed or input needs to be redirected",
            )
            .into());
        }

        // special case for piped input, stream output data as we process it instead of buffering it all
        let mut data = Vec::new();
        std::io::stdin().read_to_end(&mut data)?;

        let mut cursor = Cursor::new(&data);

        let mut output = RecordStreamPosition {
            writer: std::io::stdout(),
            position: 0,
        };

        match id_file_type(&data)? {
            FileType::Jpeg => {
                lepton_jpeg::encode_lepton(
                    &mut cursor,
                    &mut output,
                    &enabled_features,
                    thread_pool,
                )?;
            }
            FileType::Lepton => {
                lepton_jpeg::decode_lepton(
                    &mut cursor,
                    &mut output,
                    &enabled_features,
                    thread_pool,
                )?;
            }
        }

        return Ok(());
    } else {
        let mut file_in = File::open(filenames[0].as_os_str())
            .map_err(|e| LeptonError::new(ExitCode::FileNotFound, e.to_string()))?;

        file_in.read_to_end(&mut input_data)?;
    }

    if input_data.len() < 2 {
        return Err(LeptonError::new(ExitCode::BadLeptonFile, "ERROR input file too small").into());
    }

    let mut metrics;
    let mut output_data;

    let mut overall_cpu = Duration::ZERO;

    let mut current_iteration = 0;

    // see what file type we have
    let file_type = id_file_type(&input_data)?;

    // get a writable version of the input data so we can corrupt it if the user wants to
    let mut writable_input_data = Cow::from(&input_data);

    loop {
        let thread_cpu = CpuTimeMeasure::new();
        let walltime = Instant::now();

        corrupt_data_if_enabled(&mut corrupt, &mut writable_input_data.to_mut());

        // do the encoding/decoding, if we got an error and were corrupting the file, then restore the
        // original data and continue so we can try corrupting the file in different ways
        // per iteration
        match do_work(
            file_type,
            verify,
            &writable_input_data,
            &enabled_features,
            thread_pool,
        ) {
            Err(e) => {
                error!("error {0}", e);

                // if we corrupted the image, then restore and continue running
                if corrupt.is_some() {
                    // reset the input data not be be corrupt anymore
                    writable_input_data = Cow::from(&input_data);
                    output_data = Vec::new();
                    metrics = Metrics::default();
                } else {
                    return Err(e.into());
                }
            }

            Ok((data, m)) => {
                output_data = data;
                metrics = m;
            }
        }

        let localthread = thread_cpu.elapsed();
        let workers = metrics.get_cpu_time_worker_time();

        info!(
            "Main thread CPU: {}ms, Worker thread CPU: {} ms, walltime: {} ms",
            localthread.as_millis(),
            workers.as_millis(),
            walltime.elapsed().as_millis()
        );

        overall_cpu += localthread + workers;

        current_iteration += 1;
        if current_iteration >= iterations {
            break;
        }
    }

    if filenames.len() != 2 {
        std::io::stdout().write_all(&output_data[..])?
    } else {
        let output_filename = filenames[1].as_os_str();

        let mut fileout = OpenOptions::new()
            .write(true)
            .create(overwrite)
            .create_new(!overwrite)
            .open(output_filename)?;

        // ignore if this failed (etc on a pipe)
        let _ = fileout.set_len(output_data.len() as u64);
        fileout.write_all(&output_data[..])?;
        drop(fileout);

        // what we do is take the lepton output, and see if it recreates the input using the
        // CPP version of the encoder/decoder
        if let Some(cpp_path) = cppverify {
            execute_cpp_verify(cpp_path.as_path(), output_filename, &writable_input_data)?;
        }
    }

    if iterations > 1 {
        info!(
            "Overall average CPU consumed per iteration {0}ms ",
            overall_cpu.as_millis() / (iterations as u128)
        );
    }

    Ok(())
}

/// Identifies the input as JPEG or Lepton from its first two bytes.
///
/// # Errors
/// Returns an `ExitCode::BadLeptonFile` error when the data starts with neither the
/// JPEG marker (0xff 0xd8) nor the Lepton marker (0xcf 0x84). Inputs shorter than two
/// bytes are reported the same way instead of panicking on an out-of-bounds index.
fn id_file_type(input_data: &[u8]) -> Result<FileType, LeptonError> {
    match input_data {
        [0xff, 0xd8, ..] => Ok(FileType::Jpeg),
        [0xcf, 0x84, ..] => Ok(FileType::Lepton),
        _ => Err(LeptonError::new(
            ExitCode::BadLeptonFile,
            "ERROR input file is not a valid JPEG or Lepton file",
        )),
    }
}

/// Runs the external C++ executable on `compressed_file` and checks that the file it
/// recreates is identical to `original_contents`.
///
/// # Errors
/// * errors from launching or running the executable are passed through,
/// * `ExitCode::ExternalVerificationFailed` if the executable exits with a non-zero
///   code, or if its output differs from the original in length or content.
fn execute_cpp_verify(
    cpp_executable: &Path,
    compressed_file: &OsStr,
    original_contents: &[u8],
) -> Result<(), LeptonError> {
    // propagate launch failures (e.g. a wrong executable path) instead of panicking
    let (output, exit_code, stderr) =
        call_executable_with_input(cpp_executable, compressed_file)?;

    if exit_code != 0 {
        log::error!("cpp exit code: {}", exit_code);

        return Err(LeptonError::new(
            ExitCode::ExternalVerificationFailed,
            format!(
                "cpp verify failed with exit code {0} stderr: {1}",
                exit_code, stderr
            ),
        ));
    }

    // report a length difference separately, it is the more helpful message
    if output.len() != original_contents.len() {
        return Err(LeptonError::new(
            ExitCode::ExternalVerificationFailed,
            format!(
                "cpp verify failed with different length {0} != {1}",
                output.len(),
                original_contents.len()
            ),
        ));
    }

    if output.as_slice() != original_contents {
        return Err(LeptonError::new(
            ExitCode::ExternalVerificationFailed,
            "verify failed with different data",
        ));
    }

    log::info!("verify succeeded with cpp version");
    Ok(())
}

/// Runs one conversion of `input_data` and returns the produced bytes together with
/// the metrics reported by the library.
///
/// * `FileType::Jpeg`: the data is compressed, through `encode_lepton_verify` when
///   `verify` is set and through `encode_lepton` otherwise, and the achieved
///   compression is logged.
/// * `FileType::Lepton`: the data is decoded with `decode_lepton`.
fn do_work(
    file_type: FileType,
    verify: bool,
    input_data: &[u8],
    enabled_features: &EnabledFeatures,
    thread_pool: &dyn LeptonThreadPool,
) -> Result<(Vec<u8>, Metrics), LeptonError> {
    match file_type {
        FileType::Jpeg => {
            let (compressed, metrics) = if verify {
                encode_lepton_verify(input_data, enabled_features, thread_pool)?
            } else {
                let mut compressed = Vec::with_capacity(input_data.len());
                let metrics = encode_lepton(
                    &mut Cursor::new(input_data),
                    &mut Cursor::new(&mut compressed),
                    enabled_features,
                    thread_pool,
                )?;
                (compressed, metrics)
            };

            info!(
                "compressed input {0}, output {1} bytes (compression = {2:.1}%)",
                input_data.len(),
                compressed.len(),
                ((input_data.len() as f64) / (compressed.len() as f64) - 1.0) * 100.0
            );

            Ok((compressed, metrics))
        }
        FileType::Lepton => {
            let mut restored = Vec::with_capacity(input_data.len());
            let metrics = decode_lepton(
                &mut Cursor::new(&input_data),
                &mut restored,
                &enabled_features,
                thread_pool,
            )?;

            Ok((restored, metrics))
        }
    }
}

/// Debugging aid for locating the first byte at which written JPG output diverges from
/// known-good data: every write is compared with the reference before it is forwarded.
struct VerifyWriter<W> {
    /// destination that receives the data
    output: W,
    /// reference data the written bytes are compared against
    good_data: Vec<u8>,
    /// number of bytes that have been compared and forwarded so far
    offset: usize,
}

impl<W> VerifyWriter<W> {
    /// Creates a writer that forwards to `output` and checks all written data against
    /// everything that can be read from `reader`.
    // used for debugging
    #[allow(dead_code)]
    pub fn new<R: Read>(output: W, mut reader: R) -> Self {
        let mut good_data = Vec::new();
        reader.read_to_end(&mut good_data).unwrap();

        VerifyWriter {
            output,
            good_data,
            offset: 0,
        }
    }
}

impl<W: Write + Seek> Seek for VerifyWriter<W> {
    /// Seeks the underlying output; the comparison offset is not changed.
    fn seek(&mut self, pos: std::io::SeekFrom) -> std::io::Result<u64> {
        self.output.seek(pos)
    }
}

impl<W: Write + Seek> Write for VerifyWriter<W> {
    /// Compares `buf` with the reference data at the current offset. On the first
    /// differing byte its position is printed, the buffer is still written and flushed,
    /// and the writer panics. Matching data is forwarded completely.
    fn write(&mut self, buf: &[u8]) -> std::io::Result<usize> {
        let end = self.offset + buf.len();
        let expected = &self.good_data[self.offset..end];

        let first_mismatch = expected
            .iter()
            .zip(buf)
            .position(|(good, actual)| good != actual);

        if let Some(i) = first_mismatch {
            eprintln!("at position {0}", self.output.stream_position()? + i as u64);

            self.output.write_all(buf)?;
            self.output.flush()?;
            panic!("mismatched file!");
        }

        self.offset = end;
        self.output.write_all(buf)?;
        Ok(buf.len())
    }

    fn flush(&mut self) -> std::io::Result<()> {
        self.output.flush()
    }
}

/// Program entry point: runs `main_with_result` and, on failure, prints the error to
/// stderr and terminates the process with the error's integer exit code.
fn main() {
    if let Err(e) = main_with_result() {
        eprintln!(
            "error code: {0} {1} {2}",
            e.exit_code(),
            e.exit_code().as_integer_error_code(),
            e.message()
        );
        std::process::exit(e.exit_code().as_integer_error_code());
    }
}

/// calls the CPP version of the encoder/decoder to verify the output of the Rust version
///
/// The executable is started as `<cpp_executable> <input_filename> <temp file>`, where
/// the temp file lives in the system temporary directory. Returns the contents of that
/// temp file, the exit code of the process (`-10000` if it has none) and everything the
/// process wrote to stderr.
///
/// If the process failed and did not produce an output file, empty contents are
/// returned so that the caller can report the exit code and stderr.
///
/// # Errors
/// Fails if the process cannot be started or waited for, or if it reported success
/// but its output file cannot be read.
pub fn call_executable_with_input(
    cpp_executable: &Path,
    input_filename: &OsStr,
) -> Result<(Vec<u8>, i32, String), LeptonError> {
    // temporary file to store the output of the cpp version so we can
    // compare it with the rust version

    let temp_filename_buf = std::env::temp_dir().join("lepton_jpeg_util_cpp_recreate.jpg");
    let temp_filename = temp_filename_buf.as_os_str();

    // delete if already exists
    let _ = std::fs::remove_file(temp_filename);

    log::info!(
        "verifying input filename with CPP {:?} with {:?}",
        temp_filename,
        cpp_executable
    );

    // Spawn the command
    let child = Command::new(cpp_executable)
        .arg(input_filename)
        .arg(temp_filename)
        .stdout(Stdio::piped())
        .stderr(Stdio::piped())
        .spawn()?;

    // Wait for the child process to exit and collect output
    let output = child.wait_with_output()?;

    // Extract stderr and the exit status
    let stderr = String::from_utf8_lossy(&output.stderr).to_string();
    let exit_code = output.status.code().unwrap_or(-10000); // Handle the case where exit code is None

    let contents = match std::fs::read(temp_filename) {
        Ok(contents) => contents,
        // a failed run usually leaves no output file behind: hand back the exit code
        // and stderr rather than hiding them behind a file error
        Err(_) if exit_code != 0 => Vec::new(),
        Err(e) => return Err(e.into()),
    };

    // remove the temporary file
    let _ = std::fs::remove_file(temp_filename);

    Ok((contents, exit_code, stderr))
}


================================================
FILE: util/src/verifydir.rs
================================================
use std::ffi::OsStr;
use std::fs::{self, ReadDir};
use std::io::Cursor;
use std::path::{Path, PathBuf};
use std::process::{Command, Stdio};
use uuid::Uuid;

use lepton_jpeg::{EnabledFeatures, LeptonError};

/// Iterator over the paths of all regular files below a root directory, including the
/// files in nested directories.
pub struct RecursiveFiles {
    /// directory listings that are still being read; the innermost one is last
    stack: Vec<ReadDir>,
}

impl RecursiveFiles {
    /// Starts a traversal at `root`. Fails if `root` itself cannot be listed.
    pub fn new(root: impl AsRef<Path>) -> std::io::Result<Self> {
        let listing = fs::read_dir(root)?;
        Ok(Self {
            stack: vec![listing],
        })
    }
}

impl Iterator for RecursiveFiles {
    type Item = PathBuf;

    /// Returns the next regular file. Entries that cannot be read, sub-directories that
    /// cannot be listed and entries that are neither a file nor a directory are skipped.
    fn next(&mut self) -> Option<Self::Item> {
        loop {
            // an empty stack means every directory has been exhausted
            let current = self.stack.last_mut()?;

            let entry = match current.next() {
                None => {
                    // this directory is done, continue with its parent
                    self.stack.pop();
                    continue;
                }
                Some(Err(_)) => continue,
                Some(Ok(entry)) => entry,
            };

            let path = entry.path();
            if let Ok(kind) = entry.file_type() {
                if kind.is_dir() {
                    if let Ok(children) = fs::read_dir(&path) {
                        self.stack.push(children);
                    }
                } else if kind.is_file() {
                    return Some(path);
                }
            }
        }
    }
}

/// `LeptonThreadPool` implementation that hands all work to rayon.
struct RayonPool {}

impl lepton_jpeg::LeptonThreadPool for RayonPool {
    /// reports the number of threads rayon says it is currently using
    fn max_parallelism(&self) -> usize {
        rayon::current_num_threads()
    }

    /// submits the closure via `rayon::spawn`
    fn run(&self, f: Box<dyn FnOnce() + Send + 'static>) {
        rayon::spawn(f);
    }
}

/// Applies one pseudo-random corruption to `input_data` when a seed is supplied.
///
/// When `seed` is `None`, or `input_data` is empty, nothing is changed (and the
/// seed is not advanced). Otherwise the seed is advanced in place, so repeated
/// calls with the same starting seed reproduce the same sequence of corruptions.
///
/// One of five operations is chosen, using a pseudo-random position `r` inside the data:
///
/// * flip a single bit of the byte at `r`
/// * truncate the data to `r` bytes (possibly to zero length)
/// * insert a pseudo-random byte at `r`
/// * delete the byte at `r` (only if more than one byte is present)
/// * drop the last byte (only if more than one byte is present)
pub fn corrupt_data_if_enabled(seed: &mut Option<u64>, input_data: &mut Vec<u8>) {
    /// Advances the linear congruential generator state and returns the new value.
    ///
    /// Both the multiply and the increment wrap: a plain `+ 1` would panic in
    /// builds with overflow checks whenever the product happens to be `u64::MAX`.
    fn simple_lcg(seed: &mut u64) -> u64 {
        let r = seed.wrapping_mul(6364136223846793005).wrapping_add(1);
        *seed = r;
        r
    }

    let seed = match seed {
        Some(seed) => seed,
        None => return,
    };

    if input_data.is_empty() {
        return;
    }

    let op = simple_lcg(seed);
    let r = simple_lcg(seed) as usize % input_data.len();

    match op % 5 {
        0 => {
            // flip bit
            let bitnumber = simple_lcg(seed) as usize % 8;
            input_data[r] ^= 1 << bitnumber;
        }
        1 => {
            // truncate file
            input_data.truncate(r);
        }
        2 => {
            // insert random byte
            let random_byte = (simple_lcg(seed) & 0xFF) as u8;
            input_data.insert(r, random_byte);
        }
        3 => {
            // delete byte
            if input_data.len() > 1 {
                input_data.remove(r);
            }
        }
        4 => {
            // truncate by 1
            if input_data.len() > 1 {
                input_data.truncate(input_data.len() - 1);
            }
        }
        _ => {
            // not reachable (`op % 5` is always below 5); required for exhaustiveness
        }
    }
}

pub fn verify_dir(
    root_path: &Path,
    cpp_executable: &Path,
    corruption_seed: &mut Option<u64>,
) -> Result<(), LeptonError> {
    let iter = RecursiveFiles::new(root_path).unwrap();

    iter.for_each(|file_path| {
        if file_path.extension().and_then(|s| s.to_str()) != Some("jpg") {
            return;
        }
        call_executable_with_input(cpp_executable, file_path.as_os_str(), corruption_seed);
    });
    Ok(())
}

/// Cross-checks the CPP encoder against the Rust decoder for a single input file.
///
/// The file is read, optionally corrupted according to `corruption_seed`, and
/// written to a temporary `.jpg`. `cpp_executable` is then run with the temporary
/// input and output paths as its two arguments. If it exits with code 0, its output
/// is decoded with `lepton_jpeg::decode_lepton` and the result must match the
/// (possibly corrupted) input byte for byte. A non-zero exit code is only logged.
///
/// # Panics
///
/// Panics on any I/O or process-spawn failure, if the Rust decoder rejects the CPP
/// output, or if the decoded bytes differ from the input. In the last case the three
/// buffers involved are first written to `r_*` files in the current directory.
pub fn call_executable_with_input(
    cpp_executable: &Path,
    input_filename: &OsStr,
    corruption_seed: &mut Option<u64>,
) {
    let mut original = std::fs::read(input_filename).unwrap();
    corrupt_data_if_enabled(corruption_seed, &mut original);

    // The CPP tool is given file paths, so stage the (possibly corrupted) bytes on disk.
    let staged_jpg =
        std::env::temp_dir().join(format!("lepton_jpeg_util_cpp_{}.jpg", Uuid::new_v4()));
    std::fs::write(&staged_jpg, &original).unwrap();

    let staged_lep =
        std::env::temp_dir().join(format!("lepton_jpeg_util_cpp_{}.lep", Uuid::new_v4()));

    // Best effort: make sure no stale output is present before the tool runs.
    let _ = std::fs::remove_file(&staged_lep);

    // Run the tool to completion, capturing everything it prints.
    let mut command = Command::new(cpp_executable);
    command
        .arg(&staged_jpg)
        .arg(&staged_lep)
        .stdout(Stdio::piped())
        .stderr(Stdio::piped());
    let finished = command.spawn().unwrap().wait_with_output().unwrap();

    let stderr = String::from_utf8_lossy(&finished.stderr).into_owned();
    // Fall back to a sentinel value when the process reports no exit code.
    let exit_code = finished.status.code().unwrap_or(-10000);

    if exit_code == 0 {
        let cpp_lepton_data = fs::read(&staged_lep).unwrap();

        let mut recoded = Vec::new();

        if let Err(e) = lepton_jpeg::decode_lepton(
            &mut Cursor::new(&cpp_lepton_data),
            &mut recoded,
            &EnabledFeatures::compat_lepton_vector_read(),
            &RayonPool {},
        ) {
            panic!(
                "Error decoding CPP output for file {}: {} {} seed:{:?}",
                input_filename.to_string_lossy(),
                e,
                stderr,
                corruption_seed
            );
        }

        if recoded != original {
            println!(
                "Original size: {}, Re-coded: {}",
                original.len(),
                recoded.len()
            );

            // Keep all three buffers around so the mismatch can be investigated.
            fs::write("r_corrupted_input.jpg", &original).unwrap();
            fs::write("r_cpp_lepton.lep", &cpp_lepton_data).unwrap();
            fs::write("r_rust_recorded.jpg", &recoded).unwrap();

            panic!(
                "Verification failed for file {}: output does not match original {} {} seed:{:?}",
                input_filename.to_string_lossy(),
                exit_code,
                stderr,
                corruption_seed
            );
        }

        log::info!("Verified file: {}", input_filename.to_string_lossy());
    } else {
        // The CPP tool refusing an input is not treated as a verification failure.
        log::info!(
            "CPP executable failed for file {:?} with exit code {}: {}",
            input_filename,
            exit_code,
            stderr
        );
    }

    // Best-effort cleanup of the staged files.
    let _ = std::fs::remove_file(&staged_jpg);
    let _ = std::fs::remove_file(&staged_lep);
}